In [1]:
#importing necessary libraries
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import heapq
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Step 1: Sentence Segmentation
#seperating sentences using spacy (punctuation marks work like delimitters. Sentences just maintains them as seperate sentences
def process_text(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences[0]

In [3]:
# step 2: Tokenization
def tokenize_text(text):
    text = process_text(text)
    tokenized_sents = nltk.sent_tokenize(text)
    tokenized_sents = [nltk.word_tokenize(sent) for sent in tokenized_sents]
    tokenized_sents = [remove_stop_words(tokens) for tokens in tokenized_sents]

    return tokenized_sents

In [4]:
# Step 3: removing stop words according to Buckely list of stop words

#for which we ned to first take the buckley stopword list as a list
with open("./Data/Buckley-Salton-stopword-list.txt", "r") as words:
	lines = words.readlines()
stop_words =[]
for words in lines:
    as_list = words.split(",")
    stop_words.append(as_list[0].replace("\n", ""))

In [5]:
def remove_stop_words(text):
    return [word for word in text if word.lower() not in stop_words]

In [6]:
# Step 4:  Extracting sentences 
def F_first_K_Sents(text, f):
    return text[:f]

# selecting only important parts of sentences and making a score
def SelectImpSentences(scores, N, sentences):
    selected_indices = heapq.nlargest(N, scores, key=scores.get)
    selected_sentences = [sentences[i] for i in selected_indices]
    return [sentence for sublist in selected_sentences for sentence in sublist]

In [7]:
# Step 5: combining all fucntions and finding the score of best sentences
def extractiveApproach(dataset, f, N):
    output_dataset = []
    for input_text in dataset:
        important_sentences = []

        #calling the tokenization function
        tokenized_sents = tokenize_text(input_text)

        # calling the Sentence Extracting fucntion
        first_k_sents = F_first_K_Sents(tokenized_sents, f)
        
        #finding the best score words and sentences
        sentence_embeddings = [np.mean(np.array([token.vector for token in nlp(' '.join(tokens))]), axis=0) for tokens in first_k_sents]
        similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)
        
        #findng the important sentence
        for i, sent in enumerate(first_k_sents):
            scores = {j: similarity_matrix[i][j] for j in range(len(first_k_sents))}
            important_sentences.append(sent)
        
        selected_sentences = SelectImpSentences(scores, N, first_k_sents)
        summary = ' '.join(selected_sentences)
        
        output_dataset.append(summary)
    
    return output_dataset

In [11]:
with open('Data/2007/2007_acura_rdx', "r") as words:
	lines = words.readlines()
text =[]
for words in lines:
    as_list = words.split(",")
    text.append(as_list[0].replace("\n", ""))

f = 2  # Number of sentences to consider
N = 5  # Number of words per sentence

summarized_dataset = extractiveApproach(text, f, N)

In [12]:
summarized_dataset

['< DOCNO > 2007_acura_rdx < /DOCNO >',
 '< DOC >',
 '< DATE > 08/28/2009 < /DATE >',
 '< AUTHOR > RDX LOVER < /AUTHOR >',
 "< TEXT > 'm sales rep drive approximately 40",
 '< FAVORITE > Blue Tooth Technology Backup camera side mirror tilts',
 '< /DOC >',
 '< DOC >',
 '< DATE > 07/27/2009 < /DATE >',
 '< AUTHOR > Critical < /AUTHOR >',
 '< TEXT > Absolutely great car .',
 '< FAVORITE > Tech package',
 '< /DOC >',
 '< DOC >',
 '< DATE > 05/25/2009 < /DATE >',
 '< AUTHOR > rcizme < /AUTHOR >',
 '< TEXT >',
 '< FAVORITE > Premium sound system',
 '< /DOC >',
 '< DOC >',
 '< DATE > 05/21/2009 < /DATE >',
 '< AUTHOR > Chapel Hill < /AUTHOR >',
 '< TEXT >',
 '< FAVORITE > * turbo charge engine ; all-wheel drive',
 '< /DOC >',
 '< DOC >',
 '< DATE > 05/08/2009 < /DATE >',
 '< AUTHOR > ever_green < /AUTHOR >',
 '< TEXT >',
 '< FAVORITE > Turbo charged',
 '< /DOC >',
 '< DOC >',
 '< DATE > 05/07/2009 < /DATE >',
 '< AUTHOR >',
 '< TEXT >',
 '< FAVORITE > comfortable seats',
 '< /DOC >',
 '< DOC 

In [20]:
from collections import defaultdict
from pathlib import Path
import pandas as pd

my_dir_path = "./Data/2007"

results = defaultdict(list)
for file in Path(my_dir_path).iterdir():
    with open(file, "r") as file_open:
        results["file_name"].append(file.name)
        results["text"].append(file_open.read())
df = pd.DataFrame(results)

In [21]:
df.head()

Unnamed: 0,file_name,text
0,2007_acura_mdx,\nFlewByU\nI just moved to Germany two months ...
1,2007_acura_rdx,<DOCNO>2007_acura_rdx</DOCNO>\n<DOC>\n<DATE>08...
2,2007_acura_rl,<DOCNO>2007_acura_rl</DOCNO>\n<DOC>\n<DATE>09/...
3,2007_acura_tl,<DOCNO>2007_acura_tl</DOCNO>\n<DOC>\n<DATE>09/...
4,2007_acura_tsx,<DOCNO>2007_acura_tsx</DOCNO>\n<DOC>\n<DATE>09...


In [22]:
df.to_csv('2007-car-reviews.csv', index = False)