In [97]:
import pandas as pd
import spacy
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from spacy.lang.en.stop_words import STOP_WORDS

# File I/O and Grabbing Data

In [83]:
filepaths = {'amazon': 'amazon_cells_labelled.txt',
               'yelp' : 'yelp_labelled.txt',
               'imdb': 'imdb_labelled.txt'}

dfs = []
for source, path in filepaths.items():
    df = pd.read_csv(path, names = ['Sentence', 'Score'], sep = '\t')
    dfs.append(df)
df = pd.concat(dfs)

# Tokenize/Lemmatize

In [84]:
punct = string.punctuation
nlp = spacy.load("en_core_web_sm")
stopwords = list(STOP_WORDS)

In [93]:
def clean_data(sentence): 
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        #if root word is not a pronoun, lowercase the word then strip any spaces
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

# Vectorization with TF-IDF (Bag of Words)

In [110]:
tfidf = TfidfVectorizer(tokenizer = clean_data)
classifier = LinearSVC() #idk what this means btw

In [112]:
#split data
X = df['Sentence']
y = df['Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [115]:
#create pipeline that will first vectorize the data then classify it
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [116]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function clean_data at 0x7fb5686a8700>)),
                ('clf', LinearSVC())])

In [117]:
y_pred = clf.predict(X_test)

In [118]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78       486
           1       0.72      0.83      0.77       421

    accuracy                           0.77       907
   macro avg       0.78      0.78      0.77       907
weighted avg       0.78      0.77      0.77       907



In [119]:
confusion_matrix(y_test,y_pred)

array([[354, 132],
       [ 73, 348]])

Accuracy is 78%. :^ ) this sucks.