In [8]:
import import_ipynb
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Put the parent of the current working directory on the import path so the
# `utilities` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from utilities.UtilityFunctions import retrieve_reviews_df, preprocessing, vectorize, text_preprocess

importing Jupyter notebook from c:\Users\Nick\Documents\GitHub\Group027-Sp22\utilities\UtilityFunctions.ipynb


In [9]:
# Load the raw reviews first, then hand them to the project's preprocessing
# helper; the result is the frame every later cell works from.
reviews_raw = retrieve_reviews_df()
df = preprocessing(reviews_raw)

Concatenating ../data\Reviews-1.csv
Concatenating ../data\Reviews-2.csv
Concatenating ../data\Reviews-3.csv
Concatenating ../data\Reviews-4.csv


In [12]:
# Features are the review text, targets the precomputed sentiment column.
X, y = df['Text'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# vectorize() is a project helper; it hands back two results which this
# notebook names as the count matrix and the TF-IDF matrix.
vectorized_train = vectorize(X_train)
X_train_counts, X_train_tfidf = vectorized_train

# (n_documents, n_features) of the TF-IDF matrix
X_train_tfidf.shape

(454741, 109178)

In [15]:
# Token counts -> TF-IDF weighting -> multinomial logistic regression,
# chained so the raw text can be fed straight into fit/predict.
logreg_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf_logReg", LogisticRegression(multi_class='multinomial', solver='newton-cg')),
]
clf_logReg_pipe = Pipeline(logreg_steps)

clf_logReg_pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf_logReg',
                 LogisticRegression(multi_class='multinomial',
                                    solver='newton-cg'))])

In [16]:
# Accuracy on the held-out set: fraction of predictions equal to the true label.
predictedLogReg = clf_logReg_pipe.predict(X_test)
is_correct = predictedLogReg == y_test
np.mean(is_correct)

0.8041359534155481

In [11]:
# Render the preprocessed reviews frame to inspect its columns and sentiment labels.
df

Unnamed: 0,Id,Score,Summary,Text,Sentiment
0,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,2,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,-1
2,3,4,"""Delight"" says it all",This is a confection that has been around a fe...,0
3,4,2,Cough Medicine,If you are looking for the secret ingredient i...,-1
4,5,5,Great taffy,Great taffy at a great price. There was a wid...,1
...,...,...,...,...,...
568449,568450,5,Will not do without,Great for sesame chicken..this is a good if no...,1
568450,568451,2,disappointed,I'm disappointed with the flavor. The chocolat...,-1
568451,568452,5,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...",1
568452,568453,5,Favorite Training and reward treat,These are the BEST treats for training and rew...,1


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit

data = df

print("Before {}".format(len(data)))
# Remove rows whose Score is missing. The explicit .copy() makes dataAfter an
# independent frame, so the dtype assignment below cannot raise
# SettingWithCopyWarning or write through to `df`.
dataAfter = data.dropna(subset=["Score"]).copy()
print("After {}".format(len(dataAfter)))
dataAfter["Score"] = dataAfter["Score"].astype(int)

# One stratified 80/20 split, keeping the Score distribution the same in both
# partitions. Only a single split is ever used, so n_splits=1 avoids computing
# four splits that were thrown away; random_state makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(dataAfter, dataAfter["Score"]))

# split() yields POSITIONAL row indices, so they must be applied with .iloc.
# reindex() would interpret them as index labels: because the frame's labels
# are not a contiguous 0..n-1 range, that selects the wrong rows and inserts
# all-NaN rows for labels that do not exist.
strat_train = dataAfter.iloc[train_index].copy()
strat_test = dataAfter.iloc[test_index].copy()

Before 568427
After 568427


In [14]:
def sentiments(rating):
    """Translate a star rating into a coarse sentiment label.

    Args:
        rating: The review score; values 1 through 5 are recognised.

    Returns:
        "Positive" for 4 or 5, "Neutral" for 3, "Negative" for 1 or 2,
        and None for any other value (including NaN).
    """
    if rating in (4, 5):
        return "Positive"
    if rating == 3:
        return "Neutral"
    if rating in (1, 2):
        return "Negative"
    return None
# Derive the text sentiment label from Score for both partitions
# (this overwrites any existing "Sentiment" column on them).
for partition in (strat_train, strat_test):
    partition["Sentiment"] = partition["Score"].apply(sentiments)

# Peek at the first 20 training labels
strat_train["Sentiment"].iloc[:20]

219709    Positive
97496     Positive
13300     Positive
448737    Positive
346732    Positive
437415    Positive
129998    Positive
206343    Positive
172525    Positive
57870     Positive
409930     Neutral
178533    Positive
185295    Negative
28743     Negative
504917    Positive
316060    Positive
6085      Positive
274524     Neutral
348725    Positive
135587    Negative
Name: Sentiment, dtype: object

In [20]:
# Prepare data
# Keep only rows that actually carry a sentiment label. Filling a missing
# label with ' ' (as was done before) invents an extra blank class that the
# classifier is then trained and scored on, which distorts the accuracy.
train_labeled = strat_train.dropna(subset=["Sentiment"])
test_labeled = strat_test.dropna(subset=["Sentiment"])

# A missing review text is replaced by a single space so the vectorizer
# receives a string for every row.
X_train = train_labeled["Text"].fillna(' ')
X_train_targetSentiment = train_labeled["Sentiment"]
X_test = test_labeled["Text"].fillna(' ')
X_test_targetSentiment = test_labeled["Sentiment"]
print(len(X_train), len(X_test))

# Text preprocessing and occurrence counting
# NOTE(review): count_vect is not used in this cell (vectorize() is called
# instead); it is kept only in case a later cell still refers to it.
count_vect = CountVectorizer()

# vectorize() is the shared project helper; its two results are named here as
# the count matrix and the TF-IDF matrix.
X_train_counts, X_train_tfidf = vectorize(X_train)
X_train_counts.shape

454741 113686


(454741, 109473)

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with IDF switched off (use_idf=False).
# This replaces the X_train_tfidf produced by the previous cell.
tfidf_transformer = TfidfTransformer(use_idf=False)
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

X_train_tfidf.shape

(454741, 109473)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Same three-stage model as before (counts -> TF-IDF -> multinomial logistic
# regression), this time trained on the text sentiment labels of the
# stratified split.
sentiment_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf_logReg", LogisticRegression(multi_class='multinomial', solver='newton-cg')),
]
clf_logReg_pipe = Pipeline(sentiment_steps)
clf_logReg_pipe.fit(X_train, X_train_targetSentiment)

# Accuracy on the stratified test partition
predictedLogReg = clf_logReg_pipe.predict(X_test)
is_correct = predictedLogReg == X_test_targetSentiment
np.mean(is_correct)

0.8801875340851116