In [6]:
import numpy as np
import pandas as pd
import glob
import string
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from textwrap import wrap

In [3]:
#reviews_df = pd.read_csv('Reviews.csv')
def retrieve_reviews_df():
    '''
    Merge datasets in the data folder.

    Reads every file matching ``../data/*.csv`` (relative to the current
    working directory) and stacks the frames row-wise with a fresh
    0..n-1 index. The name of each file is printed as it is read.

    Files are read in sorted path order: glob does not guarantee any
    particular ordering, and the row order of the merged frame determines
    what any seeded shuffle/split applied to it will produce.

    Returns a dataframe of all the data.
    Raises ValueError if no CSV file matches the pattern.
    '''
    all_files = sorted(glob.glob("../data/*.csv"))

    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" (same exception type as before).
    if not all_files:
        raise ValueError("No CSV files found matching ../data/*.csv")

    df_list = []

    for filename in all_files:
        print(f"Concatenating {filename}")
        df_list.append(pd.read_csv(filename, index_col=None, header=0))

    return pd.concat(df_list, axis=0, ignore_index=True)

# Load all review CSVs from ../data into one frame (reads from disk and
# prints each file name as it goes).
reviews_df = retrieve_reviews_df()

Concatenating ../data\Reviews-1.csv
Concatenating ../data\Reviews-2.csv
Concatenating ../data\Reviews-3.csv
Concatenating ../data\Reviews-4.csv


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# preprocessing for the dataframe 
def score(x):
    """Convert a star rating into a sentiment label.

    Ratings below 3 give -1, ratings equal to 3 or 4 give 0, and any
    other value gives 1.
    """
    if x < 3:
        return -1
    return 0 if x in (3, 4) else 1

def preprocessing(df):
    """Trim the raw reviews frame and attach a sentiment label.

    Steps:
      1. Drop the product/user/helpfulness/time columns listed below.
      2. Drop every row that has a missing value in any remaining column.
      3. Add a ``Sentiment`` column obtained by passing ``Score`` through
         ``score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews. Must contain the columns dropped below and ``Score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``Sentiment`` column appended; ``df`` itself
        is left unmodified.
    """
    # drop columns
    dropped_columns = [
        "ProductId",
        "UserId",
        "ProfileName",
        "HelpfulnessNumerator",
        "HelpfulnessDenominator",
        "Time",
    ]

    # drop na values
    new_df = df.drop(columns=dropped_columns).dropna(axis=0)

    # Sentiment labels come from score(): -1 for ratings below 3, 0 for
    # ratings 3 and 4, 1 otherwise.
    # NOTE(review): the comment previously here described the grouping as
    # 1,2 / 3 / 4,5 (i.e. 4 counted with 5); score() groups 4 with 3.
    # Confirm which mapping is intended.
    #
    # Map the Score column directly rather than applying row-wise over the
    # whole frame: identical labels without building a Series per row.
    return new_df.assign(Sentiment=new_df["Score"].map(score))

# Modelling frame: unused columns removed, rows with missing values dropped,
# and a Sentiment column added.
df = preprocessing(reviews_df)

In [7]:
# Features are the raw review text; the target is the derived sentiment label.
X, y = df['Text'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Token counts for the training reviews (vocabulary learned from X_train only).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# use_idf=False: the transformer is configured without IDF weighting.
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# (n_training_documents, vocabulary_size)
X_train_tfidf.shape

(454741, 109178)

In [None]:
#split data into train and test

In [9]:
# Text -> token counts -> TF-IDF -> multinomial logistic regression.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument; confirm against the installed version before upgrading.
clf_logReg_pipe = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf_logReg", LogisticRegression(multi_class='multinomial', solver='newton-cg')),
])

clf_logReg_pipe.fit(X_train, y_train)

# Accuracy on the held-out split. The predictions are for X_test, so they
# must be compared with y_test; y_train has a different length and belongs
# to different rows.
predictedLogReg = clf_logReg_pipe.predict(X_test)
np.mean(predictedLogReg == y_test)