***** Importing libraries *****

In [15]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pickle


from nltk.corpus import stopwords
import nltk
nltk.download('punkt')

import re
import spacy

import gensim
from gensim.utils import simple_preprocess


%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abinl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# import the csv Appliances.csv file as a dataframe
df=pd.read_csv('./df.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'asin', 'title', 'tech1', 'reviewText', 'overall',
       'reviewerID', 'reviewTime', 'year', 'month'],
      dtype='object')

In [4]:
# describing our own stopwords to avoid some of the negative words like 'not' from being stopped
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
              "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
              'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
              'themselves',  'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
              'are', 'was', 'were', 'be', 'been', 'being',  'a', 'an', 'the', 'and',  'or', 'as','for','of','all','would',
               'at', 'by', 'through', 'to', 'from', 'in', 'out','on', 'off',  'then', 'here','there', 'when', 'one',
              'where', 'why', 'how','so',   's', 't',   'd', 'll', 'm', 'o','re', 've','y', 'ain','if',
               'ma','what', 'aa','will','name','have','with','go']

In [5]:
# function to do initial cleaning by setting regex to alphanumeric, changing to lower case and to remove stop words
def clean_round1(text):
# [A-Za-z]+ this regex finds the words 
# The result of text is a list    
    text = re.findall('[A-Za-z]+',text)
# convert all the text collected from regex into lower case in a list using list comprehension
    text = [x.lower() for x in text]
# write all words other than stop words into a list
    text = [w for w in text if not w in stop_words]
# converting list into string    
    text = ' '.join(text)
    return text

In [6]:
# Applying the cleaning function to the reviewText
df['cleanText']=df['reviewText'].apply(lambda x: clean_round1(x))

In [7]:
# lemmatized text using spacy. stemming is not done as some cell will only have null values after stemming
nlp =spacy.load('en_core_web_sm')
df['lemmatext']=df['cleanText'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x)]))

In [8]:
# stemming is done to clean the data and reduce features, which will reduce the complexity of the model
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
df['stemtext'] = [ps.stem(x) for x in df['lemmatext']]


In [9]:
# encoding using count vectorizer. Bigrams and Trigrams are created to feed meaningful features to the model
cvec = CountVectorizer(stop_words=stop_words,max_features=6000,ngram_range=(1, 3))

# Fit our vectorizer on the lemmatized data
cvec.fit(df['stemtext'])

# and check out the length of the vectorized data after
len(cvec.get_feature_names())

6000

In [18]:
pickle.dump(cvec, open('transform.pkl', 'wb'))

In [10]:
# index is changed after vectorization. so index before vectorisation is saved as a list 
# and to used in the dataframe after vectorization.
index = []
for row in df['stemtext'].index: 
    index.append(row)

In [11]:
# Dataframe created with vectorized stemtext column using count vectorizer.
X_cvec = pd.DataFrame(cvec.transform(df['stemtext']).todense(),
                       columns=cvec.get_feature_names(),index=index)
X_cvec.head()

Unnamed: 0,ability,able,able buy,able find,able get,about,about buy,about cheap,about day,about every,...,year use,year work,yep,yes,yesterday,yet,yet but,youtube,zero,zero water
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# MODELLING

## TEXT CLASSIFICATION

In [12]:
lr = LogisticRegression(solver ='liblinear')
model = lr.fit(X_cvec, df['overall'])




In [13]:
filename = 'nlp_model.pkl'

In [16]:
pickle.dump(lr,open(filename,'wb'))