# Import Libraries

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
random.seed(42)
import itertools
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# Import Dataset

In [3]:
reviews_df = pd.read_csv('/Users/hilmi/Desktop/NLP Sentiment Analysis Project/Amazon_Alexa.tsv',sep='\t')

In [4]:
reviews_df

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


# Drop irrelevant features, 'date', 'rating'

In [5]:
# We will not use 'date','rating' and 'lenght' of the reviews in the analysis and will drop these columns from reviews_df
reviews_df = reviews_df.drop(['date','rating'], axis = 1)

# Convert Categorical Variables into numeric values: pd.get_dummies

In [6]:
# 'variation' variable is a categorical variable and should be converted into a numerical value.
variation_dummies =pd.get_dummies(reviews_df['variation'], drop_first = True)
#Avoid Dummy Variable trap which occurs when one variable can be predicted from the other.

In [7]:
variation_dummies

Unnamed: 0,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3146,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3147,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3148,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0



# Drop Categorical Variable Column

In [8]:
#Let's drop the 'variation' column from reviews_df dataset.
reviews_df.drop(['variation'], axis = 1, inplace = True)

In [9]:
reviews_df

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1
...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1
3146,"Listening to music, searching locations, check...",1
3147,"I do love these things, i have them running my...",1
3148,Only complaint I have is that the sound qualit...,1


# Concatenate reviews_df with dummy variables

In [10]:
#Now Let's add the dummies into te reviews_df
reviews_df = pd.concat([reviews_df, variation_dummies], axis = 1)

In [11]:
reviews_df

Unnamed: 0,verified_reviews,feedback,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot
0,Love my Echo!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,Loved it!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,"Sometimes while playing a game, you can answer...",1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,I have had a lot of fun with this thing. My 4 ...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,Music,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3146,"Listening to music, searching locations, check...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3147,"I do love these things, i have them running my...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3148,Only complaint I have is that the sound qualit...,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


# NLP Preprocessing - Data Cleaning

# Clean punctuations

In [12]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Clean Stopwords

In [13]:
import nltk #Natural Language Tool Kit

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/hilmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# We have to download stopwords Package to execute this command
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
# Let's define a pipeline to clean up all the reviews 

# The pipeline performs the following: (1) remove punctuation, (2) remove stopwords

def review_cleaning (review):
    review_punc_removed = [char for char in review if char not in string.punctuation]
    review_punc_removed_join = ''.join(review_punc_removed)
    review_punc_removed_clean = [word for word in review_punc_removed_join.split() if word.lower() not in stopwords.words('english')]
    return review_punc_removed_clean

# Implementing cleaning function into verified_reviews column

In [17]:
reviews_df['cleaned_reviews'] = reviews_df['verified_reviews'].apply(review_cleaning)

In [18]:
reviews_df

Unnamed: 0,verified_reviews,feedback,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot,cleaned_reviews
0,Love my Echo!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"[Love, Echo]"
1,Loved it!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,[Loved]
2,"Sometimes while playing a game, you can answer...",1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"[Sometimes, playing, game, answer, question, c..."
3,I have had a lot of fun with this thing. My 4 ...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"[lot, fun, thing, 4, yr, old, learns, dinosaur..."
4,Music,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,[Music]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[Perfect, kids, adults, everyone]"
3146,"Listening to music, searching locations, check...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[Listening, music, searching, locations, check..."
3147,"I do love these things, i have them running my...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[love, things, running, entire, home, TV, ligh..."
3148,Only complaint I have is that the sound qualit...,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[complaint, sound, quality, isnt, great, mostl..."


# Test the cleaning function

In [19]:
print(reviews_df['cleaned_reviews'][3]) # show the cleaned up version

['lot', 'fun', 'thing', '4', 'yr', 'old', 'learns', 'dinosaurs', 'control', 'lights', 'play', 'games', 'like', 'categories', 'nice', 'sound', 'playing', 'music', 'well']


In [20]:
#let's check out the original review
print(reviews_df['verified_reviews'][3])

I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.


# Implementing CounVectorizer 

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Define the cleaning pipeline we defined earlier
vectorizer = CountVectorizer(analyzer = review_cleaning)
reviews_countvectorizer = vectorizer.fit_transform(reviews_df['verified_reviews'])

In [22]:
print('Vocabulary length :', len(vectorizer.get_feature_names()))

Vocabulary length : 5211


In [23]:
#Let's convert numpy object into a numpy array
reviews_countvectorizer = reviews_countvectorizer.toarray() 

In [24]:
#Let's convert numpy object into pd.dataframe
reviews = pd.DataFrame(reviews_countvectorizer)

In [25]:
#Let's drop reviews_df['verified_reviews'] colum and add vectorized reviews
reviews_df = reviews_df.drop(['verified_reviews','cleaned_reviews'], axis = 1)
reviews_df = pd.concat([reviews_df, reviews], axis=1)

In [26]:
reviews_df.shape

(3150, 5227)

# Split X and Y variables

In [28]:
#Let's X (matrix of features) and y (target variable) 
X = reviews_df.drop(['feedback'], axis = 1)

In [30]:
X.shape

(3150, 5226)

In [29]:
y = reviews_df['feedback']

In [31]:
y.shape

(3150,)

# Split training and testing datasets

In [32]:
#Let's split trai and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [33]:
print("X_train Shape\t\t:{}\ny_train Shape\t\t:{}\nX_test Shape\t\t:{}\ny_test Shape\t\t:{}"
      .format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

X_train Shape		:(2520, 5226)
y_train Shape		:(2520,)
X_test Shape		:(630, 5226)
y_test Shape		:(630,)


# Training Random Forest Classifier

In [35]:
Classifier = RandomForestClassifier(random_state = 42)

In [37]:
Classifier.fit(X_train, y_train)    

RandomForestClassifier(random_state=42)

In [38]:
y_pred = Classifier.predict(X_test)

In [39]:
# Assign f1 score to a variable
score = f1_score(y_test, y_pred, average = 'weighted')
score

0.9228956228956228

In [40]:
#Printing Classification Report
print('Classification Report\t\t:\n', classification_report(y_test, y_pred))

Classification Report		:
               precision    recall  f1-score   support

           0       0.62      0.31      0.42        48
           1       0.95      0.98      0.96       582

    accuracy                           0.93       630
   macro avg       0.79      0.65      0.69       630
weighted avg       0.92      0.93      0.92       630



# Random Forest Classifier Hyperparameter Tuning

In [42]:
from sklearn.ensemble import RandomForestClassifier
Classifier = RandomForestClassifier(random_state = 42)
from pprint import pprint

# Look at parameters used by our current forest

In [43]:
print('Parameters currently in use:\n')
pprint(Classifier.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


# Create a parameter grid

In [44]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


# Random Search Training

In [46]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
Classifier = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
Classifier_random = RandomizedSearchCV(estimator = Classifier, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
Classifier_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

# View the best parameters from fitting the random search

In [47]:
Classifier_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': True}

# Evaluate Random Search

In [56]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    score = f1_score(test_labels, predictions, average = 'weighted')
    print('Model Performance')
    print('f1_Score = {:0.2f}%.'.format(score))
    
    return score

In [60]:
base_model = RandomForestClassifier(random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
f1_Score = 0.92%.


In [61]:
best_random = Classifier_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
f1_Score = 0.93%.


# Check Improvement

In [62]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 0.99%.
