<a href="https://colab.research.google.com/github/aleksanderprofic/Machine-Learning/blob/master/NaturalLanguageProcessing/SentimentAnalysis/restaurant_reviews_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

quoting=3 - ignore quotes in text

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
stemmer = PorterStemmer()

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
all_stopwords.remove("isn't")

for i, row in dataset.iterrows():
    review = re.sub('[^a-zA-Z ]', ' ', row[0])
    review = review.lower()
    review = review.split()
    
    review = ' '.join([stemmer.stem(word) for word in review if word not in set(all_stopwords)])
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(corpus[:5])

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price']


## Creating the Bag of Words model

We include only most frequently used words. There are 1566 words, but we take only 1500

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training and predicting the results

### Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

nb = GaussianNB()
accuracies = cross_val_score(estimator=nb, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
print('Mean accuracy: {:.2f}%'.format(accuracies.mean() * 100))
print('Standard deviation: {:.2f}%'.format(accuracies.std() * 100))

Mean accuracy: 67.25%
Standard deviation: 5.30%


#### Predicting the Test set results and making the Confusion Matrix

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score

nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[55 42]
 [12 91]]
Accuracy: 73.00%
Recall: 88.35%
F1 score: 77.12%


### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [0.1, 0.25, 0.5, 0.75, 1.0], 'max_iter': [100, 200, 300], 'penalty': ['None', 'l2']}]

grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best params: {grid_search.best_params_}')

Best accuracy: 80.50%
Standard deviation: 4.30%
Best params: {'C': 1.0, 'max_iter': 100, 'penalty': 'l2'}


In [10]:
classifier = LogisticRegression(C=1.0, max_iter=100, penalty='l2')
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Predicting the Test set results and making the Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

classifier = LogisticRegression(C=1.0, max_iter=100)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[80 17]
 [28 75]]
Accuracy: 77.50%
Recall: 72.82%
F1 score: 76.92%


### K-Nearest Neighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

parameters = [{'n_neighbors': [5, 10, 15, 20, 30, 50]}]

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best params: {grid_search.best_params_}')

Best accuracy: 71.00%
Standard deviation: 4.02%
Best params: {'n_neighbors': 30}


In [13]:
knn_classifier = KNeighborsClassifier(n_neighbors=30)
knn_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=30, p=2,
                     weights='uniform')

#### Predicting the Test set results and making the Confusion Matrix

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

y_pred = knn_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[84 13]
 [50 53]]
Accuracy: 68.50%
Recall: 51.46%
F1 score: 62.72%


### Decision Trees

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

parameters = [{'criterion': ['gini', 'entropy']}]

grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best params: {grid_search.best_params_}')

Best accuracy: 76.75%
Standard deviation: 5.13%
Best params: {'criterion': 'gini'}


In [19]:
tree_classifier = DecisionTreeClassifier(criterion='gini')
tree_classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

#### Predicting the Test set results and making the Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

y_pred = tree_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[77 20]
 [32 71]]
Accuracy: 74.00%
Recall: 68.93%
F1 score: 73.20%


### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameters = [{'criterion': ['gini', 'entropy'], 'n_estimators': [10, 20, 30, 50, 70, 100, 150]}]

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1, cv=10)
grid_search.fit(X_train, y_train)

print('Best accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best params: {grid_search.best_params_}')

Best accuracy: 79.25%
Standard deviation: 3.17%
Best params: {'criterion': 'gini', 'n_estimators': 150}


In [22]:
rf_classifier = RandomForestClassifier(criterion='gini', n_estimators=150)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### Predicting the Test set results and making the Confusion Matrix

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

y_pred = rf_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[89  8]
 [36 67]]
Accuracy: 78.00%
Recall: 65.05%
F1 score: 75.28%


### Support Vector Machine

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

parameters = [{'C': [0.25, 0.5, 0.75, 1.0], 'gamma': ['scale', 0.1, 0.3, 0.5, 0.7, 0.9], 'kernel': ['rbf']},
              {'C': [0.25, 0.5, 0.75, 1.0], 'kernel': ['linear']}]

grid_search = GridSearchCV(estimator=SVC(), param_grid=parameters, scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)

print('Best accuracy: {:.2f}%'.format(grid_search.best_score_ * 100))
print('Standard deviation: {:.2f}%'.format(grid_search.cv_results_['std_test_score'][grid_search.best_index_] * 100))
print(f'Best params: {grid_search.best_params_}')

Best accuracy: 80.00%
Standard deviation: 4.81%
Best params: {'C': 0.5, 'kernel': 'linear'}


In [25]:
svc = SVC(C=0.5, kernel='linear')
svc.fit(X_train, y_train)

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#### Predicting the Test set results and making the Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix: \n{cm}')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Recall: {:.2f}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

Confusion matrix: 
[[80 17]
 [32 71]]
Accuracy: 75.50%
Recall: 68.93%
F1 score: 74.35%


Naive Bayes and Logistic Regression seem like the best models for this problem.