<a href="https://colab.research.google.com/github/BrianKipngeno/Store-reviews-classification-/blob/main/New_store_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I'll use the  dataset below to  create  classification models that tags new store reviews as either label 1 and label 2.

Dataset URL = https://bit.ly/2PdbtfD

### Prerequisites


In [None]:
# Importing the standard libraries
# ---
#
import pandas as pd # library for data manipulation
import numpy as np  # library for scientific compuations
import re           # regex library to perform text preprocessing
import string       # library to work with strings
import nltk         # library for natural language processing
import scipy        # library for scientific computing

# Library for Stop words
!pip3 install wordninja
!pip3 install textblob
import wordninja
from textblob import TextBlob

nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Library for Lemmatization
nltk.download('wordnet')
from textblob import Word

# Library for Noun count
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Library for TD-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/541.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/541.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=8523605dd66e2b64c5a8337419b576c06a3f66628d86bbbafa4726c6b07cc5c3
  Stored in directory: /root/.cache/pip/wheels/aa/44/3a/f2a5c1859b8b541ded969b4cd12d0a58897f12408f4f51e084
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Custom Functions

# Avg. words
def avg_word(sentence):
  words = sentence.split()
  try:
    z = (sum(len(word) for word in words)/len(words))
  except ZeroDivisionError:
    z = 0
  return z

# Noun count
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

def pos_check(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_dic[flag]:
                cnt += 1
    except:
        pass
    return cnt

# Subjectivity
def get_subjectivity(tweet):
    try:
        textblob = TextBlob(unicode(tweet, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

# Polarity
def get_polarity(tweet):
    try:
        textblob = TextBlob(unicode(tweet, 'utf-8'))
        pol = textblob.sentiment.polarity
    except:
        pol = 0.0
    return pol

### Step 1: Data Exploration

In [None]:
# Loading our dataset

df = pd.read_csv('https://bit.ly/2PdbtfD')
df.columns = ['text', 'target']
df.sample(10)

Unnamed: 0,text,target
90,No instructions included - do not trust selle...,__label__1
76,Bad Deal!!: I ordered this DVD and received a...,__label__1
186,mask maker: It was good horrifying never a du...,__label__2
97,Ludicrous and silly: I remember getting this ...,__label__1
33,Is this great TV??? You bet it is: Hotel Baby...,__label__2
178,Pretty Nifty!: I purchased these for a Hallow...,__label__2
100,textbook: Book shipped quickly and was in exc...,__label__2
68,Great combination of creativity and adventure...,__label__2
54,Not the best: I bought both this and lonely p...,__label__1
102,"YES!!!: When I got this book, I wasn't expect...",__label__2


In [None]:
# Determining the shape of the datset
# ---
#
df.shape

(199, 2)

In [None]:
# We will work with 100 sample records because we would
# be required to use high computational resources for a larger dataset
# ---
#
df = df.sample(100)

In [None]:
# Let's determine whether our columns have the right data types
# ---
#
df.dtypes

Unnamed: 0,0
text,object
target,object


In [None]:
# What values are in our target variable?
# ---
#
df.target.unique()

array(['__label__2 ', '__label__1 '], dtype=object)

From the unique values, we need to trim the spaces in the values within our target variable

### Step 2: Data preparation

#### Basic data cleaning

In [None]:
# Trimming whitespaces in our target variable
# ---
#
df['target'] = df.target.str.strip()

# Checking our unique values
# ---
#
df.target.unique()

array(['__label__2', '__label__1'], dtype=object)

In [None]:
# Let's check for missing values
# ---
#
df.isnull().sum()

Unnamed: 0,0
text,0
target,0


#### Text processing

In [None]:
# We will create a custom function that will contain all the text cleaning
# techniques. We will then reuse the same function for cleaning new data.
# ---
#
def text_cleaning(text):
  # Removing url/links
  df['text'] = df.text.apply(lambda x: re.sub(r'http\S+|www\S+|https\S+','', str(x)))

  # Removing @ and # characters and replacing them with space
  df['text'] = df.text.str.replace('#',' ')
  df['text'] = df.text.str.replace('@',' ')

  # Conversion to lowercase
  df['text'] = df.text.apply(lambda x: " ".join(x.lower() for x in x.split()))

  # Removing punctuation characters
  df['text'] = df.text.str.replace('[^\w\s]','')

  # Removing stop words
  df['text'] = df.text.apply(lambda x: " ".join(x for x in x.split() if x not in stop))

  # Lemmatization
  df['text'] = df.text.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [None]:
# Applying the text_cleaning function to our dataframe.
# ---
#
df.text.apply(text_cleaning)
df.sample(5)

Unnamed: 0,text,target
116,amazing: ordered cd take abit get live england...,__label__2
88,"buyer beware!!: ordered cake topper june 27, 2...",__label__1
44,"autumn: got daughter nc, making prefect bread....",__label__2
177,make fascinating reading.: whale naturalist an...,__label__2
1,best soundtrack ever anything.: i'm reading lo...,__label__2


#### Feature engineering

In [None]:
# We will create a custom function that will contain all the
# feature engineering techniques. We can then use the function for cleaning new data.
# ---
#
def feature_engineering(text):
  # Length of text
  df['length_of_text'] = df.text.str.len()

  # Word count
  df['word_count'] = df.text.apply(lambda x: len(str(x).split(" ")))

  # Word density (Average no. of words / text)
  df['avg_word_length'] = df.text.apply(lambda x: avg_word(x))

  # Noun Count
  df['noun_count'] = df.text.apply(lambda x: pos_check(x, 'noun'))

  # Verb Count
  df['verb_count'] = df.text.apply(lambda x: pos_check(x, 'verb'))

  # Adjective Count / Text
  df['adj_count'] = df.text.apply(lambda x: pos_check(x, 'adj'))

  # Adverb Count / Text
  df['adv_count'] = df.text.apply(lambda x: pos_check(x, 'adv'))

  # Pronoun
  df['pron_count'] = df.text.apply(lambda x: pos_check(x, 'pron'))

  # Subjectivity
  df['subjectivity'] = df.text.apply(get_subjectivity)

  # Polarity
  df['polarity'] = df.text.apply(get_polarity)

In [None]:
# Applying the custom feature engineering function to our dataframe.
# ---
# This process may take 2-5 min.
# ---
#
df.text.apply(feature_engineering)
df.sample(5)

Unnamed: 0,text,target,length_of_text,word_count,avg_word_length,noun_count,verb_count,adj_count,adv_count,pron_count,subjectivity,polarity
67,even mommy fun one!: four year old daughter lo...,__label__2,214,36,4.972222,17,6,4,6,0,0.0,0.0
106,authentic: first encounter yoruba say cd reall...,__label__2,429,65,5.615385,27,12,11,6,2,0.0,0.0
21,delicious cookie mix: thought funny bought pro...,__label__2,539,81,5.666667,35,18,13,6,3,0.0,0.0
29,based review bought one i'm glad did!: vcr/dvd...,__label__2,554,82,5.768293,30,21,20,9,1,0.0,0.0
76,bad deal!!: ordered dvd received substitute ne...,__label__1,420,60,6.016667,24,14,9,6,1,0.0,0.0


In [None]:
# Performing further feature engineering techniques
# ---
#

# Feature Construction: Word Level N-Gram TF-IDF Feature
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', ngram_range=(1,3),  stop_words= 'english')
df_word_vect = tfidf.fit_transform(df.text)

# Feature Construction: Character Level N-Gram TF-IDF
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='char', ngram_range=(1,3),  stop_words= 'english')
df_char_vect = tfidf.fit_transform(df.text)



In [None]:
# Label Preparation i.e. replacing categorial values with numerical ones
# ---
#
y = np.array(df['target'].replace(['__label__1', '__label__2'], ['0','1']))
y

array(['1', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0',
       '0', '0', '1', '1', '1', '0', '0', '1', '0', '1', '0', '1', '0',
       '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1',
       '1', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0',
       '1', '1', '0', '1', '0', '0', '1', '0', '1', '0', '1', '1', '1',
       '0', '1', '1', '1', '1', '0', '1', '1', '0', '1', '0', '0', '0',
       '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '1', '0',
       '0', '1', '1', '1', '1', '1', '0', '1', '0'], dtype=object)

In [None]:
# Let's prepare the constructed features for modeling
# ---
# We will select all variables but the target (which is the label) and text variables
# ---
#
X_metadata = np.array(df[df.columns.difference(['target', 'text'])])
X_metadata

array([[  5.        ,   5.        ,   5.82142857, 190.        ,
         16.        ,   0.        ,   0.        ,   0.        ,
          1.        ,  28.        ],
       [  6.        ,   1.        ,   5.11111111, 164.        ,
         13.        ,   0.        ,   0.        ,   0.        ,
          4.        ,  27.        ],
       [ 13.        ,   5.        ,   5.81034483, 394.        ,
         30.        ,   0.        ,   0.        ,   0.        ,
          8.        ,  58.        ],
       [ 12.        ,   8.        ,   5.69565217, 307.        ,
         16.        ,   0.        ,   0.        ,   0.        ,
          7.        ,  46.        ],
       [ 12.        ,   9.        ,   6.04054054, 520.        ,
         26.        ,   0.        ,   0.        ,   0.        ,
         24.        ,  74.        ],
       [  4.        ,   3.        ,   5.74418605, 289.        ,
         20.        ,   0.        ,   0.        ,   0.        ,
          9.        ,  43.        ],
       [  

In [None]:
# We combine our two tfidf (sparse) matrices and X_metadata
# ---
#
X = scipy.sparse.hstack([df_word_vect, df_char_vect, X_metadata])
X

<100x2010 sparse matrix of type '<class 'numpy.float64'>'
	with 32955 stored elements in COOrdinate format>

### Step 3: Data modelling

In this step we use machine learning algorithms to train and test our sentiment analysis models.

In [None]:
# Splitting our data
# ---
#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fitting our model
# ---
#

# Importing the algorithms
# ---
#
from sklearn.linear_model import LogisticRegression      # Logistic Regression Classifier
from sklearn.tree import DecisionTreeClassifier          # Decision Tree Classifier
from sklearn.svm import SVC                              # SVM Classifier
from sklearn.naive_bayes import MultinomialNB            # Naive Bayes Classifier
from sklearn.neighbors import KNeighborsClassifier       # KNN Classifier

# Ensemble classifiers
from sklearn.ensemble import BaggingClassifier           # Bagging Meta-Estimator Classifier
from sklearn.ensemble import RandomForestClassifier      # RandomForest Classifier
from sklearn.ensemble import AdaBoostClassifier          # AdaBoost Classifier
from sklearn.ensemble import GradientBoostingClassifier  # AdaBoost GradientBoostingClassifier


# Instantiating our models
# ---
#
logistic_classifier = LogisticRegression(solver='saga', max_iter=800, multi_class='multinomial') # solver works well with a large dataset like ours
decision_classifier = DecisionTreeClassifier(random_state=42)
svm_classifier = SVC()
knn_classifier = KNeighborsClassifier()
naive_classifier = MultinomialNB()

bagging_meta_classifier = BaggingClassifier()
random_forest_classifier = RandomForestClassifier()
ada_boost_classifier = AdaBoostClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
# Training our models
# ---
#
logistic_classifier.fit(X_train, y_train)
decision_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)
knn_classifier.fit(X_train, y_train)
naive_classifier.fit(X_train, y_train)

bagging_meta_classifier.fit(X_train, y_train)
random_forest_classifier.fit(X_train, y_train)
ada_boost_classifier.fit(X_train, y_train)
gbm_classifier.fit(X_train, y_train)



In [None]:
# Making predictions
# ---
#
logistic_y_prediction = logistic_classifier.predict(X_test)
decision_y_prediction = decision_classifier.predict(X_test)
svm_y_prediction = svm_classifier.predict(X_test)
knn_y_prediction = knn_classifier.predict(X_test)
naive_y_prediction = naive_classifier.predict(X_test)

bagging_y_classifier = bagging_meta_classifier.predict(X_test)
random_forest_y_classifier = random_forest_classifier.predict(X_test)
ada_boost_y_classifier = ada_boost_classifier.predict(X_test)
gbm_y_classifier = gbm_classifier.predict(X_test)

In [None]:
# Evaluating the Models
# ---
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Accuracy scores
#
print("Logistic Regression Classifier", accuracy_score(logistic_y_prediction, y_test))
print("Decision Trees Classifier", accuracy_score(decision_y_prediction, y_test))
print("SVN Classifier", accuracy_score(svm_y_prediction, y_test))
print("KNN Classifier", accuracy_score(knn_y_prediction, y_test))
print("Naive Bayes Classifier", accuracy_score(naive_y_prediction, y_test))

print("Bagging Classifier", accuracy_score(bagging_y_classifier, y_test))
print("Random Forest Classifier", accuracy_score(random_forest_y_classifier, y_test))
print("Ada Boost Classifier", accuracy_score(ada_boost_y_classifier, y_test))
print("GBM Classifier", accuracy_score(gbm_y_classifier, y_test))

Logistic Regression Classifier 0.75
Decision Trees Classifier 0.55
SVN Classifier 0.4
KNN Classifier 0.55
Naive Bayes Classifier 0.5
Bagging Classifier 0.7
Random Forest Classifier 0.55
Ada Boost Classifier 0.55
GBM Classifier 0.6


In [None]:
# Confusion matrix
# ---
# Regardless of the size of the confusion matrix, the method for intepretation is the same.
# The left-hand side contains the predicted values and the actual class labels run across the top.
# The instances that the classifier has correctly predicted run diagonally from the top-left
# to the bottom-right.
# ---
#
print('Logistic Regression Classifier:')
print(confusion_matrix(logistic_y_prediction, y_test))

print('Decision Trees Classifier:')
print(confusion_matrix(decision_y_prediction, y_test))

print('SVN Classifier:')
print(confusion_matrix(svm_y_prediction, y_test))

print('KNN Classifier:')
print(confusion_matrix(knn_y_prediction, y_test))

print('Naive Bayes Classifier:')
print(confusion_matrix(naive_y_prediction, y_test))

print('Bagging Classifier:')
print(confusion_matrix(bagging_y_classifier, y_test))

print('Random Forest Classifier:')
print(confusion_matrix(random_forest_y_classifier, y_test))

print('Ada Boost Classifier:')
print(confusion_matrix(ada_boost_y_classifier, y_test))

print('GBM Classifier:')
print(confusion_matrix(gbm_y_classifier, y_test))

Logistic Regression Classifier:
[[10  4]
 [ 1  5]]
Decision Trees Classifier:
[[7 5]
 [4 4]]
SVN Classifier:
[[5 6]
 [6 3]]
KNN Classifier:
[[5 3]
 [6 6]]
Naive Bayes Classifier:
[[ 1  0]
 [10  9]]
Bagging Classifier:
[[10  5]
 [ 1  4]]
Random Forest Classifier:
[[5 3]
 [6 6]]
Ada Boost Classifier:
[[8 6]
 [3 3]]
GBM Classifier:
[[8 5]
 [3 4]]


In [None]:
# Classification Reports
# ---
#
print("Logistic Regression Classifier", classification_report(logistic_y_prediction, y_test))
print("Decision Trees Classifier", classification_report(decision_y_prediction, y_test))
print("SVN Classifier", classification_report(svm_y_prediction, y_test))
print("KNN Classifier", classification_report(knn_y_prediction, y_test))
print("Naive Bayes Classifier", classification_report(naive_y_prediction, y_test))

print("Bagging Classifier", classification_report(bagging_y_classifier, y_test))
print("Random Forest Classifier", classification_report(random_forest_y_classifier, y_test))
print("Ada Boost Classifier", classification_report(ada_boost_y_classifier, y_test))
print("GBM Classifier", classification_report(gbm_y_classifier, y_test))

Logistic Regression Classifier               precision    recall  f1-score   support

           0       0.91      0.71      0.80        14
           1       0.56      0.83      0.67         6

    accuracy                           0.75        20
   macro avg       0.73      0.77      0.73        20
weighted avg       0.80      0.75      0.76        20

Decision Trees Classifier               precision    recall  f1-score   support

           0       0.64      0.58      0.61        12
           1       0.44      0.50      0.47         8

    accuracy                           0.55        20
   macro avg       0.54      0.54      0.54        20
weighted avg       0.56      0.55      0.55        20

SVN Classifier               precision    recall  f1-score   support

           0       0.45      0.45      0.45        11
           1       0.33      0.33      0.33         9

    accuracy                           0.40        20
   macro avg       0.39      0.39      0.39        20
we

Evaluation our Models

- Accuracy: the percentage of texts that were assigned the correct topic.
- Precision: the percentage of texts the classifier classified correctly out of the total number of texts it predicted for each topic
- Recall: the percentage of texts the model predicted for each topic out of the total number of texts it should have predicted for that topic.
- F1 Score: the average of both precision and recall.