In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/My Drive/new_textblob_vader (1).csv')
df.head()

Unnamed: 0,review_id,Bank,review,ratings,thumbs_Up_Count,review_App_Version,review_time,manual_labels,ratings_label,tokens,pos_tags,polarity,subjectivity,textblob_label,scores1
0,101,Barclays,not smoothest easiest apps navigate . but choi...,2,28,2.89.0,2024-02-27 12:49:00,Neutral,Negative,not smoothest easiest apps navigate . but choi...,"[('not', 'RB'), ('smoothest', 'JJS'), ('easies...",0.3,0.376667,Positive,0.028302
1,102,Barclays,app has good features eg let 's track spend ca...,3,102,2.88.2,2024-02-10 09:35:00,Neutral,Neutral,app has good features eg let 's track spend ca...,"[('app', 'NN'), ('has', 'VBZ'), ('good', 'JJ')...",0.235926,0.440741,Positive,0.041358
2,103,Barclays,easy app use . easy navigate fast making trans...,5,84,2.89.0,2024-02-20 11:33:00,Positive,Positive,easy app use . easy navigate fast making trans...,"[('easy', 'JJ'), ('app', 'NN'), ('use', 'NN'),...",0.295833,0.557407,Positive,0.050223
3,104,Barclays,pointless app . i can see much i owe and pay o...,1,0,2.89.0,2024-02-28 14:06:00,Negative,Negative,pointless app . i can see much i owe and pay o...,"[('pointless', 'NN'), ('app', 'NN'), ('.', '.'...",0.094444,0.316667,Neutral,0.011333
4,105,Barclays,update . issue was resolved quickly and i 'm r...,5,2,2.89.0,2024-02-27 16:41:00,Positive,Positive,update . issue was resolved quickly and i 'm r...,"[('update', 'NN'), ('.', '.'), ('issue', 'NN')...",0.266667,0.616667,Positive,0.057576


In [4]:
columns_to_drop = ['polarity', 'subjectivity','textblob_label', 'scores1']
df.drop(columns=columns_to_drop, inplace=True)
df.head()

Unnamed: 0,review_id,Bank,review,ratings,thumbs_Up_Count,review_App_Version,review_time,manual_labels,ratings_label,tokens,pos_tags
0,101,Barclays,not smoothest easiest apps navigate . but choi...,2,28,2.89.0,2024-02-27 12:49:00,Neutral,Negative,not smoothest easiest apps navigate . but choi...,"[('not', 'RB'), ('smoothest', 'JJS'), ('easies..."
1,102,Barclays,app has good features eg let 's track spend ca...,3,102,2.88.2,2024-02-10 09:35:00,Neutral,Neutral,app has good features eg let 's track spend ca...,"[('app', 'NN'), ('has', 'VBZ'), ('good', 'JJ')..."
2,103,Barclays,easy app use . easy navigate fast making trans...,5,84,2.89.0,2024-02-20 11:33:00,Positive,Positive,easy app use . easy navigate fast making trans...,"[('easy', 'JJ'), ('app', 'NN'), ('use', 'NN'),..."
3,104,Barclays,pointless app . i can see much i owe and pay o...,1,0,2.89.0,2024-02-28 14:06:00,Negative,Negative,pointless app . i can see much i owe and pay o...,"[('pointless', 'NN'), ('app', 'NN'), ('.', '.'..."
4,105,Barclays,update . issue was resolved quickly and i 'm r...,5,2,2.89.0,2024-02-27 16:41:00,Positive,Positive,update . issue was resolved quickly and i 'm r...,"[('update', 'NN'), ('.', '.'), ('issue', 'NN')..."


In [5]:
#RATINGS_LABEL
# Resampling the minority classes
df_neg = df[df['ratings_label'] == 'Negative']
df_pos = df[df['ratings_label'] == 'Positive']
df_neu = df[df['ratings_label'] == 'Neutral']

In [6]:
label_counts = df['ratings_label'].value_counts()

In [7]:
df_pos_res = resample(df_pos, replace=True, n_samples=len(df_neg), random_state=123)
df_neu_res = resample(df_neu, replace=True, n_samples=len(df_neg), random_state=123)

In [8]:
# Combine resampled data
df_balanced = pd.concat([df_neg, df_pos_res, df_neu_res])

In [9]:
X = df_balanced['tokens']
y = df_balanced['ratings_label']

In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
#SUPPORT VECTOR CLASSIFIER (SVC)
#TRAIN AND EVALUATE THE MODEL
from sklearn.svm import SVC

In [12]:
# Configure the pipeline with TfidfVectorizer and SVC
model = make_pipeline(CountVectorizer(), SVC())

In [13]:
# Fit the model on the training data
model.fit(X_train, y_train)

In [14]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [15]:
# Print the classification report and accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9088345864661654
              precision    recall  f1-score   support

    Negative       0.87      0.90      0.88       368
     Neutral       0.94      0.93      0.94       351
    Positive       0.92      0.90      0.91       345

    accuracy                           0.91      1064
   macro avg       0.91      0.91      0.91      1064
weighted avg       0.91      0.91      0.91      1064



In [16]:
#NAIVE BAYES
#TRAIN AND EVALUATE THE MODEL
from sklearn.naive_bayes import MultinomialNB

In [17]:
# Configure the pipeline with TfidfVectorizer and MultinomialNB
model = make_pipeline(CountVectorizer(), MultinomialNB())

In [18]:
# Fit the model on the training data
model.fit(X_train, y_train)

In [19]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [20]:
# Print the classification report and accuracy (NB)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.831766917293233
              precision    recall  f1-score   support

    Negative       0.80      0.80      0.80       368
     Neutral       0.82      0.85      0.83       351
    Positive       0.88      0.85      0.87       345

    accuracy                           0.83      1064
   macro avg       0.83      0.83      0.83      1064
weighted avg       0.83      0.83      0.83      1064



In [21]:
#RANDOM FOREST
#TRAIN AND EVALUATE THE MODEL
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Configure the pipeline with TfidfVectorizer and RandomForestClassifier
model = make_pipeline(CountVectorizer(), RandomForestClassifier(random_state=0))

In [24]:
# Fit the model on the training data
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

In [26]:
# Print the classification report and accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.931390977443609
              precision    recall  f1-score   support

    Negative       0.90      0.94      0.92       368
     Neutral       1.00      0.93      0.96       351
    Positive       0.91      0.93      0.92       345

    accuracy                           0.93      1064
   macro avg       0.93      0.93      0.93      1064
weighted avg       0.93      0.93      0.93      1064



In [27]:
#LOGISTIC REGRESSION
#TRAIN AND EVALUATE THE MODEL
from sklearn.linear_model import LogisticRegression

In [28]:
# Configure the pipeline with TfidfVectorizer and LogisticRegression
model = make_pipeline(CountVectorizer(), LogisticRegression(random_state=0))

In [29]:
# Fit the model on the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [31]:
# Print the classification report and accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8881578947368421
              precision    recall  f1-score   support

    Negative       0.92      0.79      0.85       368
     Neutral       0.85      0.96      0.90       351
    Positive       0.91      0.92      0.91       345

    accuracy                           0.89      1064
   macro avg       0.89      0.89      0.89      1064
weighted avg       0.89      0.89      0.89      1064



In [33]:
#MANUAL_LABELS
# Resampling the minority classes
df_neg = df[df['manual_labels'] == 'Negative']
df_pos = df[df['manual_labels'] == 'Positive']
df_neu = df[df['manual_labels'] == 'Neutral']

In [34]:
df_pos_res = resample(df_pos, replace=True, n_samples=len(df_neg), random_state=123)
df_neu_res = resample(df_neu, replace=True, n_samples=len(df_neg), random_state=123)

In [35]:
# Combine resampled data
df_balanced = pd.concat([df_neg, df_pos_res, df_neu_res])

In [36]:
X = df_balanced['tokens']
y = df_balanced['manual_labels']

In [37]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [38]:
#SUPPORT VECTOR CLASSIFIER (SVC)
#TRAIN AND EVALUATE THE MODEL
model = make_pipeline(CountVectorizer(), SVC())
model.fit(X_train, y_train)

In [39]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [40]:
# Print the classification report and accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8882410106899903
              precision    recall  f1-score   support

    Negative       0.87      0.87      0.87       364
     Neutral       0.86      0.85      0.86       344
    Positive       0.93      0.95      0.94       321

    accuracy                           0.89      1029
   macro avg       0.89      0.89      0.89      1029
weighted avg       0.89      0.89      0.89      1029



In [41]:
#NAIVE BAYES (NB)
#TRAIN AND EVALUATE THE MODEL
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

In [42]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [43]:
# Print the classification report and accuracy (NB)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8172983479105929
              precision    recall  f1-score   support

    Negative       0.81      0.76      0.79       364
     Neutral       0.78      0.78      0.78       344
    Positive       0.86      0.91      0.88       321

    accuracy                           0.82      1029
   macro avg       0.82      0.82      0.82      1029
weighted avg       0.82      0.82      0.82      1029



In [44]:
#RANDOM FOREST (RF)
#TRAIN AND EVALUATE THE MODEL
model = make_pipeline(CountVectorizer(), RandomForestClassifier(random_state=0))
model.fit(X_train, y_train)

In [45]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [46]:
# Print the classification report and accuracy (RF)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9144800777453839
              precision    recall  f1-score   support

    Negative       0.89      0.91      0.90       364
     Neutral       0.92      0.87      0.89       344
    Positive       0.93      0.96      0.95       321

    accuracy                           0.91      1029
   macro avg       0.92      0.92      0.92      1029
weighted avg       0.91      0.91      0.91      1029



In [50]:
#LOGISTIC REGRESSION (LR)
#TRAIN AND EVALUATE THE MODEL
model = make_pipeline(CountVectorizer(), LogisticRegression(random_state=0))
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
# Predict the labels for the test set
y_pred = model.predict(X_test)

In [52]:
# Print the classification report and accuracy (LR)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8814382896015549
              precision    recall  f1-score   support

    Negative       0.93      0.78      0.85       364
     Neutral       0.83      0.92      0.87       344
    Positive       0.90      0.96      0.93       321

    accuracy                           0.88      1029
   macro avg       0.89      0.89      0.88      1029
weighted avg       0.89      0.88      0.88      1029

