In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 
import os

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning Models
from sklearn import svm  
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

# Model Evaluation Libraries
from sklearn.metrics import classification_report, confusion_matrix

In [48]:
data = pd.read_csv("RUD-2.csv")

In [49]:
data.head()

Unnamed: 0,Text,Sentiment (POS/NEG/NEU)
0,Shan Food ki quality bohat zabardast ha ...boh...,pos
1,ye bohat mazaydar ha,pos
2,"Shan food bohat achi company hain, mujay in k ...",pos
3,bohat acha pakistani brand ha..zabardast quality,pos
4,Hamare ghar me yehi msale use hote hain meri a...,pos


In [25]:
train =  pd.read_csv('RUD-2.csv')
test =   pd.read_csv('RUD-2.csv')
print('Shape of Training Set ' , train.shape,'\nShape of Testing Set ', test.shape)

Shape of Training Set  (21940, 2) 
Shape of Testing Set  (21940, 2)


In [26]:
data =  pd.concat([train, test]).reset_index(drop=True)
print(data.shape)

(43880, 2)


In [27]:
df =  data.copy()

In [28]:
df.head()

Unnamed: 0,Text,Sentiment (POS/NEG/NEU)
0,Shan Food ki quality bohat zabardast ha ...boh...,pos
1,ye bohat mazaydar ha,pos
2,"Shan food bohat achi company hain, mujay in k ...",pos
3,bohat acha pakistani brand ha..zabardast quality,pos
4,Hamare ghar me yehi msale use hote hain meri a...,pos


In [29]:
#sns.countplot( x = 'Sentiment (POS/NEG/NEU)', data = df );

In [30]:
le = LabelEncoder()
le.fit(df['Sentiment (POS/NEG/NEU)'])
df['encoded_Sentiment (POS/NEG/NEU)'] = le.transform(df['Sentiment (POS/NEG/NEU)'])

In [31]:
df.head()

Unnamed: 0,Text,Sentiment (POS/NEG/NEU),encoded_Sentiment (POS/NEG/NEU)
0,Shan Food ki quality bohat zabardast ha ...boh...,pos,15
1,ye bohat mazaydar ha,pos,15
2,"Shan food bohat achi company hain, mujay in k ...",pos,15
3,bohat acha pakistani brand ha..zabardast quality,pos,15
4,Hamare ghar me yehi msale use hote hain meri a...,pos,15


In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(df['Text'], df['encoded_Sentiment (POS/NEG/NEU)'], test_size = 0.30, random_state = 7)

In [33]:
print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of Y_train', Y_train.shape)
print('Shape of Y_test', Y_test.shape)

Shape of X_train (30716,)
Shape of X_test (13164,)
Shape of Y_train (30716,)
Shape of Y_test (13164,)


In [34]:
max_feature_num = 50000
vectorizer = TfidfVectorizer(max_features=max_feature_num)
train_vecs = vectorizer.fit_transform(X_train)
test_vecs = TfidfVectorizer(max_features=max_feature_num, vocabulary=vectorizer.vocabulary_).fit_transform(X_test)

In [35]:
def SVM_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    SVM = svm.LinearSVC(max_iter=100)
    SVM.fit(train_vecs, Y_train)

    # Testing
    test_predictionSVM = SVM.predict(test_vecs)
    return classification_report(test_predictionSVM, Y_test), confusion_matrix(test_predictionSVM, Y_test)

In [36]:
def LR_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    LR = LogisticRegression()
    LR.fit(train_vecs, Y_train)

    # testing
    test_predictionLR = LR.predict(test_vecs)
    return classification_report(test_predictionLR, Y_test) , confusion_matrix(test_predictionLR, Y_test)

In [37]:
def DT_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    DT = DecisionTreeClassifier(max_depth = 9, random_state = 23 )
    DT.fit(train_vecs, Y_train)

    # Testing
    test_predictionDT = DT.predict(test_vecs)
    return classification_report(test_predictionDT, Y_test), confusion_matrix(test_predictionDT, Y_test) 

In [38]:
def XGB_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    XGB = xgb.XGBClassifier(colsample_bytree = 0.2, learning_rate = 0.01, n_estimators = 100)
    XGB.fit(train_vecs, Y_train)

    # Testing
    test_predictionXGB = XGB.predict(test_vecs)
    return classification_report(test_predictionXGB, Y_test), confusion_matrix(test_predictionXGB, Y_test) 

In [39]:
def RF_classifier(train_vecs, Y_train, test_vecs, Y_test):
    # Training
    RF = RandomForestClassifier(n_estimators = 450, max_depth=9, random_state=43)
    RF.fit(train_vecs, Y_train)

    # Testing
    test_predictionRF = RF.predict(test_vecs)
    return classification_report(test_predictionRF, Y_test), confusion_matrix(test_predictionRF, Y_test)

In [40]:
class_report , conf_matrix = SVM_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of SVM Classifier on TF-IDF Vectorizer')
print(class_report)
print(conf_matrix)

Results of SVM Classifier on TF-IDF Vectorizer
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       865
           2       0.71      0.78      0.74      1372
           4       0.67      1.00      0.80         2
           6       0.88      0.83      0.85      1817
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         0
           9       0.80      0.83      0.81      1807
          10       0.00      0.00      0.00         0
          11       0.93      0.87      0.90      5729
          12       1.00      1.00      1.00         1
          15       0.83      0.90      0.86      1569

    accuracy                           0.85     13164
   macro avg       0.60      0.64      0.62     13164
weighted avg       0.86      0.85      0.85     13164

[[ 736   65    0   20    0    0   29    0   15    0    0]
 [  70 1067    1  126    0    0   21    0   77    0   10]
 [   0    0    2    0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
class_report , conf_matrix = LR_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Logistic Regression Classifier on TF-IDF Vectorizer')
print(class_report)
print(conf_matrix)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results of Logistic Regression Classifier on TF-IDF Vectorizer
              precision    recall  f1-score   support

           0       0.50      0.77      0.60       614
           2       0.56      0.68      0.61      1231
           4       0.00      0.00      0.00         0
           6       0.78      0.78      0.78      1710
           8       0.00      0.00      0.00         0
           9       0.61      0.77      0.68      1492
          10       0.00      0.00      0.00         0
          11       0.93      0.74      0.83      6733
          12       0.00      0.00      0.00         0
          15       0.70      0.86      0.77      1384

    accuracy                           0.76     13164
   macro avg       0.41      0.46      0.43     13164
weighted avg       0.79      0.76      0.77     13164

[[ 474   72    0   26    0   24    0   15    0    3]
 [ 129  837    1  171    0   16    0   62    0   15]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 106  202    1 13

In [42]:
class_report , conf_matrix = DT_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Decision Tree Classifier on TF-IDF Vectorizer')
print(class_report)
print(conf_matrix)

Results of Decision Tree Classifier on TF-IDF Vectorizer
              precision    recall  f1-score   support

           0       0.34      0.45      0.39       727
           2       0.11      0.32      0.16       509
           4       0.00      0.00      0.00         0
           6       0.39      0.57      0.46      1173
           8       0.00      0.00      0.00         0
           9       0.16      0.78      0.26       384
          10       0.00      0.00      0.00         0
          11       0.92      0.51      0.66      9753
          12       1.00      1.00      1.00         1
          15       0.24      0.66      0.35       617

    accuracy                           0.52     13164
   macro avg       0.32      0.43      0.33     13164
weighted avg       0.76      0.52      0.58     13164

[[ 327  146    0   67    0  109    0   71    0    7]
 [  43  161    0  108    0   26    0  157    0   14]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 136  224    1  669    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
class_report , conf_matrix = XGB_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Xgboost Classifier on TF-IDF Vectorizer')
print(class_report)
print(conf_matrix)

Results of Xgboost Classifier on TF-IDF Vectorizer
              precision    recall  f1-score   support

           0       0.11      0.81      0.20       132
           2       0.08      0.70      0.15       177
           4       0.00      0.00      0.00         0
           6       0.36      0.74      0.48       826
           8       0.00      0.00      0.00         0
           9       0.07      0.82      0.12       150
          10       0.00      0.00      0.00         0
          11       1.00      0.46      0.63     11605
          12       0.00      0.00      0.00         0
          15       0.15      0.91      0.25       274

    accuracy                           0.50     13164
   macro avg       0.18      0.44      0.18     13164
weighted avg       0.91      0.50      0.60     13164

[[ 107   17    0    8    0    0    0    0    0    0]
 [  25  124    0   28    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [  86  120    1  612    0    0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
class_report , conf_matrix = RF_classifier(train_vecs, Y_train, test_vecs, Y_test)
print('Results of Random Forest Classifier on TF-IDF Vectorizer')
print(class_report)
print(conf_matrix)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results of Random Forest Classifier on TF-IDF Vectorizer
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           6       0.00      0.60      0.00         5
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       1.00      0.41      0.58     13159
          12       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0

    accuracy                           0.41     13164
   macro avg       0.10      0.10      0.06     13164
weighted avg       1.00      0.41      0.58     13164

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   2    0    0    3    