In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Load the tweet dataset (lemmatized, going by the file name); the path is
# resolved relative to the current working directory.
df = pd.read_csv("tweets_lemmatized2.csv")

In [9]:
# Class-balance check: number of rows per polarity label.
df["polarity"].value_counts()

Neutral     2201
Positive     940
Negative     863
Name: polarity, dtype: int64

In [10]:
# Model input is the `content` column (presumably the tweet text - confirm);
# the target is the `polarity` label.
X, y = df["content"], df["polarity"]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Token-count (bag-of-words) encoder, left at scikit-learn's default settings.
vector = CountVectorizer()

In [13]:
# Learn the vocabulary from the training texts only and encode them as token counts.
X_train = vector.fit_transform(Xtrain)

In [14]:
# Encode the held-out texts with the vocabulary already learned on the
# training split (transform only, no refit).
X_test = vector.transform(Xtest)

# Multinomial NB

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Baseline: multinomial Naive Bayes on the token counts, default hyper-parameters.
# fit() returns the fitted estimator, so construction and training can be chained.
model = MultinomialNB().fit(X_train, ytrain)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

    Negative       0.58      0.51      0.55       169
     Neutral       0.73      0.87      0.79       454
    Positive       0.67      0.43      0.52       178

    accuracy                           0.69       801
   macro avg       0.66      0.60      0.62       801
weighted avg       0.69      0.69      0.68       801



# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with default hyper-parameters.
# Forest construction is randomized, so the seed is pinned (same value as the
# train/test split); without it this cell reports different scores on every
# run. GridSearchCV clones `rmodel`, so the seed also carries over to the grid
# search that reuses this estimator.
rmodel = RandomForestClassifier(random_state=1)
rmodel.fit(X_train, ytrain)
ypred1 = rmodel.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(ytest, ypred1))

              precision    recall  f1-score   support

    Negative       0.54      0.29      0.38       169
     Neutral       0.69      0.88      0.78       454
    Positive       0.64      0.49      0.55       178

    accuracy                           0.67       801
   macro avg       0.62      0.55      0.57       801
weighted avg       0.65      0.67      0.64       801



In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Random-forest search space: 5 * 5 * 5 * 4 = 500 combinations.
parameters = {
    "n_estimators": [100, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
    "min_samples_split": [2, 5, 10, 15, 100],
    "min_samples_leaf": [1, 2, 5, 10],
}

# Exhaustive 3-fold cross-validated search over the grid, using every CPU core.
# NOTE(review): for single-label multiclass targets micro-averaged F1 is equal
# to accuracy, so "f1_micro" effectively selects the most accurate candidate.
gridmodelrf = GridSearchCV(
    estimator=rmodel,
    param_grid=parameters,
    scoring="f1_micro",
    cv=3,
    n_jobs=-1,
)
gridmodelrf.fit(X_train, ytrain)

# Despite its name, `grmodel` holds the test-set predictions of the best
# estimator (refit on the full training split), not a model.
grmodel = gridmodelrf.predict(X_test)

# Held-out metrics first, then the winning configuration and its mean CV score.
print('After optimization: ' + str(accuracy_score(ytest, grmodel)))
print(classification_report(ytest, grmodel))
print(gridmodelrf.best_params_)
print(gridmodelrf.best_score_)


After optimization: 0.6317103620474407
              precision    recall  f1-score   support

    Negative       0.75      0.11      0.19       169
     Neutral       0.61      0.98      0.75       454
    Positive       0.84      0.24      0.38       178

    accuracy                           0.63       801
   macro avg       0.74      0.44      0.44       801
weighted avg       0.69      0.63      0.55       801

{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


0.6109908888491073


# SVM

In [26]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters.
smodel = SVC()
smodel.fit(X_train, ytrain)
ypred2 = smodel.predict(X_test)

# Print the accuracy explicitly: a bare expression is only displayed when it
# is the last statement of a cell, so the previous un-printed call computed
# the score and silently discarded it.
print('Accuracy: ' + str(accuracy_score(ytest, ypred2)))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(ytest, ypred2))

              precision    recall  f1-score   support

    Negative       0.53      0.34      0.41       169
     Neutral       0.69      0.91      0.78       454
    Positive       0.80      0.45      0.58       178

    accuracy                           0.68       801
   macro avg       0.67      0.56      0.59       801
weighted avg       0.68      0.68      0.66       801



In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
# SVC search space: 5 values of C x 5 values of gamma, RBF kernel only (25 combinations).
svc_parameters = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf"],
}

# Exhaustive 3-fold cross-validated search over the grid, using every CPU core.
# NOTE(review): for single-label multiclass targets micro-averaged F1 is equal
# to accuracy, so "f1_micro" effectively selects the most accurate candidate.
gridmodelsvc = GridSearchCV(
    estimator=smodel,
    param_grid=svc_parameters,
    scoring="f1_micro",
    cv=3,
    n_jobs=-1,
)
gridmodelsvc.fit(X_train, ytrain)

# Despite its name, `svcmodel` holds the test-set predictions of the best
# estimator (refit on the full training split), not a model.
svcmodel = gridmodelsvc.predict(X_test)

# Winning configuration first, then its held-out metrics.
print(gridmodelsvc.best_params_)
print('After optimization: ' + str(accuracy_score(ytest, svcmodel)))
print(classification_report(ytest, svcmodel))

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
After optimization: 0.6741573033707865
              precision    recall  f1-score   support

    Negative       0.51      0.41      0.46       169
     Neutral       0.74      0.83      0.78       454
    Positive       0.61      0.51      0.56       178

    accuracy                           0.67       801
   macro avg       0.62      0.59      0.60       801
weighted avg       0.66      0.67      0.66       801



# KNN

In [22]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-nearest-neighbours classifier with default hyper-parameters.
kmodel = KNeighborsClassifier()
kmodel.fit(X_train, ytrain)
ypred3 = kmodel.predict(X_test)

# Print the accuracy explicitly: a bare expression is only displayed when it
# is the last statement of a cell, so the previous un-printed call computed
# the score and silently discarded it.
print('Accuracy: ' + str(accuracy_score(ytest, ypred3)))

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(ytest, ypred3))

              precision    recall  f1-score   support

    Negative       0.70      0.11      0.19       169
     Neutral       0.62      0.97      0.76       454
    Positive       0.80      0.29      0.43       178

    accuracy                           0.64       801
   macro avg       0.71      0.46      0.46       801
weighted avg       0.68      0.64      0.56       801



In [33]:
# KNN search space: 10 neighbourhood sizes x 2 weighting schemes x 3 distance
# metrics = 60 combinations.
kn_parameters = {
    "n_neighbors": [3, 5, 11, 19, 26, 30, 40, 45, 60, 100],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "cosine"],
}

# Exhaustive 3-fold cross-validated search over the grid, using every CPU core.
# NOTE(review): for single-label multiclass targets micro-averaged F1 is equal
# to accuracy, so "f1_micro" effectively selects the most accurate candidate.
gridmodelkn = GridSearchCV(
    estimator=kmodel,
    param_grid=kn_parameters,
    scoring="f1_micro",
    cv=3,
    n_jobs=-1,
)
gridmodelkn.fit(X_train, ytrain)

# Despite its name, `knmodel` holds the test-set predictions of the best
# estimator (refit on the full training split), not a model.
knmodel = gridmodelkn.predict(X_test)

# Winning configuration first, then its held-out metrics.
print(gridmodelkn.best_params_)
print('After optimization: ' + str(accuracy_score(ytest, knmodel)))
print(classification_report(ytest, knmodel))

{'metric': 'cosine', 'n_neighbors': 3, 'weights': 'distance'}
After optimization: 0.6541822721598003
              precision    recall  f1-score   support

    Negative       0.54      0.30      0.38       169
     Neutral       0.69      0.87      0.77       454
    Positive       0.60      0.45      0.51       178

    accuracy                           0.65       801
   macro avg       0.61      0.54      0.55       801
weighted avg       0.64      0.65      0.63       801

