In [43]:

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [44]:
df=pd.read_csv("annotated_tweets_afghans_lemmatized.csv")

In [45]:
df['Sentiment'].value_counts()

Neutral     894
Negative    521
Positive     85
Name: Sentiment, dtype: int64

In [46]:
X=df['Tweet']
y=df['Sentiment']

In [47]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,test_size=0.2,stratify=y,random_state=1)

In [48]:
ytest

987      Neutral
859     Positive
924      Neutral
318     Negative
68       Neutral
          ...   
564     Negative
263      Neutral
693      Neutral
1261     Neutral
1105     Neutral
Name: Sentiment, Length: 300, dtype: object

In [49]:
vector=CountVectorizer()

In [50]:
X_train=vector.fit_transform(Xtrain)

In [51]:
X_test=vector.transform(Xtest)

In [52]:
#MULTINOMIAL NAIVE BAYES

In [53]:
from sklearn.naive_bayes import MultinomialNB


In [38]:
model=MultinomialNB()
model.fit(X_train,ytrain)
y_pred=model.predict(X_test)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

    Negative       0.62      0.66      0.64       104
     Neutral       0.76      0.80      0.78       179
    Positive       0.00      0.00      0.00        17

    accuracy                           0.71       300
   macro avg       0.46      0.49      0.47       300
weighted avg       0.67      0.71      0.69       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
from sklearn.ensemble import RandomForestClassifier
rmodel=RandomForestClassifier()
rmodel.fit(X_train,ytrain)
ypred1=rmodel.predict(X_test)
print(classification_report(ytest,ypred1))

              precision    recall  f1-score   support

    Negative       0.70      0.38      0.49       104
     Neutral       0.68      0.93      0.79       179
    Positive       1.00      0.06      0.11        17

    accuracy                           0.69       300
   macro avg       0.79      0.45      0.46       300
weighted avg       0.71      0.69      0.64       300



In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
parameters = dict(n_estimators = [100,300,500,800,1200], max_depth = [5, 8, 15, 25, 30],  
              min_samples_split = [2, 5, 10, 15, 100], 
             min_samples_leaf = [1, 2, 5, 10])
gridmodelrf = GridSearchCV(rmodel, parameters, cv = 3,n_jobs=-1,scoring="f1_macro")
gridmodelrf.fit(X_train, ytrain)
grmodel=gridmodelrf.predict(X_test)
print('After optimization: '+ str(accuracy_score(ytest, grmodel)))
print(classification_report(ytest,grmodel))
print(gridmodelrf.best_params_)
print(gridmodelrf.best_score_)

After optimization: 0.6633333333333333
              precision    recall  f1-score   support

    Negative       0.78      0.24      0.37       104
     Neutral       0.65      0.97      0.78       179
    Positive       0.00      0.00      0.00        17

    accuracy                           0.66       300
   macro avg       0.48      0.40      0.38       300
weighted avg       0.66      0.66      0.59       300

{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
0.3463582390646967


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
from sklearn.svm import SVC
smodel=SVC()
smodel.fit(X_train, ytrain)
ypred2=smodel.predict(X_test)
accuracy_score(ytest,ypred2)
print(classification_report(ytest,ypred2))

              precision    recall  f1-score   support

    Negative       0.71      0.38      0.50       104
     Neutral       0.69      0.94      0.79       179
    Positive       0.00      0.00      0.00        17

    accuracy                           0.69       300
   macro avg       0.47      0.44      0.43       300
weighted avg       0.66      0.69      0.65       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from sklearn.neighbors import KNeighborsClassifier
kmodel = KNeighborsClassifier()
kmodel.fit(X_train, ytrain)
ypred3=kmodel.predict(X_test)
accuracy_score(ytest,ypred3)
print(classification_report(ytest,ypred3))

              precision    recall  f1-score   support

    Negative       0.80      0.08      0.14       104
     Neutral       0.61      0.99      0.75       179
    Positive       0.00      0.00      0.00        17

    accuracy                           0.62       300
   macro avg       0.47      0.36      0.30       300
weighted avg       0.64      0.62      0.50       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
from sklearn.linear_model import LogisticRegression

In [54]:
log_reg = LogisticRegression()

log_reg.fit(X_train, ytrain)
ypred4 = log_reg.predict(X_test)
accuracy_score(ytest,ypred4)
print(classification_report(ytest,ypred4))

              precision    recall  f1-score   support

    Negative       0.61      0.52      0.56       104
     Neutral       0.72      0.84      0.78       179
    Positive       0.33      0.06      0.10        17

    accuracy                           0.68       300
   macro avg       0.55      0.47      0.48       300
weighted avg       0.66      0.68      0.66       300

