<a href="https://colab.research.google.com/github/AmeyHengle/Arabic-Sentiment-Identification/blob/main/Basic%20ML%20Models/Sentiment_ML_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

In [None]:
# Location of the tweet dataset CSV (the /content prefix matches the Colab
# badge at the top of the notebook — adjust when running elsewhere).
DATA_PATH = '/content/preprocessed_1.csv'

# Load the full dataset into a DataFrame.
train_data = pd.read_csv(DATA_PATH)

# Show the first ten rows as the cell's rich output.
train_data.head(10)

Unnamed: 0.1,Unnamed: 0,tweet,sarcasm,sentiment,dialect
0,0,محمودالعلايلياري الفريق احمدشفيق رقم مهم المعا...,False,NEU,msa
1,1,فيدرر يا اجا والكبار 😍,False,NEU,msa
2,2,الداعون لمبدا الاختلاط الجنسين كالداعين لالغاء...,True,NEG,msa
3,3,مساكين الصبح هوما رايحين راجعين عاي غوغل تعبت ...,True,NEG,gulf
4,4,قل شرق حلب تقل حلب الشرقيه وقل غرب حلب تقل حلب...,False,NEU,msa
5,5,طبيب المقاصه اصابه احمدالشيخ بسيطه تحتاج للراحه,False,NEG,msa
6,6,مرسي مش هنام اكتر اربع ساعات نمت دقيقه كمان زي...,True,NEG,egypt
7,7,انتخبواالبرص مشكله,True,NEG,egypt
8,8,Retweeted هاشتاق العرب فيديو اهالي القدس يرفعو...,False,POS,msa
9,9,▪️محافظ العاصمه تفقد سير العمليه الانتخابيه لج...,False,NEU,msa


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_data['tweet'],      # model input: the tweet text column
    train_data['sentiment'],  # target: the sentiment label column
    test_size=0.2,
    random_state=100,
)

In [4]:
# Convert every split to a NumPy array in one pass; the right-hand side is
# evaluated from the current values before any name is rebound.
X_train, X_test, Y_train, Y_test = (
    np.array(split) for split in (X_train, X_test, Y_train, Y_test)
)

# Sanity check: features and labels must have matching lengths per split.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(10038,) (2510,) (10038,) (2510,)


In [5]:
# TF-IDF features fed into a multinomial Naive Bayes classifier.
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())

# fit() returns the fitted pipeline, so training and held-out prediction
# can be chained in a single expression.
prediction = pipe.fit(X_train, Y_train).predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.63
              precision    recall  f1-score   support

         NEG       0.65      0.61      0.63       908
         NEU       0.62      0.85      0.72      1167
         POS       0.75      0.05      0.09       435

    accuracy                           0.63      2510
   macro avg       0.67      0.51      0.48      2510
weighted avg       0.65      0.63      0.58      2510



In [6]:
# TF-IDF features fed into a Bernoulli Naive Bayes classifier.
# NOTE(review): BernoulliNB's default `binarize=0.0` should threshold the
# TF-IDF weights to term presence/absence — confirm that is intended.
pipe = make_pipeline(TfidfVectorizer(), BernoulliNB())

# fit() returns the fitted pipeline, so training and held-out prediction
# can be chained in a single expression.
prediction = pipe.fit(X_train, Y_train).predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.58
              precision    recall  f1-score   support

         NEG       0.77      0.39      0.52       908
         NEU       0.54      0.95      0.69      1167
         POS       0.33      0.01      0.01       435

    accuracy                           0.58      2510
   macro avg       0.55      0.45      0.41      2510
weighted avg       0.59      0.58      0.51      2510



In [7]:
# TF-IDF features fed into a Complement Naive Bayes classifier.
pipe = make_pipeline(TfidfVectorizer(), ComplementNB())

# fit() returns the fitted pipeline, so training and held-out prediction
# can be chained in a single expression.
prediction = pipe.fit(X_train, Y_train).predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table.
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.67
              precision    recall  f1-score   support

         NEG       0.65      0.69      0.67       908
         NEU       0.70      0.75      0.73      1167
         POS       0.57      0.38      0.46       435

    accuracy                           0.67      2510
   macro avg       0.64      0.61      0.62      2510
weighted avg       0.66      0.67      0.66      2510



In [8]:
# TF-IDF features feeding a support-vector classifier; hyper-parameters are
# selected by an exhaustive grid search with 3-fold cross-validation.
pipe_svm = make_pipeline(TfidfVectorizer(),
                     SVC())

# `gamma` is a kernel coefficient for the 'rbf' and 'poly' kernels only; the
# 'linear' kernel ignores it. Crossing gamma with 'linear' in a single grid
# therefore refits the exact same linear model once per gamma value. The
# search space is split into two sub-grids so every distinct model is still
# evaluated, but only once (36 candidates instead of 48).
param_grid = [
    {'svc__kernel': ['linear'],
     'svc__C': [0.1, 1, 10, 100]},
    {'svc__kernel': ['rbf', 'poly'],
     'svc__gamma': [0.1, 1, 10, 100],
     'svc__C': [0.1, 1, 10, 100]},
]

svc_model = GridSearchCV(pipe_svm, param_grid, cv=3)
svc_model.fit(X_train, Y_train)

# GridSearchCV refits the best candidate on the full training set by default,
# so predict() here uses the selected model.
prediction = svc_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.65
              precision    recall  f1-score   support

         NEG       0.64      0.69      0.66       908
         NEU       0.68      0.76      0.72      1167
         POS       0.59      0.31      0.40       435

    accuracy                           0.65      2510
   macro avg       0.63      0.58      0.59      2510
weighted avg       0.65      0.65      0.64      2510



In [9]:
# TF-IDF features feeding a random forest; hyper-parameters are selected by
# an exhaustive grid search with 5-fold cross-validation.
# The forest is seeded (same seed as the train/test split) because an
# unseeded RandomForestClassifier builds different trees on every run, which
# makes the CV scores, the selected hyper-parameters and the reported
# metrics non-reproducible.
pipe_rf = make_pipeline(TfidfVectorizer(),
                    RandomForestClassifier(random_state=100))

# Search over forest size and the number of features considered per split.
param_grid = {'randomforestclassifier__n_estimators':[10, 100, 1000],
             'randomforestclassifier__max_features':['sqrt', 'log2']}

rf_model = GridSearchCV(pipe_rf, param_grid, cv=5)
rf_model.fit(X_train,Y_train)

# GridSearchCV refits the best candidate on the full training set by default,
# so predict() here uses the selected model.
prediction = rf_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.62
              precision    recall  f1-score   support

         NEG       0.65      0.52      0.58       908
         NEU       0.61      0.83      0.70      1167
         POS       0.62      0.28      0.38       435

    accuracy                           0.62      2510
   macro avg       0.62      0.54      0.55      2510
weighted avg       0.62      0.62      0.60      2510

