# Sentiment Analysis for Arabic Tweets



-------------------------------------------------------------

First, import needed packages

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

-------------------------------------------------------------
Load the data, and then check sample rows

In [3]:
# Load the labelled tweets; the path is relative to the current working directory.
data = pd.read_csv(filepath_or_buffer="DataClasified.csv")

In [4]:
# Show the first five rows (the default for ``head``) as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,Text,Sentiment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,انشاء الله هنعمل حاجه,1,,,,,,,,,...,,,,,,,,,,
1,اقسم باللله ان العرب اكثر الشعوب تخلفاا,0,,,,,,,,,...,,,,,,,,,,
2,﻿هات ناس تفهم .. و المثل بحكي اسأل مجرب و لا ت...,0,,,,,,,,,...,,,,,,,,,,
3,صرماتي براس اهلك,0,,,,,,,,,...,,,,,,,,,,
4,حرام السخرية من الناس,0,,,,,,,,,...,,,,,,,,,,



-------------------------------------------------------------

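- The `simple_split` function below splits the data into a training set and a test set using the conventional 70 % / 30 % ratio. It does not shuffle: the first 70 % of the rows become the training set and the remaining 30 % the test set.
- Split the data.
- Then check the number of samples per class in each split to verify that the classes are reasonably balanced.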

In [5]:
def simple_split(data,y,length=None,split_mark=0.7):
    """Split ``data`` and ``y`` positionally into a leading train part and a trailing test part.

    The inputs are not shuffled: the first ``n`` items go to the training
    split and everything from position ``n`` onwards goes to the test split.

    Parameters
    ----------
    data : sliceable with a ``.copy()`` method on its slices
        The samples (e.g. a list, NumPy array or pandas Series).
    y : sliceable with a ``.copy()`` method on its slices
        The labels, aligned position-by-position with ``data``.
    length : int, optional
        Number of samples that a fractional ``split_mark`` refers to.
        Defaults to ``len(data)`` when omitted or ``None``.
    split_mark : float or int, default 0.7
        If strictly between 0 and 1, the fraction of ``length`` placed in the
        training split. Any other value is truncated with ``int`` and used
        directly as the split position ``n`` (so ``1.0`` means "one sample",
        not "100 %").

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; each element is a copy of the
        corresponding slice, so modifying it does not touch the inputs.
    """
    if length is None:
        # Callers almost always pass len(data); derive it when they do not.
        length = len(data)

    if 0.0 < split_mark < 1.0:
        n = int(split_mark * length)
    else:
        n = int(split_mark)

    X_train = data[:n].copy()
    X_test = data[n:].copy()
    y_train = y[:n].copy()
    y_test = y[n:].copy()
    return X_train,X_test,y_train,y_test


# Positional 70/30 split (default split_mark) of the tweet text and its sentiment labels.
X_train, X_test, y_train, y_test = simple_split(
    data["Text"], data["Sentiment"], len(data)
)
# Label counts per class: first line is the training split, second line the test split.
for split_labels in (y_train, y_test):
    print("Samples per class: {}".format(np.bincount(split_labels)))

Samples per class: [1217 1262]
Samples per class: [554 509]



-------------------------------------------------------------

Use `CountVectorizer` to build the bag-of-words representation of the tweets, then inspect the learned vocabulary.

In [6]:
# Learn the vocabulary from the training tweets only, then encode both splits with it.
# NOTE(review): X_train / X_test are rebound from raw text to the vectorizer's output
# here, so this cell cannot be re-run on its own -- re-run the split cell first.
vectorizer = CountVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# NOTE(review): the second print emits the entire token -> column-index mapping.
vocabulary = vectorizer.vocabulary_
print("Vocabulary size: {}".format(len(vocabulary)))
print("Vocabulary content:\n {}".format(vocabulary))

Vocabulary size: 10589
Vocabulary content:
 {'انشاء': 3042, 'الله': 2484, 'هنعمل': 8782, 'حاجه': 4740, 'اقسم': 996, 'باللله': 3368, 'ان': 2993, 'العرب': 2196, 'اكثر': 1014, 'الشعوب': 2033, 'تخلفاا': 4180, 'هات': 8666, 'ناس': 8407, 'تفهم': 4391, 'المثل': 2549, 'بحكي': 3545, 'اسأل': 743, 'مجرب': 7836, 'لا': 7183, 'تسأل': 4247, 'خبير': 4995, 'صرماتي': 6005, 'براس': 3618, 'اهلك': 3110, 'حرام': 4805, 'السخرية': 1925, 'من': 8258, 'الناس': 2785, 'طوقان': 6169, 'بدور': 3590, 'منصب': 8283, 'يضل': 10236, 'فيه': 6811, 'عشر': 6354, 'سنين': 5697, 'لقدام': 7403, 'لاسئلان': 7210, 'في': 6783, 'نووي': 8649, 'سلامة': 5642, 'الاردن': 1211, 'عمرتنا': 6449, 'كانت': 7011, 'مميزة': 8256, 'جدا': 4624, 'ورائعة': 9408, 'يمكن': 10412, 'جايين': 4616, 'سكارى': 5631, 'ويغنون': 9842, 'يا': 9871, 'سلام': 5640, 'ما': 7663, 'أحلى': 110, 'أن': 292, 'يواسي': 10466, 'المسلم': 2633, 'أخاه': 120, 'هذه': 8741, 'هي': 8809, 'أخلاق': 134, 'الإسلام': 1142, 'التي': 1542, 'أصبحنا': 200, 'نفتقدها': 8587, 'يوما': 10491, 'بعد': 3724,


-------------------------------------------------------------

Try LogisticRegression and print the score and confusion matrix

In [25]:
# C is the inverse regularisation strength, so C=10000 means very weak regularisation.
logreg = LogisticRegression(C=10000).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)

# Mean accuracy on the data the model was fitted on vs. the held-out data.
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

# Rows are the true labels, columns are the predicted labels.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_logreg)
print("Confusion matrix:\n{}".format(confusion))

Training set score: 1.000
Test set score: 0.688
Confusion matrix:
[[394 160]
 [172 337]]



-------------------------------------------------------------

Try Multinomial Naive Bayes and print the score and confusion matrix

In [42]:
# Multinomial naive Bayes on the raw token counts, with smoothing alpha=0.7.
nb = MultinomialNB(alpha=0.7, class_prior=None, fit_prior=True).fit(X_train, y_train)
pred_nb = nb.predict(X_test)

# Mean accuracy on the data the model was fitted on vs. the held-out data.
print("Training set score: {:.5f}".format(nb.score(X_train, y_train)))
print("Test set score: {:.5f}".format(nb.score(X_test, y_test)))

# Rows are the true labels, columns are the predicted labels.
confusion = confusion_matrix(y_true=y_test, y_pred=pred_nb)
print("Confusion matrix:\n{}".format(confusion))

Training set score: 0.98225
Test set score: 0.73565
Confusion matrix:
[[414 140]
 [141 368]]




-------------------------------------------------------------

Try RandomForestClassifier and print the score and confusion matrix

In [92]:
# A random forest is stochastic (bootstrap samples and random feature subsets), so fix
# random_state to make the scores below reproducible under Restart & Run All.
# NOTE: outputs recorded before the seed was added came from an unseeded run and may
# differ slightly from a fresh run.
rf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=3,
    criterion="entropy",
    random_state=42,
)
rf.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out data.
print("Training set score: {:.5f}".format(rf.score(X_train, y_train)))
print("Test set score: {:.5f}".format(rf.score(X_test, y_test)))

# Rows are the true labels, columns are the predicted labels.
pred_rf = rf.predict(X_test)
confusion = confusion_matrix(y_test, pred_rf)
print("Confusion matrix:\n{}".format(confusion))

Training set score: 0.99960
Test set score: 0.67827
Confusion matrix:
[[464  90]
 [252 257]]


-------------------------------------------------------------

Now test the three classifiers on a positive sample (the result should be 1)

In [93]:
tweet = "كل سنة وانت طيب"

# Encode the tweet once with the fitted vocabulary, then query each classifier in turn
# (logistic regression, naive Bayes, random forest).
tweet_counts = vectorizer.transform([tweet])
for clf in (logreg, nb, rf):
    print(clf.predict(tweet_counts)[0])

1
1
1


-------------------------------------------------------------

Now test the three classifiers on a negative sample (the result should be 0)

In [98]:
tweet = "لا اوافق على هذا الرأي"

# Encode the tweet once with the fitted vocabulary, then query each classifier in turn
# (logistic regression, naive Bayes, random forest).
tweet_counts = vectorizer.transform([tweet])
for clf in (logreg, nb, rf):
    print(clf.predict(tweet_counts)[0])

0
0
0


==========================================================================

Now you can try it yourself: enter a tweet and see what each classifier predicts

In [103]:
tweet = input("Enter Tweet:")

# Encode the entered text once, then print each classifier's predicted label.
tweet_counts = vectorizer.transform([tweet])
for label, clf in (
    ("Sentiment LogisticRegression", logreg),
    ("Sentiment MultinomialNB", nb),
    ("Sentiment RandomForestClassifier", rf),
):
    print(label, clf.predict(tweet_counts)[0])

Enter Tweet:نقد غير بناء
Sentiment LogisticRegression 1
Sentiment MultinomialNB 0
Sentiment RandomForestClassifier 0
