<a href="https://colab.research.google.com/github/AmeyHengle/Arabic-Sentiment-Identification/blob/main/test/Sarcasm_ML_BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the preprocessed tweet corpus from a Colab-style absolute path and
# preview it. The previewed columns include `tweet` (text) and `sarcasm` (label).
DATA_PATH = '/content/preprocessed_1.csv'
train_data = pd.read_csv(DATA_PATH)
train_data.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcasm,sentiment,dialect
0,0,محمودالعلايلياري الفريق احمدشفيق رقم مهم المعا...,False,NEU,msa
1,1,فيدرر يا اجا والكبار 😍,False,NEU,msa
2,2,الداعون لمبدا الاختلاط الجنسين كالداعين لالغاء...,True,NEG,msa
3,3,مساكين الصبح هوما رايحين راجعين عاي غوغل تعبت ...,True,NEG,gulf
4,4,قل شرق حلب تقل حلب الشرقيه وقل غرب حلب تقل حلب...,False,NEU,msa


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# Features are the raw tweet text, the target is the boolean `sarcasm` column.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_data['tweet'],
    train_data['sarcasm'],
    test_size=0.2,
    random_state=100,
)

In [None]:
# Build the vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode each tweet as a row of integer token ids, zero-padded on the right.
# This rebinds X_train, so do NOT re-run the cell: a second pass would hand
# already-encoded sequences to texts_to_sequences instead of raw text.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post')

In [None]:
# Sanity check: show which container type X_train is bound to after padding.
type(X_train)

numpy.ndarray

In [None]:
# Encode the held-out tweets with the tokenizer that was fitted on the training split.
# This rebinds X_test, so the cell is not idempotent: running it again would pass
# the already-encoded integer lists back into texts_to_sequences.
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad/truncate the encoded test tweets to the same width as the training matrix.
# The width is read from X_train rather than hard-coded, so the two matrices stay
# aligned if the training data (and therefore its longest tweet) changes.
# Test tweets longer than that width are truncated by pad_sequences' default
# `truncating='pre'` setting, i.e. from the front.
X_test = pad_sequences(X_test, maxlen=X_train.shape[1], padding='post')

In [None]:
# Inspect the dimensions of the padded training matrix (rows, padded length).
X_train.shape

(10038, 49)

In [None]:
# X_train is already an ndarray (it is the output of pad_sequences), so only the
# test features and the two label vectors are converted here.
X_test, Y_train, Y_test = (np.array(obj) for obj in (X_test, Y_train, Y_test))

In [None]:
# Confirm that feature and label row counts line up for both splits.
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

(10038, 49)
(10038,)
(2510, 49)
(2510,)


In [None]:
# Peek at the first encoded training example: token ids followed by padding.
X_train[0]

array([13696,   694,  3197,  1913,   118, 13697,  7723,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)

In [None]:
# Multinomial Naive Bayes baseline, wrapped in a single-step pipeline.
# NOTE(review): X_train holds padded token ids rather than per-word counts;
# confirm this is the intended input representation for a multinomial model.
m_nb = make_pipeline(MultinomialNB()).fit(X_train, Y_train)

# Evaluate on the held-out split.
pred = m_nb.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, pred):.2f}")
print(classification_report(Y_test, pred))

Accuracy score is 0.77
              precision    recall  f1-score   support

       False       0.83      0.92      0.87      2060
        True       0.23      0.12      0.16       450

    accuracy                           0.77      2510
   macro avg       0.53      0.52      0.51      2510
weighted avg       0.72      0.77      0.74      2510



In [None]:
# Bernoulli Naive Bayes baseline, wrapped in a single-step pipeline.
b_nb = make_pipeline(BernoulliNB()).fit(X_train, Y_train)

# Evaluate on the held-out split.
pred = b_nb.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, pred):.2f}")
print(classification_report(Y_test, pred))

Accuracy score is 0.79
              precision    recall  f1-score   support

       False       0.83      0.94      0.88      2060
        True       0.30      0.12      0.17       450

    accuracy                           0.79      2510
   macro avg       0.57      0.53      0.53      2510
weighted avg       0.74      0.79      0.75      2510



In [None]:
# Complement Naive Bayes baseline, wrapped in a single-step pipeline.
c_nb = make_pipeline(ComplementNB()).fit(X_train, Y_train)

# Evaluate on the held-out split.
pred = c_nb.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, pred):.2f}")
print(classification_report(Y_test, pred))

Accuracy score is 0.77
              precision    recall  f1-score   support

       False       0.83      0.92      0.87      2060
        True       0.23      0.12      0.16       450

    accuracy                           0.77      2510
   macro avg       0.53      0.52      0.51      2510
weighted avg       0.72      0.77      0.74      2510



In [None]:
# Hyper-parameter search for an SVM classifier using 3-fold cross-validation.
pipe_svm = make_pipeline(SVC())

# `gamma` is a kernel coefficient for the 'rbf' and 'poly' kernels only; the
# linear kernel ignores it. The grid is split into two sub-grids so the linear
# kernel is fitted once per C value instead of once per (C, gamma) pair, which
# would refit the same model four times.
param_grid = [
    {'svc__kernel': ['rbf', 'poly'],
     'svc__gamma': [0.1, 1, 10, 100],
     'svc__C': [0.1, 1, 10, 100]},
    {'svc__kernel': ['linear'],
     'svc__C': [0.1, 1, 10, 100]},
]

svc_model = GridSearchCV(pipe_svm, param_grid, cv=3)
svc_model.fit(X_train, Y_train)

# Evaluate the refitted best estimator on the held-out split.
prediction = svc_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

In [None]:
# Fit one SVM with fixed hyper-parameters (presumably taken from the grid search;
# confirm) and evaluate it on the held-out split.
svm = make_pipeline(SVC(C=1, kernel='poly', gamma=1))
svm.fit(X_train, Y_train)
pred = svm.predict(X_test)

# Score this model's own predictions (`pred`), not the `prediction` variable
# produced by the grid-search cell, which belongs to a different estimator.
print(f"Accuracy score is {accuracy_score(Y_test, pred):.2f}")
print(classification_report(Y_test, pred))

In [None]:
# Hyper-parameter search for a random forest using 5-fold cross-validation.
# The forest is seeded (same seed as the train/test split) so that repeated runs
# of the notebook select the same model and print the same metrics.
pipe_rf = make_pipeline(RandomForestClassifier(random_state=100))

param_grid = {
    'randomforestclassifier__n_estimators': [10, 100, 1000],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
}

rf_model = GridSearchCV(pipe_rf, param_grid, cv=5)
rf_model.fit(X_train, Y_train)

# Evaluate the refitted best estimator on the held-out split.
prediction = rf_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))