In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

In [None]:
train_data = pd.read_csv('/content/preprocessed_1.csv')
train_data.head()

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(train_data['tweet'], train_data['sarcasm'], test_size =0.2, random_state=100)

In [60]:
X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(10038,) (2510,) (10038,) (2510,)


In [37]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [70]:
pipe = make_pipeline(TfidfVectorizer(),
                    MultinomialNB())
pipe.fit(X_train,Y_train)
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.82
              precision    recall  f1-score   support

       False       0.82      1.00      0.90      2060
        True       0.00      0.00      0.00       450

    accuracy                           0.82      2510
   macro avg       0.41      0.50      0.45      2510
weighted avg       0.67      0.82      0.74      2510



In [71]:
pipe = make_pipeline(TfidfVectorizer(),
                    BernoulliNB())
pipe.fit(X_train,Y_train)
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.82
              precision    recall  f1-score   support

       False       0.82      1.00      0.90      2060
        True       0.62      0.01      0.02       450

    accuracy                           0.82      2510
   macro avg       0.72      0.50      0.46      2510
weighted avg       0.79      0.82      0.74      2510



In [72]:
pipe = make_pipeline(TfidfVectorizer(),
                    ComplementNB())
pipe.fit(X_train,Y_train)
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.83
              precision    recall  f1-score   support

       False       0.85      0.96      0.90      2060
        True       0.55      0.22      0.32       450

    accuracy                           0.83      2510
   macro avg       0.70      0.59      0.61      2510
weighted avg       0.80      0.83      0.80      2510



In [73]:
pipe_svm = make_pipeline(TfidfVectorizer(),
                     SVC())
param_grid = {'svc__kernel': ['rbf', 'linear', 'poly'],
             'svc__gamma': [0.1, 1, 10, 100],
             'svc__C': [0.1, 1, 10, 100]}

svc_model = GridSearchCV(pipe_svm, param_grid, cv=3)
svc_model.fit(X_train, Y_train)

prediction = svc_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.83
              precision    recall  f1-score   support

       False       0.84      0.98      0.91      2060
        True       0.63      0.15      0.24       450

    accuracy                           0.83      2510
   macro avg       0.73      0.56      0.57      2510
weighted avg       0.80      0.83      0.79      2510



In [74]:
pipe_rf = make_pipeline(TfidfVectorizer(),
                    RandomForestClassifier())

param_grid = {'randomforestclassifier__n_estimators':[10, 100, 1000],
             'randomforestclassifier__max_features':['sqrt', 'log2']}

rf_model = GridSearchCV(pipe_rf, param_grid, cv=5)
rf_model.fit(X_train,Y_train)

prediction = rf_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")

Accuracy score is 0.83


In [75]:
print(classification_report(Y_test, prediction))

              precision    recall  f1-score   support

       False       0.83      0.99      0.90      2060
        True       0.63      0.08      0.14       450

    accuracy                           0.83      2510
   macro avg       0.73      0.53      0.52      2510
weighted avg       0.80      0.83      0.77      2510

