In [104]:
!pip install pandas
!pip install sklearn
!pip install matplotlib
!pip install keras



In [105]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC

In [106]:
tweets = pd.read_csv("sarcasm_dataset.csv")

tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0


In [107]:
# dop NaN columns
# tweets = tweets.dropna(any=[''])
tweets = tweets.drop(tweets.columns[0], axis=1)

tweets.tail()

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
3463,The population spike in Chicago in 9 months is...,0,,,,,,
3464,You'd think in the second to last English clas...,0,,,,,,
3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,,
3466,Couldn't be prouder today. Well done to every ...,0,,,,,,
3467,Overheard as my 13 year old games with a frien...,0,,,,,,


## Binary Classification

In [108]:
# dropping unnecessary columns for binary classification
tweets1 = tweets.drop(['sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question'], axis=1)
tweets1.dropna(how='any', inplace=True)
tweets1.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [109]:
# Convert label from float to int
def transform_float_to_int(value):
    return int(value)

tweets1['sarcastic'] = tweets1.sarcastic.apply(transform_float_to_int)

In [110]:
# Split Tweet and Sarcastic Column
X = tweets1['tweet']
y = tweets1['sarcastic']

In [111]:
# Split Train & Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, train_size=0.75)

# Apply Machine Learning Algorithm for Binary Classification Using BoW Row Count Ngram (Bigram & Trigram)

In [112]:
# BoW using Count Vertorizer using bigram
# ngram range from 2 to 3 (bigram & trigram)
cv = CountVectorizer(ngram_range=(2, 3), stop_words='english', binary='True')

X_train_dtm = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)

In [113]:
# Applying Logistic Regression
model = LogisticRegression()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.00      0.00      0.00       227

    accuracy                           0.74       867
   macro avg       0.37      0.50      0.42       867
weighted avg       0.54      0.74      0.63       867



In [114]:
# Applying SVM
model = SVC()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.00      0.00      0.00       227

    accuracy                           0.74       867
   macro avg       0.37      0.50      0.42       867
weighted avg       0.54      0.74      0.63       867



In [115]:
# Applying Random Forest Classifier
model = RandomForestClassifier()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.00      0.00      0.00       227

    accuracy                           0.74       867
   macro avg       0.37      0.50      0.42       867
weighted avg       0.54      0.74      0.63       867



In [116]:
# Applying Gaussian Naive Bayes Classifier
model = GaussianNB()
model.fit(X_train_dtm.toarray(), y_train)
y_pred = model.predict(X_test_dtm.toarray())
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.27      0.40       640
           1       0.27      0.76      0.40       227

    accuracy                           0.40       867
   macro avg       0.52      0.52      0.40       867
weighted avg       0.63      0.40      0.40       867



In [117]:
# Applying Preceptron Classifier
model = Perceptron(tol=1e-3, random_state=0)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      0.93      0.82       640
           1       0.26      0.07      0.11       227

    accuracy                           0.70       867
   macro avg       0.50      0.50      0.47       867
weighted avg       0.61      0.70      0.64       867



# Apply Machine Learning Algorithm for Binary Classification Using BoW TFIDF Ngram (Bigram & Trigram)

In [118]:
# BoW using Count Vertorizer using bigram
# ngram range from 2 to 3 (bigram & trigram)
tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', binary=True, use_idf=True)

X_train_dtm = tfidf.fit_transform(X_train)
X_test_dtm = tfidf.transform(X_test)

In [119]:
# Applying Logistic Regression
model = LogisticRegression()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.00      0.00      0.00       227

    accuracy                           0.74       867
   macro avg       0.37      0.50      0.42       867
weighted avg       0.54      0.74      0.63       867



In [120]:
# Applying SVM
model = SVC()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.00      0.00      0.00       227

    accuracy                           0.74       867
   macro avg       0.37      0.50      0.42       867
weighted avg       0.54      0.74      0.63       867



In [121]:
# Applying Random Forest Classifier
model = RandomForestClassifier()
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       640
           1       0.25      0.00      0.01       227

    accuracy                           0.74       867
   macro avg       0.49      0.50      0.43       867
weighted avg       0.61      0.74      0.63       867



In [122]:
# Applying Gaussian Naive Bayes Classifier
model = GaussianNB()
model.fit(X_train_dtm.toarray(), y_train)
y_pred = model.predict(X_test_dtm.toarray())
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.27      0.40       640
           1       0.27      0.76      0.40       227

    accuracy                           0.40       867
   macro avg       0.52      0.52      0.40       867
weighted avg       0.63      0.40      0.40       867



In [123]:
# Applying Preceptron Classifier
model = Perceptron(tol=1e-3, random_state=0)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.74      0.90      0.81       640
           1       0.24      0.09      0.13       227

    accuracy                           0.69       867
   macro avg       0.49      0.49      0.47       867
weighted avg       0.61      0.69      0.63       867



# Preparing Data for multi class classification

In [124]:
tweets_multi = tweets.copy()
tweets_multi = tweets_multi[tweets.sarcastic == 1]
tweets_multi.head()

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0


In [125]:
def convert_col_to_label(row):
    if row.irony == 1.0:
        return 'irony'
    if row.satire == 1.0:
        return 'satire';
    if row.understatement == 1.0:
        return 'understatement'
    if row.overstatement == 1.0:
        return 'overstatement'
    if row.rhetorical_question == 1.0:
        return 'rhetorical_question'
    return 'Other'

tweets_multi['label'] = tweets_multi.apply(lambda row: convert_col_to_label(row), axis=1)

In [126]:
tweets_multi = tweets_multi.drop(['sarcasm', 'sarcastic', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question'], axis=1)

In [127]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
tweets_multi['label'] = labelencoder.fit_transform(tweets_multi['label'])

# Apply Machine Learning Algorithm for Multi Class Classification Using BoW Row Count Ngram (Bigram & Trigram)

In [128]:
X = tweets_multi['tweet']
y = tweets_multi['label']

In [129]:
# Split Train & Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, train_size=0.75)

In [130]:
# BoW using Count Vertorizer using bigram
# ngram range from 2 to 3 (bigram & trigram)
cv = CountVectorizer(ngram_range=(2, 3), stop_words='english')

X_train_dtm = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)

In [131]:
# Applying Logistic Regression
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [132]:
# Applying SVM
model = SVC(kernel='poly', C=1.0)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [133]:
# Applying Random Forest Classifier
model = RandomForestClassifier(n_estimators = 10, max_depth=3, criterion = 'entropy')
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [134]:
# Applying Multinomial Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train_dtm.toarray(), y_train)
y_pred = model.predict(X_test_dtm.toarray())
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [135]:
# Applying Preceptron Classifier
model = Perceptron(tol=1e-3, random_state=10)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.50      0.10      0.17        39
           0       0.68      0.99      0.81       143
           3       0.50      0.04      0.08        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.67       217
   macro avg       0.28      0.19      0.18       217
weighted avg       0.59      0.67      0.57       217



# Apply Machine Learning Algorithm for Multi Class Classification Using BoW TFIDF Ngram (Bigram & Trigram)

In [136]:
# BoW using TFIDF using bigram
# ngram range from 2 to 3 (bigram & trigram)
tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', use_idf=True)

X_train_dtm = tfidf.fit_transform(X_train)
X_test_dtm = tfidf.transform(X_test)

In [137]:
# Applying Logistic Regression
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [138]:
# Applying SVM
model = SVC(kernel='poly', C=1.0)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [139]:
# Applying Random Forest Classifier
model = RandomForestClassifier(n_estimators = 10, max_depth=3, criterion = 'entropy')
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [140]:
# Applying Multinomial Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train_dtm.toarray(), y_train)
y_pred = model.predict(X_test_dtm.toarray())
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        39
           0       0.66      1.00      0.79       143
           3       0.00      0.00      0.00        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.66       217
   macro avg       0.11      0.17      0.13       217
weighted avg       0.43      0.66      0.52       217



In [141]:
# Applying Preceptron Classifier
model = Perceptron(tol=1e-3, random_state=10)
model.fit(X_train_dtm, y_train)
y_pred = model.predict(X_test_dtm)
print(classification_report(y_test, y_pred, zero_division=0, labels=tweets_multi.label.unique()))

              precision    recall  f1-score   support

           1       0.44      0.10      0.17        39
           0       0.69      0.98      0.81       143
           3       0.33      0.04      0.08        23
           5       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         8

    accuracy                           0.67       217
   macro avg       0.24      0.19      0.18       217
weighted avg       0.57      0.67      0.57       217

