In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Kaggle Dataset - https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection

df = pd.read_json('/content/Sarcasm_Headlines_Dataset.json', lines = True)

In [None]:
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [None]:
df.drop(columns = ['article_link'], inplace=True)

In [None]:
df

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [None]:
df['is_sarcastic'].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,13634


In [None]:
df_majority = df[df['is_sarcastic'] == 0]
df_minority = df[df['is_sarcastic'] == 1]

In [None]:
df_minority_upsampled = resample(df_minority,
                                replace = True,
                                n_samples = len(df_majority),
                                random_state = 42)

In [None]:
df = pd.concat([df_majority, df_minority_upsampled])

In [None]:
df['is_sarcastic'].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,14985


In [None]:
df['processed_headline1'] = df['headline'].str.replace(r'[^a-zA-Z]', ' ', regex = True)

In [None]:
df['processed_headline2'] = df['processed_headline1'].str.replace(r'[\s]+', ' ', regex = True)

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  tokens = word_tokenize(text)
  filtered_tokens = []
  for token in tokens:
    if token.lower() not in stop_words:
      filtered_tokens.append(token)
  return ' '.join(filtered_tokens)

In [None]:
df['processed_headline3'] = df['processed_headline2'].apply(remove_stopwords)

In [None]:
def lemmatize(text):
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(lemmatized_tokens)

In [None]:
df['final_headline'] = df['processed_headline3'].apply(lemmatize)

In [None]:
count_vectorizer = CountVectorizer()
count_vectorized_text = count_vectorizer.fit_transform(df['final_headline'])

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized_text = tfidf_vectorizer.fit_transform(df['final_headline'])

In [None]:
vectorized_text = hstack((count_vectorized_text, tfidf_vectorized_text))

In [None]:
X = vectorized_text
y = df['is_sarcastic']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
linear_svc = LinearSVC()
svc_model = linear_svc.fit(X_train, y_train)

In [None]:
y_pred = svc_model.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8518518518518519
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3013
           1       0.84      0.87      0.85      2981

    accuracy                           0.85      5994
   macro avg       0.85      0.85      0.85      5994
weighted avg       0.85      0.85      0.85      5994



In [None]:
rf_clf = RandomForestClassifier()
rf_model = rf_clf.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8600266933600267
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      3013
           1       0.87      0.84      0.86      2981

    accuracy                           0.86      5994
   macro avg       0.86      0.86      0.86      5994
weighted avg       0.86      0.86      0.86      5994



In [None]:
svc_rbf = SVC(kernel='rbf')

svc_rbf_model = svc_rbf.fit(X_train, y_train)
y_pred = svc_rbf_model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8601935268601936
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      3013
           1       0.88      0.84      0.86      2981

    accuracy                           0.86      5994
   macro avg       0.86      0.86      0.86      5994
weighted avg       0.86      0.86      0.86      5994



In [None]:
deep_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation = 'relu' ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(2, activation = 'sigmoid')
])

In [None]:
deep_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
deep_layer_model = deep_model.fit(X_train, y_train, epochs = 10, validation_split = 0.2, batch_size = 32)

Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 67ms/step - accuracy: 0.6982 - loss: 0.5468 - val_accuracy: 0.8284 - val_loss: 0.3845
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 59ms/step - accuracy: 0.9381 - loss: 0.1615 - val_accuracy: 0.8432 - val_loss: 0.4158
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 59ms/step - accuracy: 0.9929 - loss: 0.0255 - val_accuracy: 0.8415 - val_loss: 0.7501
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 60ms/step - accuracy: 0.9993 - loss: 0.0025 - val_accuracy: 0.8472 - val_loss: 1.1346
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 57ms/step - accuracy: 0.9998 - loss: 4.3416e-04 - val_accuracy: 0.8376 - val_loss: 1.4065
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - accuracy: 0.9996 - loss: 0.0013 - val_accuracy: 0.8445 - val_loss: 1.3462
Epoch 7/10


In [None]:
y_pred = deep_model.predict(X_test)

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [None]:
print(accuracy_score(y_test, np.argmax((y_pred), axis = 1)))
print(classification_report(y_test, np.argmax((y_pred), axis = 1)))

0.8480146813480147
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      3013
           1       0.85      0.84      0.85      2981

    accuracy                           0.85      5994
   macro avg       0.85      0.85      0.85      5994
weighted avg       0.85      0.85      0.85      5994

