In [None]:
# Importing all the necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
# Loading the dataset (JSON format) and reading it into a Pandas DataFrame incorporating the file path
df = pd.read_json('/content/Sarcasm_Headlines_Dataset.json', lines = True) # Dataset Link - https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection

In [None]:
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [None]:
# Removing the unnecessary column
df.drop(columns = ['article_link'], inplace=True)

In [None]:
df

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [None]:
# Checking whether the dataset is balanced
df['is_sarcastic'].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,13634


In [None]:
# Balacing the data using oversampling
df_majority = df[df['is_sarcastic'] == 0]
df_minority = df[df['is_sarcastic'] == 1]

In [None]:
df_minority_upsampled = resample(df_minority,
                                replace = True,
                                n_samples = len(df_majority),
                                random_state = 42)

In [None]:
df = pd.concat([df_majority, df_minority_upsampled])

In [None]:
df['is_sarcastic'].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,14985


In [None]:
# Replacing the characters that are not letters (a to z and A to Z) with a space
df['processed_headline1'] = df['headline'].str.replace(r'[^a-zA-Z]', ' ', regex = True)

In [None]:
# Replacing one or more spaces with a single space.
df['processed_headline2'] = df['processed_headline1'].str.replace(r'[\s]+', ' ', regex = True)

In [None]:
# Executing POS Tagging
def pos_tagging(text):
  tokens = word_tokenize(text)
  pos_tagged_tokens = pos_tag(tokens)
  pos_tagged_text = []
  for tag in pos_tagged_tokens:
    pos_tagged_text.append('/'.join(tag))
  return ' '.join(pos_tagged_text)

df['processed_headline3'] = df['processed_headline2'].apply(pos_tagging)

In [None]:
# Removing the Stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  tokens = word_tokenize(text)
  filtered_tokens = []
  for token in tokens:
    if token.lower() not in stop_words:
      filtered_tokens.append(token)
  return ' '.join(filtered_tokens)

In [None]:
df['processed_headline4'] = df['processed_headline2'].apply(remove_stopwords)

In [None]:
# Employing Lemmatization
def lemmatize(text):
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(lemmatized_tokens)

In [None]:
df['final_headline'] = df['processed_headline4'].apply(lemmatize)

In [None]:
# Applying Count Vectorization
count_vectorizer = CountVectorizer()
count_vectorized_text = count_vectorizer.fit_transform(df['final_headline'])

In [None]:
# Processing TFIDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized_text = tfidf_vectorizer.fit_transform(df['final_headline'])

In [None]:
# Performing Word2Vec Embeddings
tokenized_text = []
for text in df['final_headline']:
    tokenized_text.append(word_tokenize(text))

word2vec_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

def get_document_embedding(text):
    tokens = word_tokenize(text)
    embeddings = []
    for token in tokens:
        if token in word2vec_model.wv:
            embeddings.append(word2vec_model.wv[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

document_embeddings = []
for text in df['final_headline']:
    document_embeddings.append(get_document_embedding(text))

word2vec_features = np.array(document_embeddings)

In [None]:
# Integrating the functionality of Count Vectorization, TFIDF Vectorization, and Word2Vec Embeddings to achieve higher performance and accuracy using hstack
vectorized_text = hstack((count_vectorized_text, tfidf_vectorized_text, word2vec_features))

In [None]:
X = vectorized_text
y = df['is_sarcastic']

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Implementing a Linear SVC Model
linear_svc = LinearSVC()
svc_model = linear_svc.fit(X_train, y_train)

y_pred = svc_model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.851685018351685
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3013
           1       0.83      0.88      0.85      2981

    accuracy                           0.85      5994
   macro avg       0.85      0.85      0.85      5994
weighted avg       0.85      0.85      0.85      5994



In [None]:
# Implementing an SVC model with kernel set to 'rbf'
svc_rbf = SVC(kernel='rbf')

svc_rbf_model = svc_rbf.fit(X_train, y_train)
y_pred = svc_rbf_model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8478478478478478
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      3013
           1       0.87      0.82      0.84      2981

    accuracy                           0.85      5994
   macro avg       0.85      0.85      0.85      5994
weighted avg       0.85      0.85      0.85      5994



In [None]:
# Implementing a Deep Learning Model using Dense Layers
deep_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation = 'relu' ),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(2, activation = 'sigmoid')
])

In [None]:
# Deep Learning Model compilation
deep_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [None]:
deep_layer_model = deep_model.fit(X_train, y_train, epochs = 10, validation_split = 0.2, batch_size = 32)

Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 84ms/step - accuracy: 0.6871 - loss: 0.5573 - val_accuracy: 0.8144 - val_loss: 0.3983
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 82ms/step - accuracy: 0.9183 - loss: 0.2032 - val_accuracy: 0.8301 - val_loss: 0.4256
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 84ms/step - accuracy: 0.9618 - loss: 0.0972 - val_accuracy: 0.8347 - val_loss: 0.5329
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 81ms/step - accuracy: 0.9865 - loss: 0.0394 - val_accuracy: 0.8261 - val_loss: 0.6584
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 83ms/step - accuracy: 0.9940 - loss: 0.0198 - val_accuracy: 0.8249 - val_loss: 0.7483
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 81ms/step - accuracy: 0.9951 - loss: 0.0159 - val_accuracy: 0.8326 - val_loss: 0.9853
Epoch 7/10
[1m6

In [None]:
y_pred = deep_model.predict(X_test)

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step


In [None]:
print(accuracy_score(y_test, np.argmax((y_pred), axis = 1)))
print(classification_report(y_test, np.argmax((y_pred), axis = 1)))

0.8451785118451786
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      3013
           1       0.86      0.82      0.84      2981

    accuracy                           0.85      5994
   macro avg       0.85      0.85      0.85      5994
weighted avg       0.85      0.85      0.85      5994



In [None]:
# Installing and importing the necessary packages for making a prediction on unseen data
!pip install newspaper3k
!pip install lxml_html_clean

from newspaper import Article

url = 'https://theonion.com/world-death-rate-holding-steady-at-100-percent-1819564171/'

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [None]:
article = Article(url)

In [None]:
# Downloading, parsing the article, and performing NLP task
article.download()
article.parse()
article.nlp()

In [None]:
# Retrieving the headline from the news article
Test = article.title

In [None]:
# Preprocessing the News headline for prediction
def preprocess_text(text):
  count_vectorized_text = count_vectorizer.transform([text])
  tfidf_vectorized_text = tfidf_vectorizer.transform([text])
  document_embedding = get_document_embedding(text)
  word2vec_features = np.array([document_embedding])
  vectorized_text = hstack((count_vectorized_text, tfidf_vectorized_text, word2vec_features))
  return vectorized_text

Test_preprocessed = preprocess_text(Test)

y_pred = svc_model.predict(Test_preprocessed)

In [None]:
y_pred

array([1])