In [None]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [15]:
# NOTE(review): absolute, machine-specific path - the notebook only runs on
# this workstation; consider a path relative to a configurable data directory.
filename = r'C:\Users\UBL-HO.DESKTOP-7ET3E40\Desktop\BAI-6A\IR\Sarcasm Detection - Feature selection\Sarcasm_Headlines_Dataset.json'

# The file is read as JSON Lines: every non-empty line is parsed as one JSON
# document. The encoding is pinned so the result does not depend on the
# platform's locale code page, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of making json.loads raise.
with open(filename, "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

In [16]:
# Build a DataFrame from the list of parsed JSON records and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


In [17]:
# The article URL is not used as a feature, so remove it.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`,
# re-running it after the column is already gone would otherwise raise KeyError.
df = df.drop(columns='article_link', errors='ignore')
df

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [18]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      26709 non-null  object
 1   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 417.5+ KB


In [19]:
# Count missing values per column.
df.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

In [20]:
# Number of rows per value of the target label (class balance check).
df['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [21]:
# Target labels, aligned row-for-row with the headlines.
y = df['is_sarcastic']

# Fit the TF-IDF vocabulary and IDF weights on the training headlines only, so
# that no statistics from held-out rows leak into the features.
# train_test_split picks rows purely from (number of rows, test_size,
# random_state), so using the same test_size=0.2 / random_state=42 as the split
# applied to X afterwards reproduces exactly the same training rows here.
# Keep these two values in sync with that later split.
train_headlines, _ = train_test_split(df['headline'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorizer.fit(train_headlines)

# Transform every row (train and test alike) with the train-fitted vectorizer;
# X keeps one row per headline so it can still be split together with y.
X = vectorizer.transform(df['headline'])

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
def select_k_best(X_train, X_test, y_train, k=500):
    """Reduce both splits to the ``k`` features ranked best by chi-squared.

    The selector is fitted on the training split only (features scored against
    ``y_train``); the test split is merely projected onto the chosen columns.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_train: Training labels used to score the features.
        k: Number of features to keep (default 500).

    Returns:
        A ``(train, test)`` tuple of the reduced feature matrices.
    """
    chi2_selector = SelectKBest(chi2, k=k)
    train_reduced = chi2_selector.fit_transform(X_train, y_train)
    return train_reduced, chi2_selector.transform(X_test)

In [24]:
def select_mutual_info(X_train, X_test, y_train, k=500):
    """Reduce both splits to the ``k`` features with the highest mutual information.

    Mutual information between each feature and ``y_train`` is estimated on the
    training split only; the test split is then restricted to the same columns.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_train: Training labels used to score the features.
        k: Number of features to keep (default 500).

    Returns:
        A ``(train, test)`` tuple of the reduced feature matrices.
    """
    mi_selector = SelectKBest(mutual_info_classif, k=k)
    train_reduced = mi_selector.fit_transform(X_train, y_train)
    test_reduced = mi_selector.transform(X_test)
    return train_reduced, test_reduced

In [25]:
def apply_pca(X_train, X_test, n_components=500, random_state=42):
    """Project both splits onto ``n_components`` latent dimensions via truncated SVD.

    Despite the name, this uses ``TruncatedSVD`` rather than PCA: the data is
    not mean-centred, which lets the decomposition work directly on sparse
    matrices such as TF-IDF features.

    The decomposition is fitted on the training split only and then applied to
    the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        n_components: Number of output dimensions (default 500).
        random_state: Seed forwarded to ``TruncatedSVD``. Its default solver is
            randomized, so a fixed seed is needed for reproducible components.
            Pass ``None`` to restore unseeded behaviour.

    Returns:
        A ``(train, test)`` tuple of the reduced feature matrices.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    X_train_pca = svd.fit_transform(X_train)
    X_test_pca = svd.transform(X_test)
    return X_train_pca, X_test_pca

In [26]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a linear-kernel SVM on the training split and print a test report.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: True labels for the test split.

    Returns:
        None. The classification report for the test predictions is printed.
    """
    svm = SVC(kernel='linear')
    svm.fit(X_train, y_train)
    report = classification_report(y_test, svm.predict(X_test))
    print(report)

In [27]:
# Experiment 1: chi-squared feature selection, then the SVM evaluation.
print("SelectKBest with chi-squared scoring:")
X_train_selected, X_test_selected = select_k_best(X_train, X_test, y_train)
train_and_evaluate(X_train_selected, X_test_selected, y_train, y_test)

SelectKBest with chi-squared scoring:
              precision    recall  f1-score   support

           0       0.86      0.73      0.79      2996
           1       0.71      0.85      0.78      2346

    accuracy                           0.78      5342
   macro avg       0.79      0.79      0.78      5342
weighted avg       0.80      0.78      0.78      5342



In [28]:
# Experiment 2: mutual-information feature selection, then the SVM evaluation.
# Note: this rebinds X_train_selected / X_test_selected from the previous cell.
print("SelectKBest with Mutual Information:")
X_train_selected, X_test_selected = select_mutual_info(X_train, X_test, y_train)
train_and_evaluate(X_train_selected, X_test_selected, y_train, y_test)



In [None]:
# Experiment 3: dimensionality reduction via apply_pca, then the SVM evaluation.
print("PCA:")
X_train_pca, X_test_pca = apply_pca(X_train, X_test)
train_and_evaluate(X_train_pca, X_test_pca, y_train, y_test)