### Introduction

This section evaluates an SVM classifier for sarcasm detection using three different text-vectorization approaches, to find which one gives the best results.

TF-IDF, N-gram TF-IDF and Word2Vec were compared; TF-IDF produced the best results (0.78 accuracy, versus 0.67 for N-gram TF-IDF and 0.62 for Word2Vec).

This experiment was conducted on a complex dataset that combines different domains and writing styles, i.e. both informal and formal writing.

The dataset is labelled for sarcasm: 1 for sarcasm and 0 for no sarcasm.

#### Imported Libraries

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Downloading NLTK

In [None]:
# Fetch every NLTK resource the notebook relies on, in the original order.
for resource_name in ('punkt', 'wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(resource_name)

#### Dataset 
This is a CSV file that was carefully curated from different datasets that are publicly available on Kaggle:
1. Twitter dataset
2. News headlines 
3. Isarcasm V1 and V2 

In [6]:
# Location of the combined sarcasm dataset, relative to the notebook.
DATA_PATH = 'Generalized Sarcasm Data.csv'
df = pd.read_csv(DATA_PATH)

#### EDA

In [8]:
# Peek at 10 random rows. No random_state is passed, so a different
# sample is shown on every run.
df.sample(10)

Unnamed: 0,text,label
18651,alternative theater waits three hours for stra...,1
13201,rick perry returning to iowa,0
16951,5 missing after army helicopter downed near ha...,0
25073,ex-aide to gabrielle giffords faces recount in...,0
20370,new york introduces shoe-sharing program for c...,1
21434,land before time vi released straight to landfill,1
22435,"colleges suspend students for sexual assault, ...",0
5450,rosetta stone offers new spanish language cour...,1
19552,it's 2016. do you know where your bombs are fa...,0
21216,world's oldest woman just pleased every other ...,1


In [10]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(30177, 2)

In [12]:
# Number of missing values in each column.
df.isnull().sum()

text     1
label    0
dtype: int64

#### Data Cleaning

In [14]:
# Remove every row that has a missing value in any column (the isna() counts
# above show one such row, in `text`). Mutates df in place.
df.dropna(inplace=True)

In [22]:
# Count rows that exactly repeat an earlier row (all columns are compared).
df.duplicated().sum()

113

In [26]:
# Keep the first occurrence of each repeated row and discard the rest.
# Mutates df in place.
df.drop_duplicates(inplace=True)

In [28]:
# Class balance, expressed as a percentage of all remaining rows.
df['label'].value_counts(normalize=True).mul(100)

label
0    58.360776
1    41.639224
Name: proportion, dtype: float64

#### Data PreProcessing

In [30]:
# NLTK helpers shared by every preprocess_text call; built on first use.
_LEMMATIZER = None
_STOP_WORDS = None

def preprocess_text(text):
  """Normalise one raw text sample before vectorisation.

  The text is lower-cased, every character that is not an ASCII letter or
  whitespace is removed, the remainder is tokenised with NLTK, English stop
  words are dropped and the surviving tokens are lemmatised.

  Args:
    text: Raw text. Any value that is not a ``str`` (e.g. None or NaN) is
      treated as an empty document.

  Returns:
    The cleaned tokens joined by single spaces, or ``''`` when ``text`` is
    not a string.
  """
  global _LEMMATIZER, _STOP_WORDS
  if not isinstance(text, str):
    # Return an empty document instead of falling through to an implicit
    # None, which would leave missing values in the processed column.
    return ''
  if _LEMMATIZER is None:
    _LEMMATIZER = WordNetLemmatizer()
  if _STOP_WORDS is None:
    # Built once and reused: re-reading the stop-word list for every row
    # is wasted work.
    _STOP_WORDS = frozenset(stopwords.words('english'))
  text = text.lower()
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  words = word_tokenize(text)
  words = [_LEMMATIZER.lemmatize(word) for word in words if word not in _STOP_WORDS]
  return ' '.join(words)

# Clean every text sample and store the result alongside the raw text.
df['processed_text'] = df['text'].map(preprocess_text)

#### Data Split

In [32]:
# Features are the cleaned text; the target is the sarcasm label.
X, y = df['processed_text'], df['label']

In [34]:
# 80/20 hold-out split; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed, so the class proportions of the two
# splits are left to chance — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# TF-IDF features. stop_words='english' applies scikit-learn's built-in
# English stop-word list on top of the NLTK stop-word removal already done
# in preprocess_text.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and reuse them unchanged on the test split, so no test data is used in fitting.
X_test_tfidf = vectorizer.transform(X_test)

#### Fitting with the Best Parameters Found by Grid Search

#### TFIDF

In [38]:
# Best parameters reported by the grid search.
best_params = {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
svm = SVC(**best_params)
svm.fit(X_train_tfidf, y_train)

In [39]:
# Score the TF-IDF model on the held-out split.
y_pred = svm.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test, y_pred)
tfidf_report = classification_report(y_test, y_pred)

print(f"Accuracy: {tfidf_accuracy}")
print(tfidf_report)

Accuracy: 0.780808248794279
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      3490
           1       0.77      0.68      0.72      2523

    accuracy                           0.78      6013
   macro avg       0.78      0.77      0.77      6013
weighted avg       0.78      0.78      0.78      6013



#### Word2Vec

In [44]:
# Whitespace-tokenise the training sentences only, so the embeddings never
# see the test split.
tokenized_sentences = list(map(str.split, X_train))
# NOTE(review): training uses 4 worker threads; multi-threaded runs may not
# be exactly reproducible — confirm whether exact reruns matter here.
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [46]:
def get_avg_word_vector(sentence, model, vector_size=None):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: String of whitespace-separated tokens.
        model: Trained embedding model that exposes its vectors via ``model.wv``.
        vector_size: Length of the zero vector returned when none of the
            tokens is in the vocabulary. When omitted, the dimensionality
            reported by ``model.wv`` is used (falling back to 100 if it is
            not exposed), so the fallback always has the same length as the
            real embeddings.

    Returns:
        A 1-D numpy array: the element-wise mean of the known word vectors,
        or all zeros when the sentence has no known token.
    """
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    if vector_size is None:
        # A hard-coded 100 would give ragged feature rows for any model
        # trained with a different dimensionality.
        vector_size = getattr(keyed_vectors, 'vector_size', 100)
    return np.zeros(vector_size)

# Turn each split into a matrix with one averaged sentence embedding per row.
X_train_w2v, X_test_w2v = (
    np.array([get_avg_word_vector(sent, w2v_model) for sent in split])
    for split in (X_train, X_test)
)

In [62]:
# Same SVM settings as the TF-IDF run, trained on averaged Word2Vec embeddings.
svm_w2v = SVC(C=10, kernel='rbf', gamma='scale').fit(X_train_w2v, y_train)

# Evaluate on the held-out split.
y_pred_w2v = svm_w2v.predict(X_test_w2v)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
classification_w2v = classification_report(y_test, y_pred_w2v)
print(f"Word2Vec Accuracy: {accuracy_w2v:.4f}", classification_w2v, sep="\n")

Word2Vec Accuracy: 0.6180
              precision    recall  f1-score   support

           0       0.61      0.98      0.75      3490
           1       0.83      0.11      0.20      2523

    accuracy                           0.62      6013
   macro avg       0.72      0.55      0.47      6013
weighted avg       0.70      0.62      0.52      6013



#### NGram

In [52]:
# TF-IDF over bigrams and trigrams only: ngram_range starts at 2, so single
# words are not used as features in this variant.
ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 3))
# Fit on the training split only, then reuse the fitted vocabulary on the test split.
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)

In [64]:
# Same SVM settings as the TF-IDF run, trained on the n-gram TF-IDF features.
svm_ngram = SVC(C=10, kernel='rbf', gamma='scale').fit(X_train_ngram, y_train)

# Evaluate on the held-out split.
y_pred_ngram = svm_ngram.predict(X_test_ngram)
accuracy_ngram = accuracy_score(y_test, y_pred_ngram)
classification_ngram = classification_report(y_test, y_pred_ngram)
print(f"N-gram TF-IDF Accuracy: {accuracy_ngram:.4f}", classification_ngram, sep="\n")

N-gram TF-IDF Accuracy: 0.6715
              precision    recall  f1-score   support

           0       0.65      0.94      0.77      3490
           1       0.78      0.31      0.44      2523

    accuracy                           0.67      6013
   macro avg       0.71      0.62      0.60      6013
weighted avg       0.70      0.67      0.63      6013

