In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [55]:
dataset_path = '/content/drive/MyDrive/iSarcasm/train.En.csv'
df = pd.read_csv(dataset_path)[["tweet", "sarcastic"]]
df = df[df['tweet'].notna()]
X, y = df[["tweet"]], df[["sarcastic"]]

In [56]:
X.head()

Unnamed: 0,tweet
0,The only thing I got from college is a caffein...
1,I love it when professors draw a big question ...
2,Remember the hundred emails from companies whe...
3,Today my pop-pop told me I was not “forced” to...
4,@VolphanCarol @littlewhitty @mysticalmanatee I...


In [57]:
y.head()

Unnamed: 0,sarcastic
0,1
1,1
2,1
3,1
4,1


In [58]:
y.value_counts()

sarcastic
0            2600
1             867
dtype: int64

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X['tweet'].values.astype('U'))
X_train_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=count_vect.get_feature_names(),columns=["idf_weights"]) 
df_idf.sort_values(by=['idf_weights'])



Unnamed: 0,idf_weights
the,1.853177
to,2.034416
and,2.156772
my,2.422993
of,2.492328
...,...
holby,8.458474
hogs,8.458474
hog,8.458474
homewreckermovie,8.458474


In [None]:
def report_acc_cv(clf, X, y, model_name, cv=10, scoring='accuracy'):

  acc = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
  accb = cross_val_score(clf, X, y, cv=cv, scoring='balanced_accuracy')
  f1 = cross_val_score(clf, X, y, cv=cv, scoring='f1')
  p = cross_val_score(clf, X, y, cv=cv, scoring='precision')
  r = cross_val_score(clf, X, y, cv=cv, scoring='recall')

  print(model_name," accuracy is: %.2f%% +- %.2f%%" %(np.mean(acc)*100,np.std(acc)*100))
  print(model_name," balanced accuracy is: %.2f%% +- %.2f%%" %(np.mean(accb)*100,np.std(accb)*100))
  print(model_name," f1-score is: %.2f%% +- %.2f%%" %(np.mean(f1)*100,np.std(f1)*100))
  print(model_name," precision is: %.2f%% +- %.2f%%" %(np.mean(p)*100,np.std(p)*100))
  print(model_name," recall is: %.2f%% +- %.2f%%" %(np.mean(r)*100,np.std(r)*100))

# Result with count vectorizer

In [None]:
X_train = X_train_counts
class_weight= {1: 3, 0: 1}

clf = SVC(C=10, kernel='rbf', class_weight=class_weight)
report_acc_cv(clf, X_train, y.values.ravel(), "svm")

svm  accuracy is: 73.41% +- 1.70%
svm  balanced accuracy is: 52.60% +- 2.21%
svm  f1-score is: 17.03% +- 5.43%
svm  precision is: 38.72% +- 11.72%
svm  recall is: 10.96% +- 3.58%


# Result with TF-IDF

In [None]:
X_train = X_train_tfidf
class_weight= {1: 3, 0: 1}

clf = SVC(C=10, kernel='rbf', class_weight=class_weight)
report_acc_cv(clf, X_train, y.values.ravel(), "svm")

svm  accuracy is: 75.17% +- 0.79%
svm  balanced accuracy is: 52.08% +- 1.25%
svm  f1-score is: 10.51% +- 4.12%
svm  precision is: 51.95% +- 15.43%
svm  recall is: 5.88% +- 2.39%


# Result with BERT (Word Tokenization Format)

In [None]:
!pip install bert-embedding
!pip install mxnet-cu100
!pip install sentence-transformers

Collecting numpy==1.14.6
  Using cached numpy-1.14.6-cp37-cp37m-manylinux1_x86_64.whl (13.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.4
    Uninstalling numpy-1.21.4:
      Successfully uninstalled numpy-1.21.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.14.6 which is incompatible.
xarray 0.18.2 requires numpy>=1.17, but you have numpy 1.14.6 which is incompatible.
transformers 4.12.5 requires numpy>=1.17, but you have numpy 1.14.6 which is incompatible.
tifffile 2021.11.2 requires numpy>=1.15.1, but you have numpy 1.14.6 which is incompatible.
spacy 2.2.4 requires numpy>=1.15.0, but you have numpy 1.14.6 which is incompatible.
seaborn 0.11.2 requires numpy>=1.15, but you have numpy 1.14.6 which is incompatib

Collecting numpy<2.0.0,>1.16.0
  Using cached numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.14.6
    Uninstalling numpy-1.14.6:
      Successfully uninstalled numpy-1.14.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.4 which is incompatible.
mxnet 1.4.0 requires numpy<1.15.0,>=1.8.2, but you have numpy 1.21.4 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
bert-embedding 1.0.1 requires numpy==1.14.6, but you have numpy 1.21.4 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully install



In [None]:
import re

def embeddToBERT(text):
    sentences = re.split('!|\?|\.',text)
    sentences = list(filter(None, sentences)) 

    if bert_version == 'WORD':
        result = bert(sentences, 'avg') # avg is refer to handle OOV
    
        bert_vocabs_of_sentence = []
        for sentence in range(len(result)):
            for word in range(len(result[sentence][1])):
                bert_vocabs_of_sentence.append(result[sentence][1][word])
        feature = [mean(x) for x in zip(*bert_vocabs_of_sentence)]

    elif bert_version == 'SENTENCE':
        result = bert_transformers.encode(sentences)
        feature = [mean(x) for x in zip(*result)]
  
    return feature

In [None]:
import mxnet as mx
from bert_embedding import BertEmbedding
from sentence_transformers import SentenceTransformer
import itertools

def mean(z):
    return sum(itertools.chain(z))/len(z)

bert_version = 'WORD'

ctx = mx.gpu(0)
bert = BertEmbedding(ctx=ctx)

bert_word_training_features = X['tweet'].apply(embeddToBERT)

In [None]:
feature = [x for x in bert_word_training_features.transpose()]
bert_word_training_features = np.asarray(feature)

In [None]:
class_weight= {1: 3, 0: 1}

clf = SVC(C=10, kernel='rbf', class_weight=class_weight)
report_acc_cv(clf, bert_word_training_features, y.values.ravel(), "svm")

svm  accuracy is: 71.42% +- 1.90%
svm  balanced accuracy is: 58.42% +- 2.29%
svm  f1-score is: 36.16% +- 4.01%
svm  precision is: 41.05% +- 4.63%
svm  recall is: 32.41% +- 3.97%


# Result with Word2Vec

In [95]:
from nltk.tokenize import sent_tokenize, word_tokenize 

def embeddToWord2Vec(text):
    words = word_tokenize(text)
    
    if embedding is 'WORD2VEC_WITH_STOP':
        result = [w2v_with_stop_model.wv[w] for w in words if w in w2v_with_stop_model.wv.vocab]
    else:
        result = [w2v_no_stop_model.wv[w] for w in words if w in w2v_no_stop_model.wv.vocab]
    
    feature = [mean(x) for x in zip(*result)]
    return feature

def wordTokenize(text):
  return word_tokenize(text)

In [96]:
embedding = 'WORD2VEC_WITH_STOP'

In [127]:
import gensim

words = X['tweet'].apply(wordTokenize)
w2v_with_stop_model = gensim.models.Word2Vec(words, min_count = 2, size = 100, window = 5) 

In [128]:
word2vec_with_stop_training_features = X['tweet'].apply(embeddToWord2Vec)

feature = []
deleted_indexes = []
i = 0
for x in word2vec_with_stop_training_features.transpose():
  if x != []:
    feature.append(x)
  else:
    deleted_indexes.append(i)
  i += 1
word2vec_with_stop_training_features = np.asarray(feature)

In [134]:
class_weight= {1: 3, 0: 1}

clf = SVC(C=10, kernel='rbf', class_weight=class_weight)
report_acc_cv(clf, word2vec_with_stop_training_features, y.drop(deleted_indexes).values.ravel(), "svm")

svm  accuracy is: 30.71% +- 2.63%
svm  balanced accuracy is: 50.58% +- 1.57%
svm  f1-score is: 39.41% +- 1.48%
svm  precision is: 25.23% +- 0.65%
svm  recall is: 90.33% +- 7.92%
