# Reading Data

In [None]:
import pandas as pd

DATA_PATH = '/content/train.En.csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed. The first CSV column is used as the index.
tweets_raw = pd.read_csv(DATA_PATH, index_col=[0])

# Drop a row only when the text or the label is missing. A blanket dropna()
# also discards every row with a NaN in any auxiliary annotation column
# (rephrase, sarcasm, irony, ...).
# NOTE(review): those columns appear to be filled in only for sarcastic
# tweets, so a blanket dropna() would leave a single class in `sarcastic` --
# confirm with tweets_raw.isna().sum() and df['sarcastic'].value_counts().
df = tweets_raw.dropna(subset=['tweet', 'sarcastic'])
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


# Tokenizing

In [None]:
import nltk
# Fetch the 'punkt' resource into the local NLTK data directory; it is the
# tokenizer data that nltk.word_tokenize (used in the next cell) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk import word_tokenize

# Pull the tweet texts out of the frame as an array and tokenize only the
# first one, as a small worked example for the following cells.
sentences = df['tweet'].values
first_tweet = sentences[0]
tokens = word_tokenize(first_tweet)
tokens

['The',
 'only',
 'thing',
 'I',
 'got',
 'from',
 'college',
 'is',
 'a',
 'caffeine',
 'addiction']

# PoS Tagging

In [None]:
# Fetch the 'averaged_perceptron_tagger' resource, the tagger model that
# nltk.pos_tag (called in the next cell) loads.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk import pos_tag

# Tag every token of the first tweet with its part of speech; the result is a
# list of (token, tag) pairs.
pos_tag(tokens)

[('The', 'DT'),
 ('only', 'JJ'),
 ('thing', 'NN'),
 ('I', 'PRP'),
 ('got', 'VBD'),
 ('from', 'IN'),
 ('college', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('caffeine', 'JJ'),
 ('addiction', 'NN')]

# Lemmatization & Stemming

In [None]:
# Fetch the WordNet corpus, which WordNetLemmatizer (used below) looks words
# up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk import WordNetLemmatizer

def lemmatize(word:str, tag=None) -> str:
  """Return the WordNet lemma of ``word``.

  Args:
    word: The token to lemmatize.
    tag: Optional part-of-speech tag forwarded to the lemmatizer as its
      second argument (this notebook passes 'a'). Any falsy value (None or
      an empty string) means the lemmatizer is called with the word alone
      and applies its own default.

  Returns:
    The lemma produced by ``WordNetLemmatizer.lemmatize``.
  """
  wordnet = WordNetLemmatizer()
  if not tag:
    return wordnet.lemmatize(word)
  return wordnet.lemmatize(word, tag)


In [None]:
from nltk import PorterStemmer

def stem(word:str) -> str:
  """Return the Porter stem of ``word``.

  A fresh ``PorterStemmer`` is created on every call.

  Args:
    word: The token to stem.

  Returns:
    The stem produced by ``PorterStemmer.stem``.
  """
  return PorterStemmer().stem(word)

In [None]:
# Example word and PoS tag shared by the stemming and lemmatization demos below.
word, tag = 'worse', 'a'

# Stemming needs no PoS tag.
f'Stem: {stem(word)}'

'Stem: wors'

In [None]:
# Lemmatize with the explicit PoS tag so the comparative 'worse' can be mapped
# to its base form. (A stray lemmatize(word) call whose result was discarded
# and never displayed has been removed from this cell.)
f'Lemma with PoS tag: {lemmatize(word, tag)}'

'Lemma with PoS tag: bad'

In [None]:
# Without a PoS tag the lemmatizer falls back to its own default part of
# speech, so 'worse' comes back unchanged -- contrast with the tagged call.
f'Lemma without PoS tag: {lemmatize(word)}'

'Lemma without PoS tag: worse'

# Sarcasm Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible

# X holds the raw tweet texts and y the binary sarcasm labels. X is left as
# raw text; the vectorized features live in X_train / X_test.
X = df['tweet'].values
y = df['sarcastic']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training tweets only. Fitting the vectorizer on the full corpus
# before splitting leaks information from the test set into the features.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

# stop_words must be the string 'english' to select scikit-learn's built-in
# stop-word list. A set such as {'english'} is treated as a custom list whose
# only entry is the word "english".
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1,2), stop_words='english')
X_train = tfidf.fit_transform(tweets_train)
X_test = tfidf.transform(tweets_test)

rdf = RandomForestClassifier(n_estimators=2500, random_state=RANDOM_STATE)
rdf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Each row of X_train is the TF-IDF vector of one tweet, so the second
# dimension is the vectorizer's vocabulary size (capped by max_features).
f'Shape of embeddings {X_train[0].shape}'

'Shape of embeddings (1, 1500)'

In [None]:
# Predict labels for the held-out split and report plain accuracy.
# NOTE(review): accuracy alone can look perfect when y_test contains a single
# class -- check y_test.value_counts() before trusting the number.
y_pred = rdf.predict(X_test)
accuracy_score(y_test, y_pred)

1.0