# NLP Sarcasm Detection - Preprocessing Notebook

### Project Prerequisites

In [3]:
import os
import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

### Preparing the data

In [None]:
# The file is read with lines=True (one JSON record per line); the
# article_link column is dropped as part of the same expression so the
# frame is never mutated in place.
df = pd.read_json("Data/Sarcasm_Headlines_Dataset_v2.json", lines=True).drop(
    columns=["article_link"]
)
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


##### Checking for NaN

In [None]:

# Count missing values per column.
df.isna().sum()

is_sarcastic    0
headline        0
dtype: int64

### Preprocessing

In [None]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built on first use only."""
    return frozenset(stopwords.words("english"))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance that is reused across calls."""
    return WordNetLemmatizer()


def preprocess(x):
    """Tokenize a headline, drop English stop words and lemmatize what is left.

    Parameters
    ----------
    x : str
        Raw headline text.

    Returns
    -------
    str
        The surviving tokens, each passed through ``WordNetLemmatizer.lemmatize``
        with its default arguments, joined by single spaces. Punctuation tokens
        produced by ``word_tokenize`` are kept; tokens keep their original case.
    """
    # The stop-word set and the lemmatizer do not depend on ``x``; they are
    # fetched from cached helpers instead of being rebuilt for every row.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    # Compare the lower-cased token against the stop-word set so that
    # capitalised stop words ("The", "Is") are filtered out as well.
    return " ".join(
        lemmatize(token)
        for token in word_tokenize(x)
        if token.lower() not in stop_words
    )


In [None]:
# Run every raw headline through preprocess() and store the result in a new
# column next to the original text.
df["processed_headline"] = df["headline"].map(preprocess)
df.head()

Unnamed: 0,is_sarcastic,headline,processed_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientist unveil doomsday cloc...
1,0,dem rep. totally nails why congress is falling...,dem rep. totally nail congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggie : 9 deliciously different recipe
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother come pretty close using word 'streaming...


In [None]:
def re_process(x):
    """Strip punctuation, normalise whitespace and lower-case a string.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with every character that is neither a word character nor
        whitespace replaced by a space, all whitespace runs collapsed to a
        single space, leading/trailing whitespace removed, and the result
        lower-cased.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", x)
    # Replacing punctuation with spaces can leave runs of whitespace and
    # padding at either end (e.g. "what?" -> "what "); collapse and trim them
    # so the cleaned text never carries stray spaces.
    collapsed = re.sub(r"\s+", " ", without_punctuation).strip()
    return collapsed.lower()

In [None]:
# Second pass over the already-lemmatized text: re_process() removes
# punctuation and lower-cases each entry, overwriting the column.
df["processed_headline"] = df["processed_headline"].map(re_process)
df.head()

Unnamed: 0,is_sarcastic,headline,processed_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientist unveil doomsday cloc...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nail congress falling short ge...
2,0,eat your veggies: 9 deliciously different recipes,eat veggie 9 deliciously different recipe
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother come pretty close using word streaming ...


In [None]:
# Spot-check one row (index label 7) to compare the raw and the cleaned text.
before_preprocessing = df.loc[7, "headline"]
after_preprocessing = df.loc[7, "processed_headline"]

print(
    f"Before Preprocessing: {before_preprocessing}",
    "-----------------------",
    f"After Preprocessing: {after_preprocessing}",
    sep="\n",
)

Before Preprocessing: richard branson's global-warming donation nearly as much as cost of failed balloon trips
-----------------------
After Preprocessing: richard branson s global warming donation nearly much cost failed balloon trip


### Persisting the Data

In [None]:
# Make sure the output directory exists so the write does not fail on a
# fresh checkout where Persisted_data/ has not been created yet.
os.makedirs("Persisted_data", exist_ok=True)

# NOTE(review): the file is written in CSV format (row index included) even
# though its name ends in .json. The name is kept as-is because other code may
# read this exact path -- confirm before renaming or changing the format.
df.to_csv("Persisted_data/processed_sdv2.json")
print("Done!")

Done!
