# Preprocessing Text to be used in models

## Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.stem.snowball import SnowballStemmer
import tqdm


## Read the csv and extract the comments and the labels

In [2]:
dir = "../input/sarcastic-comments-on-reddit/train-balanced-sarcasm.csv"
data = pd.read_csv(dir)
data.head(5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [3]:
comments = data['comment'].values
labels = data['label'].values

## Cleaning the text and removing links/ punctuation and so on..

In [4]:
text_cleaning = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stemmer = SnowballStemmer('english', ignore_stopwords=False)

In [5]:
def preprocess_data(text):
    text = re.sub(text_cleaning, ' ', str(text).lower()).strip()
    text = stemmer.stem(str(text))
    return text

X = []
for i in tqdm.tqdm(range(len(comments))):
    X.append(preprocess_data(comments[i]))

100%|██████████| 1010826/1010826 [00:53<00:00, 19053.89it/s]


## Tokenize the cleaned text and pad them accordingly

In [6]:
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X)

In [7]:
sequences = tokenizer.texts_to_sequences(X)
padded = pad_sequences(sequences, padding='post', maxlen=20)

## Split the data into train and test sets

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(np.array(padded), np.array(labels))

## Now the data is ready to be used as inputs in a neural network

In [9]:
print(f"Actual Sentence: {comments[860]}\nStemmed Sentence: {X[860]}\nTokenized: {sequences[860]}\nPadded: {padded[860]}")

Actual Sentence: This comment makes me so hungry
Stemmed Sentence: this comment makes me so hungri
Tokenized: [20, 387, 181, 71, 25, 9377]
Padded: [  20  387  181   71   25 9377    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
