In [1]:
import pandas as pd

In [54]:
df = pd.read_csv("./training_data_lowercase.csv", delimiter='\t', header=None, names=['label', 'text'])

In [49]:
df.head()

Unnamed: 0,label,text
0,0\tdonald trump sends out embarrassing new yea...,
1,0\tdrunk bragging trump staffer started russia...,
2,0\tsheriff david clarke becomes an internet jo...,
3,0\ttrump is so obsessed he even has obama‚s na...,
4,0\tpope francis just called out donald trump d...,


In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [56]:
X_train.head()

8891                                                    so
25115    final reckoning approaches for obama's high co...
26933    illinois budget talks fizzle amid partisan ent...
26971    clinton spokesman: ig report shows no clinton ...
11387    busted! nancy pelosi claims no meeting with ru...
Name: text, dtype: object

In [57]:
df['X_train'] = X_train

In [58]:
df.head()

Unnamed: 0,label,text,X_train
0,0,donald trump sends out embarrassing new year‚s...,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,


## Data preprocessing

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [59]:
# tokenization
from nltk.tokenize import word_tokenize

def tokanize_data(sentence):
    if isinstance(sentence, str):
        words = word_tokenize(sentence)
        return ' '.join(words)
    else:
        return ''



In [61]:
df['tokenized_words'] = df['X_train'].apply(tokanize_data)

In [62]:
df.head()

Unnamed: 0,label,text,X_train,tokenized_words
0,0,donald trump sends out embarrassing new year‚s...,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,


In [63]:
# remove punctuation
# remove special character
# remove numbers
# remove single character
# remove single character from start
# Substitute multiple spaces with a single space
# Remove prefixed 'b'
import string
import re

def clean_text(sentence):
    clean_text = sentence.translate(str.maketrans('','', string.punctuation))
    cleaned_text = re.sub(r'[^\w\s]', '', clean_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\b\w{1}\b', '', cleaned_text)
    cleaned_text = re.sub(r'^\s*\w{1}\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'^b\s*', '', cleaned_text)
    return clean_text

In [64]:
df['clean_text'] = df['tokenized_words'].apply(clean_text)

In [65]:
df.head()

Unnamed: 0,label,text,X_train,tokenized_words,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [67]:
# remove stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    cleaned_text = ' '.join(word for word in sentence.split() if word not in stop_words)
    return cleaned_text

In [69]:
df['stopwords_removed'] = df['clean_text'].apply(remove_stopwords)

In [70]:
df.head()

Unnamed: 0,label,text,X_train,tokenized_words,clean_text,stopwords_removed
0,0,donald trump sends out embarrassing new year‚s...,,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,trump obsessed even obama‚s name coded website...
4,0,pope francis just called out donald trump duri...,,,,
