In [146]:
import pandas as pd

In [147]:
df = pd.read_csv("./training_data_lowercase.csv", delimiter='\t', header=None, names=['label', 'text'])

In [148]:
df.head()

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [149]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [150]:
X_train.head()

8891                                                    so
25115    final reckoning approaches for obama's high co...
26933    illinois budget talks fizzle amid partisan ent...
26971    clinton spokesman: ig report shows no clinton ...
11387    busted! nancy pelosi claims no meeting with ru...
Name: text, dtype: object

In [151]:
df['X_train'] = X_train
df['y_train'] = y_train

In [152]:
df.head()

Unnamed: 0,label,text,X_train,y_train
0,0,donald trump sends out embarrassing new year‚s...,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0
4,0,pope francis just called out donald trump duri...,,


## Data preprocessing

In [153]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [154]:
# tokenization
from nltk.tokenize import word_tokenize

def tokanize_data(sentence):
    if isinstance(sentence, str):
        words = word_tokenize(sentence)
        return ' '.join(words)
    else:
        return ''



In [155]:
df['clean_text'] = df['X_train'].apply(tokanize_data)

In [156]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [157]:
# remove punctuation
# remove special character
# remove numbers
# remove single character
# remove single character from start
# Substitute multiple spaces with a single space
# Remove prefixed 'b'
import string
import re

def clean_text(sentence):
    clean_text = sentence.translate(str.maketrans('','', string.punctuation))
    cleaned_text = re.sub(r'[^\w\s]', '', clean_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\b\w{1}\b', '', cleaned_text)
    cleaned_text = re.sub(r'^\s*\w{1}\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'^b\s*', '', cleaned_text)
    return clean_text

In [158]:
df['clean_text'] = df['clean_text'].apply(clean_text)

In [159]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [160]:
# remove stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [161]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    cleaned_text = ' '.join(word for word in words if word not in stop_words)
    return cleaned_text

In [162]:
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

In [163]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsessed even obama‚s name coded website...
4,0,pope francis just called out donald trump duri...,,,


In [164]:
# lemmatization
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ankita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [165]:
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(sentence):
    words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

In [166]:
df['clean_text'] = df['clean_text'].apply(apply_lemmatization)

In [167]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drink brag trump staffer start russian collusi...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke become internet joke thre...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsess even obama‚s name cod website image
4,0,pope francis just called out donald trump duri...,,,


## Feature extraction

In [172]:
# using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the original DataFrame
df_combined = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)

# Display the combined DataFrame
print(df_combined.head())

# for doc, tf_idf_doc in zip(df['lemmatized_text'], tfidf_matrix.todense()):
#     print("DOC:", doc)
#     print(np.around(tf_idf_doc,5))
#     print()

   label                                               text  \
0      0  donald trump sends out embarrassing new year‚s...   
1      0  drunk bragging trump staffer started russian c...   
2      0  sheriff david clarke becomes an internet joke ...   
3      0  trump is so obsessed he even has obama‚s name ...   
4      0  pope francis just called out donald trump duri...   

                                             X_train  y_train  \
0                                                NaN      NaN   
1  drunk bragging trump staffer started russian c...      0.0   
2  sheriff david clarke becomes an internet joke ...      0.0   
3  trump is so obsessed he even has obama‚s name ...      0.0   
4                                                NaN      NaN   

                                          clean_text   00  0149   02  025  \
0                                                     0.0   0.0  0.0  0.0   
1  drink brag trump staffer start russian collusi...  0.0   0.0  0.0  0.0   