In [424]:
import pandas as pd

In [425]:
df = pd.read_csv("./training_data_lowercase.csv", delimiter='\t', header=None, names=['label', 'text'])

In [426]:
df.head()

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [427]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [428]:
print(X_test.head())
print()
print(y_test.head())

18499     release of house tax bill delayed until thursday
24785    confusion over trump's first talks with foreig...
27282    obama says chinese-led trade deal shows need f...
6151            former pastor turned teacher beat students
16785    breaking: south carolina senate caves: votes t...
Name: text, dtype: object

18499    1
24785    1
27282    1
6151     0
16785    0
Name: label, dtype: int64


In [429]:
X_train.head()

8891                                                    so
25115    final reckoning approaches for obama's high co...
26933    illinois budget talks fizzle amid partisan ent...
26971    clinton spokesman: ig report shows no clinton ...
11387    busted! nancy pelosi claims no meeting with ru...
Name: text, dtype: object

In [430]:
df['X_train'] = X_train
df['y_train'] = y_train

test_df = df.copy()
test_df['X_test'] = X_test
test_df['y_test'] = y_test

In [431]:
test_df.head()

Unnamed: 0,label,text,X_train,y_train,X_test,y_test
0,0,donald trump sends out embarrassing new year‚s...,,,donald trump sends out embarrassing new year‚s...,0.0
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,,
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,,
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,,
4,0,pope francis just called out donald trump duri...,,,pope francis just called out donald trump duri...,0.0


In [432]:
df.head()

Unnamed: 0,label,text,X_train,y_train
0,0,donald trump sends out embarrassing new year‚s...,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0
4,0,pope francis just called out donald trump duri...,,


In [433]:
test_df = test_df.drop(columns=['label', 'text', 'X_train', 'y_train'])

In [434]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
1,,
2,,
3,,
4,pope francis just called out donald trump duri...,0.0


In [435]:
test_df = test_df.dropna(subset=['X_test', 'y_test'])

In [436]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
4,pope francis just called out donald trump duri...,0.0
6,fresh off the golf course,0.0
7,trump said some insanely racist stuff inside t...,0.0
8,former cia director slams trump over un bullying,0.0


## Data preprocessing

In [437]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [438]:
# tokenization
from nltk.tokenize import word_tokenize

def tokanize_data(sentence):
    if isinstance(sentence, str):
        words = word_tokenize(sentence)
        return ' '.join(words)
    else:
        return ''



In [439]:
df['clean_text'] = df['X_train'].apply(tokanize_data)

In [440]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [441]:
# remove punctuation
# remove special character
# remove numbers
# remove single character
# remove single character from start
# Substitute multiple spaces with a single space
# Remove prefixed 'b'
import string
import re

def clean_text(sentence):
    clean_text = sentence.translate(str.maketrans('','', string.punctuation))
    cleaned_text = re.sub(r'[^\w\s]', '', clean_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\b\w{1}\b', '', cleaned_text)
    cleaned_text = re.sub(r'^\s*\w{1}\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'^b\s*', '', cleaned_text)
    return clean_text

In [442]:
df['clean_text'] = df['clean_text'].apply(clean_text)

In [443]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [444]:
# remove stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [445]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    cleaned_text = ' '.join(word for word in words if word not in stop_words)
    return cleaned_text

In [446]:
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

In [447]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsessed even obama‚s name coded website...
4,0,pope francis just called out donald trump duri...,,,


In [448]:
# lemmatization
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ankita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [449]:
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(sentence):
    words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

In [450]:
df['clean_text'] = df['clean_text'].apply(apply_lemmatization)

In [451]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drink brag trump staffer start russian collusi...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke become internet joke thre...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsess even obama‚s name cod website image
4,0,pope francis just called out donald trump duri...,,,


## Feature extraction

In [452]:
print("Number of NaN values in y:", df['y_train'].isna().sum())

Number of NaN values in y: 6831


In [453]:
df = df.dropna(subset=['y_train'])

In [454]:
# using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [455]:
tfidf_df.head()

Unnamed: 0,00,0149,02,025,0330,0930,10,100,1000,10000,...,zuckerberg,zulia,zuma,zummar,zurich,éblacklivesmatter,îfor,îing,îpence,øqu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model training


In [456]:
# using logistic regression
from sklearn.linear_model import LogisticRegression

In [457]:
X = tfidf_df
y = df['y_train']

print(X.head())
print()
print(y.head())

    00  0149   02  025  0330  0930   10  100  1000  10000  ...  zuckerberg  \
0  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
1  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
2  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
3  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
4  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   

   zulia  zuma  zummar  zurich  éblacklivesmatter  îfor  îing  îpence  øqu  
0    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
1    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
2    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
3    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
4    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  

[5 rows x 15626 columns]

1    0.0
2    0.0
3    0.0
5    0.0
9    0

In [458]:
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [459]:
model = LogisticRegression(max_iter=1000)
model.fit(data_train,label_train)

In [460]:
y_pred = model.predict(data_test)

In [461]:
from sklearn.metrics import accuracy_score, classification_report

In [462]:
# Evaluate the model
print("Accuracy:", accuracy_score(label_test, y_pred))
print("Classification Report:\n", classification_report(label_test, y_pred))

Accuracy: 0.937419945105215
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      2821
         1.0       0.93      0.94      0.94      2644

    accuracy                           0.94      5465
   macro avg       0.94      0.94      0.94      5465
weighted avg       0.94      0.94      0.94      5465



In [463]:
# now predict X_test and y_test
# X_test y_test

In [464]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
4,pope francis just called out donald trump duri...,0.0
6,fresh off the golf course,0.0
7,trump said some insanely racist stuff inside t...,0.0
8,former cia director slams trump over un bullying,0.0


In [465]:
test_df['clean_text'] = test_df['X_test'].apply(tokanize_data)

In [466]:
test_df['clean_text'] = test_df['clean_text'].apply(clean_text)

In [467]:
test_df['clean_text'] = test_df['clean_text'].apply(remove_stopwords)

In [468]:
test_df['clean_text'] = test_df['clean_text'].apply(apply_lemmatization)

In [469]:
test_df.head()

Unnamed: 0,X_test,y_test,clean_text
0,donald trump sends out embarrassing new year‚s...,0.0,donald trump send embarrass new year‚s eve mes...
4,pope francis just called out donald trump duri...,0.0,pope francis call donald trump christmas speech
6,fresh off the golf course,0.0,fresh golf course
7,trump said some insanely racist stuff inside t...,0.0,trump say insanely racist stuff inside oval of...
8,former cia director slams trump over un bullying,0.0,former cia director slam trump un bully


In [470]:
tfidf_new_matrix = tfidf_vectorizer.transform(test_df['clean_text'])

In [471]:
tfidf_new_df = pd.DataFrame(tfidf_new_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [472]:
y_pred_new = model.predict(tfidf_new_df)

In [474]:
# Evaluate the model
print("Accuracy:", accuracy_score(test_df['y_test'], y_pred_new))
print("Classification Report:\n", classification_report(test_df['y_test'], y_pred_new))

Accuracy: 0.9329527155614112
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.93      0.93      3529
         1.0       0.92      0.94      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831

