In [244]:
import pandas as pd

In [320]:
df = pd.read_csv("./training_data_lowercase.csv", delimiter='\t', header=None, names=['label', 'text'])

In [321]:
df.head()

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [322]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [323]:
print(X_test.head())
print()
print(y_test.head())

18499     release of house tax bill delayed until thursday
24785    confusion over trump's first talks with foreig...
27282    obama says chinese-led trade deal shows need f...
6151            former pastor turned teacher beat students
16785    breaking: south carolina senate caves: votes t...
Name: text, dtype: object

18499    1
24785    1
27282    1
6151     0
16785    0
Name: label, dtype: int64


In [324]:
X_train.head()

8891                                                    so
25115    final reckoning approaches for obama's high co...
26933    illinois budget talks fizzle amid partisan ent...
26971    clinton spokesman: ig report shows no clinton ...
11387    busted! nancy pelosi claims no meeting with ru...
Name: text, dtype: object

In [325]:
df['X_train'] = X_train
df['y_train'] = y_train

test_df = df.copy()
test_df['X_test'] = X_test
test_df['y_test'] = y_test

In [326]:
test_df.head()

Unnamed: 0,label,text,X_train,y_train,X_test,y_test
0,0,donald trump sends out embarrassing new year‚s...,,,donald trump sends out embarrassing new year‚s...,0.0
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,,
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,,
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,,
4,0,pope francis just called out donald trump duri...,,,pope francis just called out donald trump duri...,0.0


In [327]:
df.head()

Unnamed: 0,label,text,X_train,y_train
0,0,donald trump sends out embarrassing new year‚s...,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0
4,0,pope francis just called out donald trump duri...,,


In [328]:
test_df = test_df.drop(columns=['label', 'text', 'X_train', 'y_train'])

In [329]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
1,,
2,,
3,,
4,pope francis just called out donald trump duri...,0.0


In [330]:
test_df = test_df.dropna(subset=['X_test', 'y_test'])

In [331]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
4,pope francis just called out donald trump duri...,0.0
6,fresh off the golf course,0.0
7,trump said some insanely racist stuff inside t...,0.0
8,former cia director slams trump over un bullying,0.0


## Data preprocessing

In [251]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [332]:
# tokenization
from nltk.tokenize import word_tokenize

def tokanize_data(sentence):
    if isinstance(sentence, str):
        words = word_tokenize(sentence)
        print(' '.join(words))
        return ' '.join(words)
    else:
        return ''



In [333]:
df['clean_text'] = df['X_train'].apply(tokanize_data)

drunk bragging trump staffer started russian collusion investigation
sheriff david clarke becomes an internet joke for threatening to poke people ‚in the eye‚
trump is so obsessed he even has obama‚s name coded into his website ( images )
racist alabama cops brutalize black boy while he is in handcuffs ( graphic images )
brand-new pro-trump ad features so much a * * kissing it will make you sick
papa john‚s founder retires
paul ryan just told us he doesn‚t care about struggling families living in blue states
bad news for trump ‚î mitch mcconnell says no to repealing obamacare in 2018
lindsey graham trashes media for portraying trump as ‚kooky‚ forgets his own words ''
heiress to disney empire knows gop scammed us ‚ shreds them for tax bill
tone deaf trump : congrats rep. scalise on losing weight after you almost died
the internet brutally mocks disney‚s new trump robot at hall of presidents
snl hilariously mocks accused child molester roy moore for losing al senate race ( video )
repub

In [334]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [335]:
# remove punctuation
# remove special character
# remove numbers
# remove single character
# remove single character from start
# Substitute multiple spaces with a single space
# Remove prefixed 'b'
import string
import re

def clean_text(sentence):
    clean_text = sentence.translate(str.maketrans('','', string.punctuation))
    cleaned_text = re.sub(r'[^\w\s]', '', clean_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\b\w{1}\b', '', cleaned_text)
    cleaned_text = re.sub(r'^\s*\w{1}\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'^b\s*', '', cleaned_text)
    return clean_text

In [336]:
df['clean_text'] = df['clean_text'].apply(clean_text)

In [337]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [338]:
# remove stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [339]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    cleaned_text = ' '.join(word for word in words if word not in stop_words)
    return cleaned_text

In [340]:
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

In [341]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsessed even obama‚s name coded website...
4,0,pope francis just called out donald trump duri...,,,


In [342]:
# lemmatization
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ankita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [343]:
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(sentence):
    words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

In [344]:
df['clean_text'] = df['clean_text'].apply(apply_lemmatization)

In [345]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drink brag trump staffer start russian collusi...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke become internet joke thre...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsess even obama‚s name cod website image
4,0,pope francis just called out donald trump duri...,,,


## Feature extraction

In [346]:
print("Number of NaN values in y:", df['y_train'].isna().sum())

Number of NaN values in y: 6831


In [347]:
df = df.dropna(subset=['y_train'])

In [348]:
# using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# tfidf_vectorizer.transform() TODO
# Concatenate the TF-IDF DataFrame with the original DataFrame
#df_combined = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)

# Display the combined DataFrame
#print(df_combined.head())

# for doc, tf_idf_doc in zip(df['lemmatized_text'], tfidf_matrix.todense()):
#     print("DOC:", doc)
#     print(np.around(tf_idf_doc,5))
#     print()

In [349]:
tfidf_df.head()

Unnamed: 0,00,0149,02,025,0330,0930,10,100,1000,10000,...,zuckerberg,zulia,zuma,zummar,zurich,éblacklivesmatter,îfor,îing,îpence,øqu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model training


In [350]:
# using logistic regression
from sklearn.linear_model import LogisticRegression

In [351]:
X = tfidf_df
y = df['y_train']

print(X.head())
print()
print(y.head())

    00  0149   02  025  0330  0930   10  100  1000  10000  ...  zuckerberg  \
0  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
1  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
2  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
3  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
4  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   

   zulia  zuma  zummar  zurich  éblacklivesmatter  îfor  îing  îpence  øqu  
0    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
1    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
2    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
3    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
4    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  

[5 rows x 15626 columns]

1    0.0
2    0.0
3    0.0
5    0.0
9    0

In [352]:
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [353]:
model = LogisticRegression(max_iter=1000)
model.fit(data_train,label_train)

In [354]:
y_pred = model.predict(data_test)

In [355]:
from sklearn.metrics import accuracy_score, classification_report

In [356]:
# Evaluate the model
print("Accuracy:", accuracy_score(label_test, y_pred))
print("Classification Report:\n", classification_report(label_test, y_pred))

Accuracy: 0.937419945105215
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      2821
         1.0       0.93      0.94      0.94      2644

    accuracy                           0.94      5465
   macro avg       0.94      0.94      0.94      5465
weighted avg       0.94      0.94      0.94      5465



In [None]:
# now predict X_test and y_test
# X_test y_test

In [358]:
test_df.head()

Unnamed: 0,X_test,y_test
0,donald trump sends out embarrassing new year‚s...,0.0
4,pope francis just called out donald trump duri...,0.0
6,fresh off the golf course,0.0
7,trump said some insanely racist stuff inside t...,0.0
8,former cia director slams trump over un bullying,0.0


In [359]:
test_df['clean_text'] = test_df['X_test'].apply(tokanize_data)

donald trump sends out embarrassing new year‚s eve message ; this is disturbing
pope francis just called out donald trump during his christmas speech
fresh off the golf course
trump said some insanely racist stuff inside the oval office
former cia director slams trump over un bullying
mueller spokesman just f-cked up donald trump‚s christmas
cnn calls it : a democrat will represent alabama in the senate for the first time in 25 years
trump only cares about trump ; why he is recognizing jerusalem today
trump supporting coal ceo upset trump is wiping out ‚thousands‚ of coal mining jobs
breaking : michael flynn cracks ‚ will testify to mueller against trump himself
americans once elected a president after he was accused of raping a 13-year-old girl
breitbart editor to cnn host : a song by ringo starr proves roy moore isn‚t a pedo ( video )
ivanka defends malia obama from attacks
fox news bans gene simmons for life for harassing staff off camera
donald trump is destroying the country‚s rep

In [360]:
test_df['clean_text'] = test_df['clean_text'].apply(clean_text)

In [361]:
test_df['clean_text'] = test_df['clean_text'].apply(remove_stopwords)

In [362]:
test_df['clean_text'] = test_df['clean_text'].apply(apply_lemmatization)

In [363]:
test_df.head()

Unnamed: 0,X_test,y_test,clean_text
0,donald trump sends out embarrassing new year‚s...,0.0,donald trump send embarrass new year‚s eve mes...
4,pope francis just called out donald trump duri...,0.0,pope francis call donald trump christmas speech
6,fresh off the golf course,0.0,fresh golf course
7,trump said some insanely racist stuff inside t...,0.0,trump say insanely racist stuff inside oval of...
8,former cia director slams trump over un bullying,0.0,former cia director slam trump un bully


In [364]:
tfidf_new_matrix = tfidf_vectorizer.transform(test_df['clean_text'])

In [368]:
tfidf_new_df = pd.DataFrame(tfidf_new_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [369]:
y_pred_new = model.predict(tfidf_new_df)

In [370]:
# Display predictions
print(y_pred_new)

[0. 0. 1. ... 1. 1. 1.]


In [371]:
# Evaluate the model
print("Accuracy:", accuracy_score(test_df['y_test'], y_pred_new))
print("Classification Report:\n", classification_report(test_df['y_test'], y_pred_new))

Accuracy: 0.9329527155614112
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.93      0.93      3529
         1.0       0.92      0.94      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831

