In [244]:
import pandas as pd

In [245]:
df = pd.read_csv("./training_data_lowercase.csv", delimiter='\t', header=None, names=['label', 'text'])

In [246]:
df.head()

Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [247]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [248]:
X_train.head()

8891                                                    so
25115    final reckoning approaches for obama's high co...
26933    illinois budget talks fizzle amid partisan ent...
26971    clinton spokesman: ig report shows no clinton ...
11387    busted! nancy pelosi claims no meeting with ru...
Name: text, dtype: object

In [249]:
df['X_train'] = X_train
df['y_train'] = y_train

In [250]:
df.head()

Unnamed: 0,label,text,X_train,y_train
0,0,donald trump sends out embarrassing new year‚s...,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0
4,0,pope francis just called out donald trump duri...,,


## Data preprocessing

In [251]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [252]:
# tokenization
from nltk.tokenize import word_tokenize

def tokanize_data(sentence):
    if isinstance(sentence, str):
        words = word_tokenize(sentence)
        return ' '.join(words)
    else:
        return ''



In [253]:
df['clean_text'] = df['X_train'].apply(tokanize_data)

In [254]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [255]:
# remove punctuation
# remove special character
# remove numbers
# remove single character
# remove single character from start
# Substitute multiple spaces with a single space
# Remove prefixed 'b'
import string
import re

def clean_text(sentence):
    clean_text = sentence.translate(str.maketrans('','', string.punctuation))
    cleaned_text = re.sub(r'[^\w\s]', '', clean_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\b\w{1}\b', '', cleaned_text)
    cleaned_text = re.sub(r'^\s*\w{1}\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    cleaned_text = re.sub(r'^b\s*', '', cleaned_text)
    return clean_text

In [256]:
df['clean_text'] = df['clean_text'].apply(clean_text)

In [257]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...,,,


In [258]:
# remove stopwords
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [259]:
def remove_stopwords(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    cleaned_text = ' '.join(word for word in words if word not in stop_words)
    return cleaned_text

In [260]:
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

In [261]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsessed even obama‚s name coded website...
4,0,pope francis just called out donald trump duri...,,,


In [262]:
# lemmatization
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/ankita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ankita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [263]:
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(sentence):
    words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    cleaned_text = ' '.join(lemmatized_words)
    return cleaned_text

In [264]:
df['clean_text'] = df['clean_text'].apply(apply_lemmatization)

In [265]:
df.head()

Unnamed: 0,label,text,X_train,y_train,clean_text
0,0,donald trump sends out embarrassing new year‚s...,,,
1,0,drunk bragging trump staffer started russian c...,drunk bragging trump staffer started russian c...,0.0,drink brag trump staffer start russian collusi...
2,0,sheriff david clarke becomes an internet joke ...,sheriff david clarke becomes an internet joke ...,0.0,sheriff david clarke become internet joke thre...
3,0,trump is so obsessed he even has obama‚s name ...,trump is so obsessed he even has obama‚s name ...,0.0,trump obsess even obama‚s name cod website image
4,0,pope francis just called out donald trump duri...,,,


## Feature extraction

In [266]:
print("Number of NaN values in y:", df['y_train'].isna().sum())

Number of NaN values in y: 6831


In [267]:
df = df.dropna(subset=['y_train'])

In [None]:
# using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# tfidf_vectorizer.transform() TODO
# Concatenate the TF-IDF DataFrame with the original DataFrame
#df_combined = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)

# Display the combined DataFrame
#print(df_combined.head())

# for doc, tf_idf_doc in zip(df['lemmatized_text'], tfidf_matrix.todense()):
#     print("DOC:", doc)
#     print(np.around(tf_idf_doc,5))
#     print()

   label                                               text  \
0      0  drunk bragging trump staffer started russian c...   
1      0  sheriff david clarke becomes an internet joke ...   
2      0  trump is so obsessed he even has obama‚s name ...   
3      0  racist alabama cops brutalize black boy while ...   
4      0  brand-new pro-trump ad features so much a** ki...   

                                             X_train  y_train  \
0  drunk bragging trump staffer started russian c...      0.0   
1  sheriff david clarke becomes an internet joke ...      0.0   
2  trump is so obsessed he even has obama‚s name ...      0.0   
3  racist alabama cops brutalize black boy while ...      0.0   
4  brand-new pro-trump ad features so much a** ki...      0.0   

                                          clean_text   00  0149   02  025  \
0  drink brag trump staffer start russian collusi...  0.0   0.0  0.0  0.0   
1  sheriff david clarke become internet joke thre...  0.0   0.0  0.0  0.0   

In [269]:
tfidf_df.head()

Unnamed: 0,00,0149,02,025,0330,0930,10,100,1000,10000,...,zuckerberg,zulia,zuma,zummar,zurich,éblacklivesmatter,îfor,îing,îpence,øqu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model training


In [270]:
# using logistic regression
from sklearn.linear_model import LogisticRegression

In [271]:
X = tfidf_df
y = df['y_train']

print(X.head())
print()
print(y.head())

    00  0149   02  025  0330  0930   10  100  1000  10000  ...  zuckerberg  \
0  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
1  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
2  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
3  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   
4  0.0   0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0    0.0  ...         0.0   

   zulia  zuma  zummar  zurich  éblacklivesmatter  îfor  îing  îpence  øqu  
0    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
1    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
2    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
3    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  
4    0.0   0.0     0.0     0.0                0.0   0.0   0.0     0.0  0.0  

[5 rows x 15626 columns]

1    0.0
2    0.0
3    0.0
5    0.0
9    0

In [272]:
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [273]:
model = LogisticRegression(max_iter=1000)
model.fit(data_train,label_train)

In [274]:
y_pred = model.predict(data_test)

In [275]:
from sklearn.metrics import accuracy_score, classification_report

In [277]:
# Evaluate the model
print("Accuracy:", accuracy_score(label_test, y_pred))
print("Classification Report:\n", classification_report(label_test, y_pred))

Accuracy: 0.937419945105215
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      2821
         1.0       0.93      0.94      0.94      2644

    accuracy                           0.94      5465
   macro avg       0.94      0.94      0.94      5465
weighted avg       0.94      0.94      0.94      5465



In [278]:
# now predict X_test and y_test
y_pred_for_new_label = model.predict(X_test)



ValueError: could not convert string to float: 'release of house tax bill delayed until thursday'