Prepare environment

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

Load the files and view first column and first row

In [17]:
#Load the training data
train_df=pd.read_csv("data/training_data_lowercase.csv", 
                     header=None, names=["text"])
#Load the testing data
test_df=pd.read_csv("data/testing_data_lowercase_nolabels.csv",
                    header=None, names=["text"])

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

train_df.head()

Train shape: (68307, 1)
Test shape: (19971, 1)


Unnamed: 0,text
0,<<<<<<< HEAD
1,﻿0\tdonald trump sends out embarrassing new ye...
2,0\tdrunk bragging trump staffer started russia...
3,0\tsheriff david clarke becomes an internet jo...
4,0\ttrump is so obsessed he even has obama‚s na...


In [None]:
#Remove prefixes and reset row 0

In [18]:
bad_prefixes=("<<<<<<<","=======",">>>>>>>")
train_df=train_df[~train_df["text"].str.startswith(bad_prefixes)]
test_df=test_df[~test_df["text"].str.startswith(bad_prefixes)]

#Reset row index 
train_df=train_df.reset_index(drop=True)
test_df=test_df.reset_index(drop=True)

train_df.head()


Unnamed: 0,text
0,﻿0\tdonald trump sends out embarrassing new ye...
1,0\tdrunk bragging trump staffer started russia...
2,0\tsheriff david clarke becomes an internet jo...
3,0\ttrump is so obsessed he even has obama‚s na...
4,0\tpope francis just called out donald trump d...


Data Preprocessing

Tokenization 


In [None]:
text=
tokens= word_tokenize(text)
print("Tokens:", tokens)

Removing stopwords

In [None]:
stop_words = set(stopwords.words('english'))
filtered_tokens=[token for token in tokens if token.lower() not in stop_words]

Split train and validation

In [None]:
X=train_df["text"]
y=train_df["label"]

X_train,X_val,y_train,y_val=train_test_split(
    X,y,test_size=0.2, random_state=42,stratify=y
)

Feature Extraction TF-IDF

In [None]:
#Initialize the TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer()

#Fit and transform the corpus into a TF-IDF representation
X_tfidf=tfidf_vectorizer.fit_transform(corpus)

#Show results
print("TF-IDF:\n", X_tfidf.toarray())
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())

Vectorize with Naive Bayes

In [None]:
def text_preprocessing_pipeline(text):
    # Step 1: Tokenize the text
    tokens= word_tokenize(text.lower())
    # Step 2: Remove stop words
    stop_words=set(stopwords.words('english'))
    tokens=[token for token in tokens if token not in stop_words]
    # Step 3: Remove punctuation
    tokens=[token for token in tokens if token not in string.punctuation]
    # Step 4: Apply lemmatization
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[lemmatizer.lemmatize(token) for token in tokens]

    return lemmatized_tokens

In [None]:
model=make_pipeline(
    TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2), #try 1-3
        min_df=2,
        max_df=0.9,
        max_features=50000
    ),
    MultinomialNB(alpha=0.5) #try 0.1 and 1.0
)

model.fit(X_train,y_train)

Evaluate on validation set

In [None]:
y_pred=model.predict(X_val)
print("Validation accuracy:", round(accuracy_score(y_val,y_pred),4))
print(classification_report(y_val,y_pred,digits=3))
print(confusion_matrix(y_val,y_pred))


Now train on all data and predict test

In [None]:
#retrain on full training data
model.fit(train_df["text"], train_df["label"])

#Predict labels for test
test_pred=model.predict(test_df["text"]).astype(int)

