Prepare environment

In [5]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\appul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\appul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Load the files

In [6]:
#Load the training data
train=pd.read_csv("data/training_data_lowercase.csv",
                   sep="\t",
                   header=None,
                   names=["label","text"])

#Load the testing data
test=pd.read_csv("data/testing_data_lowercase_nolabels.csv",
                  header=None,
                  names=["text"])

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train.head())
print(train.columns.tolist())

Train shape: (34152, 2)
Test shape: (19971, 1)
   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
['label', 'text']


Preprocessing

In [None]:
def clean_text(text):
    text=re.sub("r[^a-z\s]","",text)
    #Tokenize
    tokens=word_tokenize(text)
    #Stopwords
    sw=set(stopwords.words("english"))
    tokens=[w for w in tokens if w not in sw]
    #Lemmatization
    lem=WordNetLemmatizer()
    tokens=[lem.lemmatize(w) for w in tokens]
    #Remove short tokens
    tokens=[w for w in tokens if len(w)>2]
    return"".join(tokens)

print("\n Cleaning text")
train["clean_text"]=train["text"].apply(clean_text)
test["clean_text"]=test["text"].apply(clean_text)


 Cleaning text


Split, vectorizing and naive bayes

In [None]:
X=["clean_text"]
y=["label"].astype(int)

X_train,X_val,y_train,y_val=train_test_split(
    X,y,test_size=0.2, random_state=42,stratify=y
)

vectorizer=TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,3),
    max_features=40000,
    min_df=3
)

X_train_tfidf=vectorizer.fit_transform(X_train)
X_val_tfidf=vectorizer.transform(X_val)
X_test_tfidf=vectorizer.transform(test["clean_text"])

nb=MultinomialNB(alpha=1.0)
nb.fit(X_train_tfidf,y_train)

y_val_pred=nb.predict(X_val_tfidf)
acc=accuracy_score(y_val,y_val_pred)

print(f"\n Validation Accuracy:{acc:4f}\n")
print("classification_report:\n", classification_report(y_val,y_val_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_val,y_val_pred))

AttributeError: 'NoneType' object has no attribute 'lower'

Vectorize with Naive Bayes

In [None]:
def text_preprocessing_pipeline(text):
    # Step 1: Tokenize the text
    tokens= word_tokenize(text.lower())
    # Step 2: Remove stop words
    stop_words=set(stopwords.words('english'))
    tokens=[token for token in tokens if token not in stop_words]
    # Step 3: Remove punctuation
    tokens=[token for token in tokens if token not in string.punctuation]
    # Step 4: Apply lemmatization
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[lemmatizer.lemmatize(token) for token in tokens]

    return lemmatized_tokens


In [None]:
model=make_pipeline(
    TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2), #try 1-3
        min_df=2,
        max_df=0.9,
        max_features=50000
    ),
    MultinomialNB(alpha=0.5) #try 0.1 and 1.0
)

model.fit(X_train,y_train)

NameError: name 'make_pipeline' is not defined

Evaluate on validation set

In [None]:
y_pred=model.predict(X_val)
print("Validation accuracy:", round(accuracy_score(y_val,y_pred),4))
print(classification_report(y_val,y_pred,digits=3))
print(confusion_matrix(y_val,y_pred))


Now train on all data and predict test

In [None]:
#retrain on full training data
model.fit(train_df["text"], train_df["label"])

#Predict labels for test
test_pred=model.predict(test_df["text"]).astype(int)

