In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [59]:
# Read the two CSV files (paths are relative to the working directory) into DataFrames.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [60]:
# Bare expression so the notebook renders the frame; pandas elides the middle rows of a long frame.
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [61]:
# Class balance: number of rows for each value of the label column.
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [62]:
import re
import spacy
# Load the spaCy "en_core_web_sm" pipeline once; preprocess() below calls it on every text.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip @mentions, stop words, punctuation and stray whitespace from ``text``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving token texts joined by single spaces.
    """
    # Delete handles: an ``@`` followed by one or more word characters.
    text = re.sub(r'@\w+', '', text)
    doc = nlp(text)

    # Deleting a handle can leave leading or doubled spaces, which spaCy emits as
    # whitespace-only tokens. They are neither stop words nor punctuation, so they
    # must be filtered explicitly or they leak into the joined output.
    kept = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept)
# Smoke test: the @mention, the punctuation and the stop words should all be dropped.
preprocess("Hi @karen_langat, have you watched the new harry potter series?")

'Hi watched new harry potter series'

In [63]:
# Represent each text by its spaCy document vector. The following cells read this
# column as the model features, so it has to be created for the notebook to run
# from a fresh kernel (it was previously commented out).
# NOTE(review): this runs the spaCy pipeline on every row and may be slow.
# NOTE(review): "en_core_web_sm" does not appear to ship static word vectors —
# confirm whether a md/lg model was intended for this experiment.
train_df['text_vectors'] = train_df['text'].apply(lambda x: nlp(x).vector)

In [64]:
# Features are the per-row document vectors; labels are the 0/1 target column.
X = train_df['text_vectors']
y = train_df['target']

from sklearn.model_selection import train_test_split
# `random_state=` had no value, which is a SyntaxError. Seed it with 42, the same
# value the TF-IDF split further down uses, so both experiments share one 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [65]:
# Pack each Series of per-row vectors into a single array, one row per sample.
# Assumes every element is a vector of the same shape (np.stack requires equal shapes).
X_train_2d = np.stack(X_train.to_list(), axis=0)
X_test_2d = np.stack(X_test.to_list(), axis=0)

In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training rows only, then apply that same
# scaling to both splits. MultinomialNB is fitted on the rescaled training matrix
# (it does not accept negative feature values, hence the rescaling).
scaler = MinMaxScaler().fit(X_train_2d)
scaled_train_embed = scaler.transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf = MultinomialNB()

clf.fit(scaled_train_embed, y_train)

In [67]:
# Predict the held-out rows and print per-class precision / recall / F1.
y_pred = clf.predict(X=scaled_test_embed)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.58      0.96      0.73      1318
           1       0.57      0.07      0.12       966

    accuracy                           0.58      2284
   macro avg       0.58      0.51      0.42      2284
weighted avg       0.58      0.58      0.47      2284



In [68]:
from sklearn.metrics import f1_score

# F1 with f1_score's defaults, i.e. the binary score for the positive class (label 1).
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print("F1 Score:", f1)

F1 Score: 0.1171003717472119


### TF-IDF

In [73]:
# Replace every text with its preprocess()-cleaned form.
# NOTE(review): this overwrites the raw 'text' column in place, so re-running the cell
# cleans already-cleaned text and the original strings are no longer available.
train_df['text'] = train_df['text'].map(preprocess)

In [74]:
from sklearn.model_selection import train_test_split

# Cleaned text is the feature, the 0/1 target column is the label; hold out 30% of rows.
X, y = train_df['text'], train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features feed a random forest. Wrapping both in a Pipeline means the
# vectorizer vocabulary is learned from the training texts only.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    # The forest is randomised; without a seed the reported metrics change on every
    # run. Use the same seed as the train/test split so results are reproducible.
    ('rf', RandomForestClassifier(random_state=42)),
])

clf.fit(X_train, y_train)

In [76]:
# Predict the held-out texts and print per-class precision / recall / F1.
y_pred = clf.predict(X=X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82      1318
           1       0.81      0.61      0.70       966

    accuracy                           0.77      2284
   macro avg       0.79      0.75      0.76      2284
weighted avg       0.78      0.77      0.77      2284



In [77]:
from sklearn.metrics import f1_score

# F1 with f1_score's defaults, i.e. the binary score for the positive class (label 1).
f1 = f1_score(y_true=y_test, y_pred=y_pred)

print("F1 Score:", f1)

F1 Score: 0.6951364175563464
