In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` is read from it
# later in this notebook. The call's return value is what the cell displays.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Fetch the Punkt tokenizer data; presumably this is what `word_tokenize`
# relies on later in the notebook — TODO confirm for the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Labelled training data; the 'tweet' and 'label' columns are used below.
TRAIN_CSV_PATH = 'train_E6oV3lV.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [5]:
# Hold out 20% of the labelled tweets for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['tweet'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

# English stop words kept as a set so the membership test in tokenize_tweet is cheap.
stop_words = {*stopwords.words('english')}

def tokenize_tweet(tweet):
    """Lowercase a tweet, tokenize it, and drop stop-word tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or None) is treated as an
        empty tweet instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words`` set,
        joined by single spaces. Returns '' when the input is not a string
        or when no tokens remain.
    """
    # Missing values are not str and have no .lower(); map them to an empty
    # document so a single blank row cannot abort the whole Series.apply().
    if not isinstance(tweet, str):
        return ''
    tokens = word_tokenize(tweet.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean both splits with the same tokenizer (training split first, then validation).
X_train_tokenized, X_val_tokenized = (
    tweets.apply(tokenize_tweet) for tweets in (X_train, X_val)
)

# The vectorizer is fitted on the training split only and then reused as-is
# for the validation split.
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train_tokenized)
X_val_tfidf = vectorizer.transform(X_val_tokenized)

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the training TF-IDF matrix (fit() returns the
# estimator itself), then predict labels for the validation split.
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_val_tfidf)

In [8]:
# Accuracy of the Naive Bayes predictions on the validation split.
print("Validation metrics:")
nb_accuracy = accuracy_score(y_val, y_pred_nb)
print("Accuracy:", nb_accuracy)

Validation metrics:
Accuracy: 0.9508837791334271


In [9]:
# Per-class precision, recall and F1 for the Naive Bayes validation predictions.
print("Classification Report:")
nb_report = classification_report(y_val, y_pred_nb)
print(nb_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.94      0.33      0.49       456

    accuracy                           0.95      6393
   macro avg       0.95      0.66      0.73      6393
weighted avg       0.95      0.95      0.94      6393



In [10]:
# Test tweets; only the 'tweet' column is read in the cells below.
TEST_CSV_PATH = 'test_tweets_anuFYb8.csv'
test_df = pd.read_csv(TEST_CSV_PATH)

In [11]:
# Apply the same cleaning and the already-fitted vectorizer to the test tweets
# (transform only, no refit).
test_tweets = test_df['tweet']
X_test_tokenized = test_tweets.apply(tokenize_tweet)
X_test_tfidf = vectorizer.transform(X_test_tokenized)

In [20]:
# Predict labels for the test tweets with the fitted Naive Bayes model.
# NOTE(review): these predictions are not used again in the visible cells.
y_pred_test_nb = nb.predict(X_test_tfidf)

In [22]:
from sklearn.metrics import f1_score

# F1 score (f1_score with its default settings) of the Naive Bayes model on
# the validation split.
y_val_true = y_val
y_pred = nb.predict(X_val_tfidf)
f1 = f1_score(y_val_true, y_pred)

print("Validation F1-Score Naive Bayes:", f1)

Validation F1-Score Naive Bayes: 0.49025974025974023


In [25]:
from sklearn.svm import SVC

# Linear-kernel SVM with C=1, fitted on the training TF-IDF matrix (fit()
# returns the estimator itself), then used to predict the validation split.
svm = SVC(kernel='linear', C=1).fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_val_tfidf)

In [26]:
# Accuracy of the SVM predictions on the validation split.
print("Validation metrics:")
svm_accuracy = accuracy_score(y_val, y_pred_svm)
print("Accuracy:", svm_accuracy)

Validation metrics:
Accuracy: 0.9582355701548568


In [27]:
# Per-class precision, recall and F1 for the SVM validation predictions.
print("Classification Report:")
svm_report = classification_report(y_val, y_pred_svm)
print(svm_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5937
           1       0.88      0.48      0.62       456

    accuracy                           0.96      6393
   macro avg       0.92      0.74      0.80      6393
weighted avg       0.96      0.96      0.95      6393



In [30]:
from sklearn.metrics import f1_score

# F1 score (f1_score with its default settings) of the SVM model on the
# validation split.
y_pred = svm.predict(X_val_tfidf)
y_val_true = y_val

f1 = f1_score(y_val_true, y_pred)
# The score comes from `svm`, so the printed label names the SVM rather than
# Naive Bayes.
print("Validation F1-Score SVM:", f1)

Validation F1-Score Naive Bayes: 0.620199146514936


In [31]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fitted on the training TF-IDF
# matrix (fit() returns the estimator itself), then used on the validation split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_val_tfidf)

In [32]:
# Accuracy of the random-forest predictions on the validation split.
print("Validation metrics:")
rf_accuracy = accuracy_score(y_val, y_pred_rf)
print("Accuracy:", rf_accuracy)

Validation metrics:
Accuracy: 0.9580791490692946


In [33]:
# Per-class precision, recall and F1 for the random-forest validation predictions.
print("Classification Report:")
rf_report = classification_report(y_val, y_pred_rf)
print(rf_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5937
           1       0.86      0.49      0.63       456

    accuracy                           0.96      6393
   macro avg       0.91      0.74      0.80      6393
weighted avg       0.95      0.96      0.95      6393



In [34]:
from sklearn.metrics import f1_score

# F1 score (f1_score with its default settings) of the random-forest model on
# the validation split.
y_pred = rf.predict(X_val_tfidf)
y_val_true = y_val

f1 = f1_score(y_val_true, y_pred)
# The score comes from `rf`, so the printed label names the random forest
# rather than Naive Bayes.
print("Validation F1-Score Random Forest:", f1)

Validation F1-Score Naive Bayes: 0.6256983240223464


In [36]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 stages, learning rate 0.1 and a fixed seed, fitted
# on the training TF-IDF matrix (fit() returns the estimator itself), then used
# to predict the validation split.
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_tfidf, y_train)
y_pred_gb = gb.predict(X_val_tfidf)

In [37]:
# Accuracy of the gradient-boosting predictions on the validation split.
print("Validation metrics:")
gb_accuracy = accuracy_score(y_val, y_pred_gb)
print("Accuracy:", gb_accuracy)

Validation metrics:
Accuracy: 0.9457218833098702


In [38]:
# Per-class precision, recall and F1 for the gradient-boosting validation predictions.
print("Classification Report:")
gb_report = classification_report(y_val, y_pred_gb)
print(gb_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.89      0.27      0.42       456

    accuracy                           0.95      6393
   macro avg       0.92      0.63      0.69      6393
weighted avg       0.94      0.95      0.93      6393



In [39]:
from sklearn.metrics import f1_score

# F1 score (f1_score with its default settings) of the gradient-boosting model
# on the validation split.
y_pred = gb.predict(X_val_tfidf)
y_val_true = y_val

f1 = f1_score(y_val_true, y_pred)
# The score comes from `gb`, so the printed label names gradient boosting
# rather than Naive Bayes.
print("Validation F1-Score Gradient Boosting:", f1)

Validation F1-Score Naive Bayes: 0.41680672268907565


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with one hidden layer of 100 units, up to 1000
# iterations and a fixed seed, fitted on the training TF-IDF matrix (fit()
# returns the estimator itself), then used to predict the validation split.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_tfidf, y_train)
y_pred_mlp = mlp.predict(X_val_tfidf)

In [None]:
# Accuracy of the MLP predictions on the validation split.
print("Validation metrics:")
mlp_accuracy = accuracy_score(y_val, y_pred_mlp)
print("Accuracy:", mlp_accuracy)

In [None]:
# Per-class precision, recall and F1 for the MLP validation predictions.
print("Classification Report:")
mlp_report = classification_report(y_val, y_pred_mlp)
print(mlp_report)