In [16]:
import pandas as pd

# Load the training dataset
train_df = pd.read_csv('train1 .csv')
test_df = pd.read_csv('test1.csv')

# Load the test dataset (upload the file if needed)
# test_df = pd.read_csv('/mnt/data/test.csv')

# Display the first few rows of the training dataset
train_df.head()



Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [17]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean the text
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply cleaning to the training and test datasets
train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit on training data and transform both training and test data
X_train = tfidf.fit_transform(train_df['cleaned_text'])
X_test = tfidf.transform(test_df['cleaned_text'])

# Convert target to numpy array for training
y_train = train_df['target'].values


In [12]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Split the training data for internal validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC()
}

# Train and evaluate models on the split data
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    scores = cross_val_score(model, X_train_split, y_train_split, cv=5)
    print(f"{name} - Cross-Validation Accuracy: {scores.mean():.4f}")


Naive Bayes - Cross-Validation Accuracy: 0.7972
Logistic Regression - Cross-Validation Accuracy: 0.7979
Support Vector Machine - Cross-Validation Accuracy: 0.7926


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate each model
for name, model in models.items():
    y_pred = model.predict(X_val_split)
    print(f"{name} Validation Performance:")
    print(f"Accuracy: {accuracy_score(y_val_split, y_pred):.4f}")
    print(f"Precision: {precision_score(y_val_split, y_pred):.4f}")
    print(f"Recall: {recall_score(y_val_split, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_val_split, y_pred):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_val_split, y_pred)}\n")

# Predict on the actual test data (when test data is available)
# y_test_pred = model.predict(X_test)


Naive Bayes Validation Performance:
Accuracy: 0.7991
Precision: 0.8147
Recall: 0.6841
F1-Score: 0.7437
Confusion Matrix:
[[773 101]
 [205 444]]

Logistic Regression Validation Performance:
Accuracy: 0.7997
Precision: 0.8209
Recall: 0.6780
F1-Score: 0.7426
Confusion Matrix:
[[778  96]
 [209 440]]

Support Vector Machine Validation Performance:
Accuracy: 0.7932
Precision: 0.8224
Recall: 0.6564
F1-Score: 0.7301
Confusion Matrix:
[[782  92]
 [223 426]]

