In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import nltk

# Fetch the NLTK resources used by the text preprocessing step
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Read the training and test CSVs into DataFrames (training file first)
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test (1).csv')
)

In [None]:
# Text preprocessing function
# Patterns are compiled once; they match exactly what the cleaning steps remove.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')

# Filled on first use so the stop-word list and lemmatizer are built once
# instead of once per row when the function is used with Series.apply.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _NLP_RESOURCES:
        # Build both before storing so a failure never leaves a half-filled cache.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one piece of raw text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip URLs, drop every character that is not
    an ASCII letter or whitespace, collapse whitespace, remove English stop
    words, and lemmatize the remaining words with WordNet.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float NaN
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text; ``''`` when nothing remains after cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower() with AttributeError.
        return ''

    text = text.lower()  # Convert to lowercase
    text = _URL_PATTERN.sub('', text)  # Remove URLs
    text = _NON_LETTER_PATTERN.sub('', text)  # Remove special characters
    text = _WHITESPACE_PATTERN.sub(' ', text).strip()  # Remove extra spaces
    if not text:
        # Nothing left to tokenise; skip loading the NLTK resources.
        return ''

    stop_words, lemmatizer = _get_nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [None]:
# Add a 'cleaned_text' column to both frames by cleaning each row's 'text'
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['text'].apply(preprocess_text)

In [None]:
# Replace missing 'keyword' values with the placeholder 'none' in both frames
# (only 'keyword' is filled here; 'location' is left as loaded)
for frame in (train_df, test_df):
    frame['keyword'] = frame['keyword'].fillna('none')

In [None]:
# Build 'combined_text' as "<keyword> <cleaned_text>" for every row
train_df['combined_text'] = train_df['keyword'].str.cat(train_df['cleaned_text'], sep=' ')
test_df['combined_text'] = test_df['keyword'].str.cat(test_df['cleaned_text'], sep=' ')

NameError: name 'train_df' is not defined

In [None]:
# Model input is the combined text column; the label is the 'target' column
X, y = train_df['combined_text'], train_df['target']

In [None]:
# Preview the first rows of the combined-text feature series
X.head()

Unnamed: 0,combined_text
0,none deed reason earthquake may allah forgive u
1,none forest fire near la ronge sask canada
2,none resident asked shelter place notified off...
3,none people receive wildfire evacuation order ...
4,none got sent photo ruby alaska smoke wildfire...


In [None]:
# Preview the first rows of the target labels
y.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers, keyed by display name; train_and_evaluate() iterates over them in this order
RANDOM_STATE = 42
models = {
    'Logistic Regression': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC(kernel='linear', random_state=RANDOM_STATE),
}

In [None]:
# Grid searched for the Logistic Regression pipeline. Keys follow the
# "<step>__<parameter>" form, addressing the 'tfidf' and 'clf' pipeline steps.
param_grid = dict(
    clf__C=[0.01, 0.1, 1, 10],
    clf__penalty=['l2'],
    tfidf__max_features=[5000, 10000, 20000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
)

In [None]:
# Train and evaluate models
def train_and_evaluate():
    """Fit every candidate in ``models`` and return the best one on validation.

    Each classifier is wrapped in a ``TfidfVectorizer`` + classifier
    ``Pipeline``. The 'Logistic Regression' entry is tuned with a 3-fold
    ``GridSearchCV`` over ``param_grid``; every other entry is fitted as
    configured. For each model the validation accuracy and classification
    report on ``(X_val, y_val)`` are printed.

    Reads the notebook-level names ``models``, ``param_grid``, ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``.

    Returns:
        The fitted pipeline with the highest validation accuracy (the earliest
        one wins ties), or ``None`` if no model scored above zero, e.g. when
        ``models`` is empty.
    """
    best_pipeline = None
    best_model_name = ''
    best_accuracy = 0

    for name, model in models.items():
        print(f"\nTraining {name}...")
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', model)
        ])
        if name == 'Logistic Regression':
            grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            fitted_pipeline = grid_search.best_estimator_
            print("Best Params:", grid_search.best_params_)
        else:
            pipeline.fit(X_train, y_train)
            fitted_pipeline = pipeline

        y_val_pred = fitted_pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_val_pred)
        print(f"{name} Accuracy: {accuracy:.2%}")
        print(f"Classification Report for {name}:\n", classification_report(y_val, y_val_pred))

        if accuracy > best_accuracy:
            # Keep the pipeline together with its score so the function returns
            # the actual winner, not whichever model happened to be fitted last.
            best_accuracy = accuracy
            best_model_name = name
            best_pipeline = fitted_pipeline

    print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.2%}")
    return best_pipeline

# Run the model comparison; the returned pipeline is what later cells use to predict on test_df
best_pipeline = train_and_evaluate()


Training Logistic Regression...
Best Params: {'clf__C': 1, 'clf__penalty': 'l2', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Logistic Regression Accuracy: 81.55%
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       869
           1       0.83      0.72      0.77       654

    accuracy                           0.82      1523
   macro avg       0.82      0.80      0.81      1523
weighted avg       0.82      0.82      0.81      1523


Training Random Forest...
Random Forest Accuracy: 78.79%
Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.77      0.91      0.83       869
           1       0.84      0.63      0.72       654

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.77      1523
weighted avg       0.80      0.79      0.78      1523


Training Support Ve

In [None]:
# Predict a label for every test row using the pipeline returned by train_and_evaluate()
test_texts = test_df['combined_text']
test_df['predictions'] = best_pipeline.predict(test_texts)

In [None]:
# Show the first few test rows next to their predicted labels
preview_columns = ['id', 'combined_text', 'predictions']
print(test_df[preview_columns].head())

   id                                      combined_text  predictions
0   0                   none happened terrible car crash            0
1   2  none heard earthquake different city stay safe...            1
2   3  none forest fire spot pond goose fleeing acros...            1
3   9          none apocalypse lighting spokane wildfire            1
4  11            none typhoon soudelor kill china taiwan            1


In [None]:
# Save submission
sample_submission = pd.read_csv('sample_submission.csv')
# Assigning a Series aligns on index labels and silently fills unmatched rows
# with NaN, so check the row counts and copy the predictions positionally.
# NOTE(review): assumes both files list rows in the same order — confirm,
# e.g. by comparing their id columns.
if len(sample_submission) != len(test_df):
    raise ValueError(
        f"sample_submission has {len(sample_submission)} rows but test_df has "
        f"{len(test_df)}; refusing to write a misaligned submission"
    )
sample_submission['target'] = test_df['predictions'].to_numpy()
submission_path = 'disaster_predictions_submission.csv'
sample_submission.to_csv(submission_path, index=False)
print(f"Submission file saved to {submission_path}")