<a href="https://colab.research.google.com/github/AadrikaJaiswal/Apna-Guide-Tasks/blob/main/Disaster_or_Not.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install nltk



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords
# (fetches the corpus that stopwords.words('english') reads inside
# preprocess_text; the call has to run before any tweet is cleaned).
nltk.download('stopwords')

# Load the datasets
# The rest of the script reads the 'tweet', 'keyword', 'place' and 'id'
# columns from both frames and the 'disaster' target from the training frame.
# NOTE(review): the absolute /content paths look like the Colab upload
# directory -- confirm or adjust when running outside Colab.
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

# Function to clean text data
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built on first use.

    The set is cached on the function object so the word list is requested
    from NLTK once per session instead of once per token, and so membership
    tests are hash lookups rather than scans of a list.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Clean a single tweet for vectorisation.

    Steps, in order: drop URLs (anything starting with ``http`` up to the
    next whitespace), drop every character that is not an ASCII letter or
    whitespace, lowercase, then remove English stopwords.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the float
            NaN pandas uses for a missing cell) are treated as an empty tweet.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing is
        left or the input was not a string.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    words = text.lower().split()

    # Nothing survived the cleaning, so there is nothing to filter.
    if not words:
        return ''

    stop_words = _english_stopwords()
    return ' '.join(word for word in words if word not in stop_words)

# Fill missing values with empty strings
# This runs BEFORE the text preprocessing: a missing tweet is loaded as NaN
# (a float), and filling first guarantees the regex-based cleaning only ever
# receives strings.
train_data.fillna('', inplace=True)
test_data.fillna('', inplace=True)

# Apply text preprocessing
train_data['tweet'] = train_data['tweet'].apply(preprocess_text)
test_data['tweet'] = test_data['tweet'].apply(preprocess_text)

# Encode categorical features
def _encode_test_column(fitted_encoder, values, unknown_code=-1):
    """Encode a test column with the codes learned from the training column.

    Args:
        fitted_encoder: A LabelEncoder already fitted on the training column.
        values: The pandas Series of raw test labels.
        unknown_code: Integer assigned to labels the encoder never saw.
            LabelEncoder only produces codes >= 0, so the default of -1
            cannot collide with a real class.

    Returns:
        An integer Series aligned with ``values``.
    """
    code_of = {label: code for code, label in enumerate(fitted_encoder.classes_)}
    unseen_labels = set(values) - set(code_of)
    if unseen_labels:
        print(f"Unseen labels found in test data: {unseen_labels}")
    # encoder.transform() raises on unseen labels, so look the codes up here
    # instead and fall back to unknown_code.
    return values.map(lambda label: code_of.get(label, unknown_code)).astype(int)


encoder = LabelEncoder()

# The same encoder object is re-fitted per column, so each test column must
# be encoded before the encoder is fitted on the next training column.
train_data['keyword'] = encoder.fit_transform(train_data['keyword'])
test_data['keyword'] = _encode_test_column(encoder, test_data['keyword'])

train_data['place'] = encoder.fit_transform(train_data['place'])
test_data['place'] = _encode_test_column(encoder, test_data['place'])

# TF-IDF representation of the cleaned tweets, vocabulary capped at 5000 terms.
# The vocabulary is learned from the training tweets only and reused for test.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_data['tweet'])
X_test_tfidf = vectorizer.transform(test_data['tweet'])

# Append the two encoded categorical columns to the right of the dense
# TF-IDF matrix, giving one row per tweet.
categorical_columns = ['keyword', 'place']
X_train = np.concatenate(
    [X_train_tfidf.toarray(), train_data[categorical_columns].to_numpy()],
    axis=1,
)
X_test = np.concatenate(
    [X_test_tfidf.toarray(), test_data[categorical_columns].to_numpy()],
    axis=1,
)

# Target column
y_train = train_data['disaster']

# Hold out 20% of the labelled rows for validation (fixed seed so the split
# is reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the remaining rows.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Score the held-out rows: overall accuracy first, then the per-class report.
y_val_pred = classifier.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", validation_accuracy)
validation_report = classification_report(y_val, y_val_pred)
print(validation_report)

# Predict a label for every row of the test set.
test_predictions = classifier.predict(X_test)

# Pair each test id with its predicted label and write the two columns
# ('id', 'disaster') to disk without the DataFrame index.
submission = test_data[['id']].assign(disaster=test_predictions)
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Validation Accuracy: 0.7721602101116218
              precision    recall  f1-score   support

           0       0.77      0.87      0.81       874
           1       0.78      0.65      0.71       649

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523

Submission file created: submission.csv
