In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import pos_tag
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import gensim.downloader as api
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the data
# Both CSVs are read from the Colab working directory, training file first.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

# Step 2: Download NLTK resources
# Fetched in a fixed order; `stopwords` must be present before the stop-word
# set below can be built. NOTE(review): 'averaged_perceptron_tagger' is
# downloaded although `pos_tag` is never called in this cell.
for nltk_resource in (
    'stopwords',
    'punkt_tab',
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
):
    nltk.download(nltk_resource)

# A set gives constant-time membership checks during token filtering.
stop_words = set(stopwords.words('english'))

# Initialize Stemmer and Lemmatizer
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Step 3: Define text cleaning functions
def clean_text(text):
    """Normalise a raw tweet to lowercase text made of ASCII letters and whitespace.

    Processing order matters: URLs and @mentions are removed while their
    punctuation is still present, and only then is every remaining
    non-letter character stripped.

    Args:
        text: The raw tweet. Any non-string value (e.g. ``None`` or the float
            ``NaN`` pandas uses for missing cells) is treated as empty text.

    Returns:
        str: The cleaned, lower-cased text. Whitespace is NOT collapsed, so
        removed tokens may leave runs of spaces behind.
    """
    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise TypeError.
        return ''
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove @mentions entirely
    text = text.replace('#', '')  # Drop only the '#' sign; the hashtag word is kept
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove everything except letters/whitespace
    return text.lower()

# Apply Stemming, Lemmatization, Stopword Removal
def preprocess_text(text):
    """Clean, tokenize, filter and normalise one tweet into a space-joined string.

    Steps: `clean_text` -> `word_tokenize` -> drop tokens found in
    `stop_words` -> lemmatize -> stem -> join with single spaces.

    Args:
        text: Raw tweet text, passed straight to `clean_text`.

    Returns:
        str: The normalised tokens joined by single spaces (empty string when
        no token survives filtering).
    """
    tokens = word_tokenize(clean_text(text))
    # Lemmatize BEFORE stemming. The lemmatizer is a dictionary lookup, so it
    # needs real word forms; feeding it stems (the previous order) made it a
    # near no-op because most stems are not dictionary entries.
    normalised = [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in stop_words  # filter on the surface form, before normalising
    ]
    return ' '.join(normalised)

train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# Step 4: N-grams (Unigrams, Bigrams, Trigrams)
def extract_ngrams(text, n=2):
    """Return every contiguous n-token sequence of `text` as a space-joined string.

    Args:
        text: Text to tokenize with `word_tokenize`.
        n: Number of tokens per n-gram (default 2, i.e. bigrams).

    Returns:
        list[str]: The n-grams in order of appearance.
    """
    token_list = word_tokenize(text)
    joined_grams = []
    for gram in ngrams(token_list, n):
        joined_grams.append(' '.join(gram))
    return joined_grams

# Example: Adding bigrams and trigrams as features
# Each frame gets a 'bigrams' and a 'trigrams' column holding the list of
# n-grams extracted from its already-preprocessed 'text' column.
for frame in (train_df, test_df):
    for column_name, gram_size in (('bigrams', 2), ('trigrams', 3)):
        # Bind gram_size as a default so the lambda does not see a later loop value.
        frame[column_name] = frame['text'].apply(
            lambda doc, gram_size=gram_size: extract_ngrams(doc, n=gram_size)
        )

# Step 5: Data splitting
# 80/20 hold-out split of the preprocessed training tweets, seeded for reproducibility.
X, y = train_df['text'], train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: TF-IDF with N-grams
# Unigrams through trigrams, vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=5000,
)
# Fit on the training split only; the test split is merely transformed.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 7: Handle class imbalance using resampling
def balance_classes(df):
    """Oversample every smaller class of `df['target']` up to the largest class size.

    Class sizes are measured from the data instead of assuming which label is
    the majority. All original rows are kept; each smaller class receives
    additional rows drawn from itself with replacement (fixed seed 42) until
    it matches the largest class.

    Args:
        df: DataFrame with a 'target' column holding the class labels.

    Returns:
        pandas.DataFrame: A new frame grouped by class, largest class first.
        Index labels of duplicated rows repeat. Rows whose 'target' is
        missing are not included, because `value_counts` ignores them.
        An empty/label-less input yields an unmodified copy.
    """
    class_sizes = df['target'].value_counts()
    if class_sizes.empty:
        # Nothing to balance (and pd.concat would reject an empty list).
        return df.copy()

    largest = class_sizes.max()
    balanced_parts = []
    for label, size in class_sizes.items():
        class_rows = df[df['target'] == label]
        shortfall = largest - size
        if shortfall > 0:
            # Top up with random duplicates rather than replacing the originals.
            extra_rows = class_rows.sample(n=shortfall, replace=True, random_state=42)
            class_rows = pd.concat([class_rows, extra_rows])
        balanced_parts.append(class_rows)
    return pd.concat(balanced_parts)

# Split FIRST, then balance only the training portion. Oversampling before
# the split places copies of the same tweet in both train and test, which
# leaks training data into the evaluation and inflates every metric.
# Same test_size/random_state as the Step 5 split.
train_part, holdout_part = train_test_split(train_df, test_size=0.2, random_state=42)

# Shuffle after balancing because balance_classes returns rows grouped by class.
df_upsampled = balance_classes(train_part).sample(frac=1, random_state=42)
X_upsampled = df_upsampled['text']
y_upsampled = df_upsampled['target']

# The *_upsampled names are kept for the cells below; note that the test pair
# is the untouched hold-out set, i.e. it keeps its natural class balance.
X_train_upsampled, y_train_upsampled = X_upsampled, y_upsampled
X_test_upsampled, y_test_upsampled = holdout_part['text'], holdout_part['target']

# Re-fit the vectorizer on the balanced training text only.
X_train_tfidf_upsampled = tfidf.fit_transform(X_train_upsampled)
X_test_tfidf_upsampled = tfidf.transform(X_test_upsampled)

# Shared reporting helper so the three evaluations below print the same layout.
def _print_evaluation(heading, y_true, y_pred):
    """Print `heading`, then accuracy, confusion matrix and classification report."""
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_true, y_pred)}")


# Step 8: Logistic Regression Model
model_lr = LogisticRegression(max_iter=200)
model_lr.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_lr = model_lr.predict(X_test_tfidf_upsampled)

# Evaluate Logistic Regression
_print_evaluation("Logistic Regression Model Evaluation:", y_test_upsampled, y_pred_lr)

# Step 9: Support Vector Classifier (SVC)
model_svc = SVC(kernel='linear', random_state=42)
model_svc.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_svc = model_svc.predict(X_test_tfidf_upsampled)

# Evaluate SVC
_print_evaluation("Support Vector Classifier Model Evaluation:", y_test_upsampled, y_pred_svc)

# Step 10: Random Forest Classifier
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_rf = model_rf.predict(X_test_tfidf_upsampled)

# Evaluate Random Forest
_print_evaluation("Random Forest Model Evaluation:", y_test_upsampled, y_pred_rf)

# Step 11: Plot Confusion Matrices
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Show an annotated heatmap of the confusion matrix for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with `y_true`.
        model_name: Name inserted into the figure title.

    Returns:
        None. The figure is displayed via `plt.show()`.
    """
    class_names = ['Not Disaster', 'Disaster']
    _, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',  # integer counts in each cell
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

# Plot for each model
# One heatmap per classifier, in the same order the models were trained.
for display_name, model_predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Support Vector Classifier", y_pred_svc),
    ("Random Forest", y_pred_rf),
):
    plot_confusion_matrix(y_test_upsampled, model_predictions, display_name)

# Step 12: Predict outcomes on test data using Random Forest Model
# The unlabeled tweets are vectorized with the already-fitted TF-IDF
# vocabulary (transform only, no re-fit) before being scored.
new_tweets = test_df['text']
X_test_tfidf_testing = tfidf.transform(new_tweets)
predictions = model_rf.predict(X_test_tfidf_testing)

# Display predictions
# Only the first ten tweet/prediction pairs are printed.
for tweet, pred in zip(new_tweets.iloc[:10], predictions[:10]):
    verdict = 'Real Disaster' if pred == 1 else 'Not Real Disaster'
    print(f"Tweet: {tweet}")
    print(f"Prediction: {verdict}")
    print("-" * 50)

In [1]:
# Importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk import pos_tag
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import gensim.downloader as api
import numpy as np


In [None]:
# Step 1 - Loading the Dataset
