<a href="https://colab.research.google.com/github/BaberFaisal/Natural-Language-Processing-with-Disaster-Tweets_using_-logistic-and-SVM/blob/main/Natural_Language_Processing_with_Disaster_Tweets_using__logistic_and_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import string
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Directory that holds the NLTK resources. Expanding "~" resolves to
# /root/nltk_data when running as root (e.g. on Colab) but also works for any
# other user or machine instead of assuming a fixed absolute path.
nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)  # safe if the directory already exists

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to NLTK's search path.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Resources used by the tokenizer, the stopword filter and the lemmatizer.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'punkt_tab')

# nltk.download reports success as a boolean; collect anything that failed so
# the problem surfaces here rather than as a LookupError deep inside a later cell.
failed_resources = [
    resource
    for resource in NLTK_RESOURCES
    if not nltk.download(resource, download_dir=nltk_data_path)
]
if failed_resources:
    raise RuntimeError(f"Could not download NLTK resources: {failed_resources}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Read the training tweets (with their 'target' labels) into a DataFrame
TRAIN_CSV_PATH = '/content/train (1).csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Quick look at the loaded frame; the rendered table is truncated to the first and last rows.
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
# Preprocessing the text data

# Compiled once here instead of being re-parsed on every call.
_URL_PATTERN = re.compile(r'http\S+')

# Deletion table for every character in string.punctuation. A regex character
# class built by interpolating string.punctuation treats the "\]" pair as an
# escaped "]" and therefore never removes backslashes; translate() has no such
# escaping pitfalls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stopwords():
    """Load the NLTK English stopword list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _get_lemmatizer():
    """Create the WordNet lemmatizer once and reuse it across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """
    Cleans and preprocesses text data.

    Steps:
    1. Convert text to lowercase to ensure uniformity.
    2. Remove URLs to clean the text.
    3. Remove punctuation to reduce noise.
    4. Tokenize text into words.
    5. Remove stopwords to focus on meaningful words.
    6. Apply lemmatization to get the root form of words.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN or None coming from a missing
        DataFrame cell) are treated as empty text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; an empty
        string when nothing is left or the input was not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None and have no .lower().
        return ''

    text = text.lower()  # Convert to lowercase
    text = _URL_PATTERN.sub('', text)  # Remove URLs
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation

    # Set membership is O(1); the stopword file is read only on the first call
    # rather than once per token.
    stop_words = _english_stopwords()
    lemmatizer = _get_lemmatizer()

    words = word_tokenize(text)  # Tokenization
    return ' '.join(
        lemmatizer.lemmatize(word)  # Lemmatization
        for word in words
        if word not in stop_words  # Remove stopwords
    )


In [None]:
# Clean every raw tweet and store the result next to the original text
cleaned_tweets = df['text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_tweets

In [None]:
# Inspect the frame again to confirm the new 'cleaned_text' column was added.
df

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant crane holding bridge collapse nearby...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fire calif...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m194 0104 utc5km volcano hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,police investigating ebike collided car little...


In [None]:
# Step 2: Feature Engineering
# Turn the cleaned tweets into two numeric representations:
# raw term counts (Bag of Words) and TF-IDF weights.
# NOTE(review): both vectorizers are fitted here on every row of df.
corpus = df['cleaned_text']

vectorizer_bow, vectorizer_tfidf = CountVectorizer(), TfidfVectorizer()

X_bow = vectorizer_bow.fit_transform(corpus)      # BoW representation
X_tfidf = vectorizer_tfidf.fit_transform(corpus)  # TF-IDF representation

In [None]:
# Step 3: Model Training & Hyperparameter Tuning
# Prepare data for training
y = df['target']  # Assuming 'target' column contains labels

# Split the cleaned text once, so the BoW and TF-IDF matrices are guaranteed to
# cover exactly the same rows and share one set of labels (no need for two
# separate splits that only line up because they reuse the same random_state).
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Fit each vectorizer on the training rows only and merely transform the
# held-out rows. Learning the vocabulary / IDF weights from the full frame
# would leak information from the validation rows into the features and make
# the reported accuracy optimistic. The existing vectorizer objects are re-fit
# (rather than replaced) so later cells that call vectorizer_tfidf.transform
# get features matching the models trained on X_train_tfidf.
X_train_bow = vectorizer_bow.fit_transform(text_train)
X_test_bow = vectorizer_bow.transform(text_test)

X_train_tfidf = vectorizer_tfidf.fit_transform(text_train)
X_test_tfidf = vectorizer_tfidf.transform(text_test)


In [None]:
# Fit the two baseline classifiers (default hyperparameters) on the TF-IDF
# training matrix
log_reg, svm = LogisticRegression(), SVC()

for classifier in (log_reg, svm):
    classifier.fit(X_train_tfidf, y_train)

In [None]:
# Step 4: Model Evaluation
# Predict the held-out TF-IDF rows with each classifier, then report accuracy
y_pred_log_reg = log_reg.predict(X_test_tfidf)
y_pred_svm = svm.predict(X_test_tfidf)

for label, predictions in (
    ("Logistic Regression Accuracy:", y_pred_log_reg),
    ("SVM Accuracy:", y_pred_svm),
):
    print(label, accuracy_score(y_test, predictions))

Logistic Regression Accuracy: 0.7931713722915299
SVM Accuracy: 0.7964543663821405


In [None]:
# Step 5: Hyperparameter tuning for Logistic Regression
# Try several values of the regularisation parameter C with 5-fold
# cross-validation on the TF-IDF training matrix and keep the best estimator.
param_grid = {'C': [0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

best_model = grid_search.best_estimator_

In [None]:
# Step 6: Final prediction for Kaggle submission
# Load the Kaggle test tweets, clean them with the same function used for the
# training data, vectorize them with the already-fitted TF-IDF vectorizer and
# predict with best_model.
test_df = pd.read_csv('/content/test (1).csv')

cleaned_test_text = test_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = cleaned_test_text

X_test_final = vectorizer_tfidf.transform(cleaned_test_text)
test_predictions = best_model.predict(X_test_final)
