<a href="https://colab.research.google.com/github/Akashchavan01041997/TEST-PROJECT/blob/main/disasterfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!pip install pandas nltk scikit-learn imbalanced-learn gensim textblob
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTETomek
from gensim.models import Word2Vec
from gensim.downloader import load
from textblob import TextBlob

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Data Loading and Preprocessing
df = pd.read_csv('/content/train.csv')

# Handling missing values
imputer = SimpleImputer(strategy='most_frequent')
df[['keyword', 'location']] = imputer.fit_transform(df[['keyword', 'location']])

stop_words = set(stopwords.words('english'))
stop_words.update(['rt', 'amp', 'via'])  # Add custom stopwords
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'[^a-zA-Z\s#]', '', text)  # Keep hashtags
        text = text.lower()
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return tokens
    else:
        return []

df['processed_text'] = df['text'].apply(preprocess_text)

# Add new features
df['text_length'] = df['text'].apply(len)
df['num_hashtags'] = df['text'].apply(lambda x: x.count('#'))
df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# 2. Word2Vec Embedding with Pre-trained Model
# Load pre-trained Word2Vec model (glove-twitter-25)
#model = Word2Vec(df['processed_text'].tolist(), vector_size=200, window=5, min_count=2)
#model.save("word2vec.model")
#model = Word2Vec.load("word2vec.model")
#model = api.load('word2vec-google-news-300')
model = load('glove-twitter-25', return_path=False)

# 3. Feature Extraction
def get_document_vector(tokens, model):
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

df['document_vector'] = df['processed_text'].apply(lambda tokens: get_document_vector(tokens, model))

# Combine new features with document vector
additional_features = df[['text_length', 'num_hashtags', 'sentiment']].to_numpy()
X_vectors = np.array(df['document_vector'].tolist())
X_combined = np.hstack((X_vectors, additional_features))

y = df['target']

# Addressing class imbalance with SMOTETomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_combined, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 4. Model Training with RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)[:, 1]  # For ROC-AUC

# Evaluate performance with multiple metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"ROC-AUC: {roc_auc}")

# 5. Predicting New Tweets
new_tweet = "Forest fire near La Ronge Sask. Canada"

# Preprocess the new tweet
processed_tweet = preprocess_text(new_tweet)
tweet_vector = get_document_vector(processed_tweet, model)

# Add additional features for the new tweet
text_length = len(new_tweet)
num_hashtags = new_tweet.count('#')
sentiment = TextBlob(new_tweet).sentiment.polarity
new_tweet_combined = np.hstack((tweet_vector, [text_length, num_hashtags, sentiment]))

# Make the prediction
prediction = classifier.predict([new_tweet_combined])[0]

# Print the prediction
if prediction == 1:
    print(f"Tweet: '{new_tweet}' is predicted as a real disaster.")
else:
    print(f"Tweet: '{new_tweet}' is predicted as a fake disaster.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.8206686930091185
Precision: 0.8402061855670103
Recall: 0.7922235722964763
F1-score: 0.815509693558474
ROC-AUC: 0.8868332283823055
Tweet: 'Forest fire near La Ronge Sask. Canada' is predicted as a real disaster.


1. Importing Libraries and Resources

In [12]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer  # For handling missing values
from imblearn.over_sampling import SMOTE  # For handling class imbalance

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

2. Data Loading and Preprocessing

In [13]:
# 1. Data Loading and Preprocessing
df = pd.read_csv('/content/train.csv')

# Handling missing values with imputation
imputer = SimpleImputer(strategy='most_frequent')  # Replace with your preferred strategy
df[['keyword', 'location']] = imputer.fit_transform(df[['keyword', 'location']])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphanumeric characters
        text = text.lower()
        tokens = nltk.word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Lemmatization
        return tokens
    else:
        return []

df['processed_text'] = df['text'].apply(preprocess_text)

3. Word2Vec Embedding with Hyperparameter Tuning

In [14]:
# 2. Word2Vec Embedding with Hyperparameter Tuning
corpus = df['processed_text'].tolist()

# Hyperparameter tuning for Word2Vec (example)
word2vec_params = {
    'vector_size': [100, 200, 300],
    'window': [5, 7, 9],
    'min_count': [1, 3, 5]
}

# Use GridSearchCV to find the best hyperparameters for Word2Vec
# ... (Code for GridSearchCV with Word2Vec) ...

# After tuning, create the Word2Vec model with the best parameters
model = Word2Vec(corpus, vector_size=200, window=7, min_count=3, workers=4, sg=1)  # Replace with best parameters

4. Feature Extraction

In [15]:
# 3. Feature Extraction
def get_document_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

df['document_vector'] = df['processed_text'].apply(lambda tokens: get_document_vector(tokens, model))

5. Model Training and Evaluation

In [16]:
# 4. Model Training and Evaluation
X = np.array(df['document_vector'].tolist())
y = df['target']

# Addressing class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning for RandomForestClassifier (example)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10]
}

# Use GridSearchCV to find the best hyperparameters for RandomForestClassifier
# ... (Code for GridSearchCV with RandomForestClassifier) ...

# After tuning, create the RandomForestClassifier model with the best parameters
classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Replace with best parameters
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Evaluate performance with multiple metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")



Accuracy: 0.7829591249280369
Precision: 0.8161209068010076
Recall: 0.7372013651877133
F1-score: 0.7746563060370592


6. Example Prediction

In [17]:
# Example new tweet for prediction:
new_tweet = "There's a huge fire near my house, and people are evacuating!"

# Preprocess the new tweet:
processed_tweet = preprocess_text(new_tweet)
tweet_vector = get_document_vector(processed_tweet, model)

# Make the prediction:
prediction = classifier.predict([tweet_vector])[0]  # Get the prediction (0 or 1)

# Print the prediction:
if prediction == 1:
    print(f"Tweet: '{new_tweet}' is predicted as a real disaster.")
else:
    print(f"Tweet: '{new_tweet}' is predicted as a fake disaster.")

Tweet: 'There's a huge fire near my house, and people are evacuating!' is predicted as a real disaster.
