In [None]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


import nltk

import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')

import regex as re

from sklearn.preprocessing import LabelEncoder

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords


import spacy
from nltk.stem import WordNetLemmatizer



# Scipy
import scipy


# Train-test split and cross validation
from sklearn.model_selection import train_test_split, ParameterGrid

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB        
# from sklearn.naive_bayes import MultinomialNB     
from sklearn.naive_bayes import BernoulliNB     
from sklearn.naive_bayes import CategoricalNB  

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


## Problem Statement

***Data Gathering***

In [None]:
dfsample=pd.read_csv('/content/drive/MyDrive/BW_projects/sample_submission.csv')
dfsample

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training CSV from the mounted Drive folder and preview the first rows.
dftrain=pd.read_csv('/content/drive/MyDrive/BW_projects/train.csv')
dftrain.head()


In [None]:
# Load the test CSV from the same Drive folder; the bare name displays the frame.
dftest=pd.read_csv('/content/drive/MyDrive/BW_projects/test.csv')
dftest

## Data Cleaning

In [None]:
# Column names, dtypes and non-null counts of the training frame.
dftrain.info()

In [None]:
# First two rows of the training frame.
dftrain.head(2)

In [None]:
# Five randomly chosen rows (no seed is set, so the rows differ between runs).
dftrain.sample(5)

In [None]:
# Missing values: percentage of null entries per column, shown as a bar chart
null_counts = dftrain.isnull().sum()
null_percentage = null_counts.div(len(dftrain)).mul(100)
sns.barplot(x=null_percentage.index, y=null_percentage)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Replace missing keyword / location entries with the placeholder 'unknown'.
for column_name in ('keyword', 'location'):
    dftrain[column_name] = dftrain[column_name].fillna('unknown')

In [None]:
# Null counts for the two columns that were just filled.
print(dftrain[['keyword', 'location']].isnull().sum())

In [None]:
# Same missing-value chart as before, recomputed from the current state of dftrain
plt.figure(figsize=(5, 4))
null_counts = dftrain.isnull().sum()
null_percentage = null_counts.div(len(dftrain)).mul(100)
sns.barplot(x=null_percentage.index, y=null_percentage)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# duplicated() is called without a subset, so all columns are compared.
dftrain.duplicated().sum()

In [None]:
# (rows, columns) of the training frame.
dftrain.shape

In [None]:
# Null counts per column in the test frame.
dftest.isnull().sum()

## **EDA**

dftrain['target'].value_counts()

In [None]:
# Class balance of the target column.
# value_counts() orders its result by frequency, not by class value, so the
# counts are reindexed to a fixed [1, 0] order. That guarantees the first
# slice (the exploded one) really is class 1 = 'disaster' and the labels
# below always line up with the data.
class_counts = dftrain['target'].value_counts().reindex([1, 0], fill_value=0)
plt.pie(class_counts, labels=['disaster','not_disaster'], autopct="%0.2f", colors=['y', 'c'], explode=[0.3,0])
plt.show()

In [None]:
# Tweet length in characters, one histogram per class
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'red', 'disaster tweets'),
    (ax2, 0, 'green', 'Not disaster tweets'),
)
for axis, class_value, colour, heading in panels:
    tweet_len = dftrain[dftrain['target'] == class_value]['text'].str.len()
    axis.hist(tweet_len, color=colour)
    axis.set_title(heading)
fig.suptitle('Characters in tweets')
plt.show()

In [None]:
# Download the 'punkt_tab' tokenizer data.
# NOTE(review): presumably required by nltk.word_tokenize in the next cell on
# recent NLTK releases — confirm against the installed version.
import nltk
nltk.download('punkt_tab')

In [None]:
# Number of word tokens per tweet
dftrain['words_count'] = [len(nltk.word_tokenize(tweet)) for tweet in dftrain['text']]

# Each tweet's share (in %) of all word tokens in the training set
total_words = dftrain['words_count'].sum()
dftrain['words_percentage'] = dftrain['words_count'].div(total_words).mul(100)

# Per-class views of that share: disaster (target 1) and non-disaster (target 0)
dftraindisa = dftrain.loc[dftrain['target'] == 1, 'words_percentage']
dftrainnondisa = dftrain.loc[dftrain['target'] == 0, 'words_percentage']

In [None]:
# Violin plots of words_percentage, one panel per class, with the mean marked
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, shares, heading in (
    (ax1, dftraindisa, 'disaster tweets'),
    (ax2, dftrainnondisa, 'Not disaster tweets'),
):
    axis.violinplot(shares, showmeans=True)
    axis.set_title(heading)
fig.suptitle('words in tweets')
plt.show()

## Text Processing

In [None]:
import nltk
# Fetch NLTK's stopword corpus so stopwords.words(...) can be used in later cells.
nltk.download('stopwords')
from nltk.corpus import stopwords
# To inspect the English list: stopwords.words('english')

In [None]:
import string

In [None]:
# Lowercasing
def convert_to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

text = "This is a FUNCTION that CoNvErTs a Text to lowercase"
print("Input: {}".format(text))
print("Output: {}".format(convert_to_lowercase(text)))

In [None]:
# Trimming surrounding whitespace
def remove_whitespace(text):
    """Return ``text`` without leading and trailing whitespace.

    Whitespace inside the string is left untouched.
    """
    trimmed = text.strip()
    return trimmed

text = " \t This is a string \t "
print("Input: {}".format(text))
print("Output: {}".format(remove_whitespace(text)))

In [None]:
# Removing punctuations
# Translation table built once instead of on every call: it deletes every
# character of string.punctuation except the apostrophe, which is kept so
# contractions such as "here's" stay intact.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation.replace("'", ""))

def remove_punctuation(text):
    """Return ``text`` with ASCII punctuation removed, apostrophes preserved.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Copy of ``text`` without the characters in ``string.punctuation``
        (other than ``'``).
    """
    return text.translate(_PUNCT_TABLE)

text = "Here's [an] example? {of} &a string. with.? punctuations!!!!"
print("Input: {}".format(text))
print("Output: {}".format(remove_punctuation(text)))

In [None]:
# Stripping HTML tags
def remove_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed separately and the text
    between tags is kept.
    """
    return re.sub(r'<.*?>', r'', text)

text = '<a href = "https://www.online.com"> tweet Classification </a>'
print("Input: {}".format(text))
print("Output: {}".format(remove_html(text)))


In [None]:
# Removing URLs
def remove_http(text):
    """Return ``text`` with web links removed.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Copy of ``text`` with every matched link replaced by an empty string.
    """
    # Raw string on purpose: in a normal literal "\S" and "\." are invalid
    # escape sequences, which Python warns about.
    url_pattern = r"(https?://\S+|www\.\S+)"
    return re.sub(url_pattern, "", text)

text = "It's a function that removes links starting with http: or https such as https://en.wikipedia.org/wiki/Unicode_symbols"
print("Input: {}".format(text))
print("Output: {}".format(remove_http(text)))

In [None]:
# Stopword list: NLTK's English stopwords extended with a few extra words
stops = stopwords.words("english")
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"]
allstops = [*stops, *addstops]

print(allstops)

In [None]:
# Function to remove stopwords from a text
# The tokenizer is created in this cell, ahead of its first use: the demo call
# below runs remove_stopwords immediately, and on a fresh kernel the name
# `regexp` would otherwise be undefined at that point. A later cell assigns
# the same tokenizer again, which is harmless.
regexp = RegexpTokenizer(r'\w+')

def remove_stopwords(text):
    """Return ``text`` with stopwords removed.

    The text is split into runs of word characters with ``regexp``; tokens
    found in ``allstops`` are dropped and the remaining tokens are joined with
    single spaces. Matching is exact (case-sensitive), so "This" is kept even
    if "this" is a stopword.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Space-joined tokens that are not in ``allstops``.
    """
    # allstops is read at call time; a set gives constant-time membership tests.
    stop_lookup = set(allstops)
    return " ".join(word for word in regexp.tokenize(text) if word not in stop_lookup)

text = "This is a function that removes stopwords in a given text"
print("Input: {}".format(text))
print("Output: {}".format(remove_stopwords(text)))

In [None]:
# Lemmatization with spaCy
# The pipeline is loaded once with its 'parser' and 'ner' components disabled.
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])

def text_lemmatizer(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The lemmas are joined with single spaces.
    """
    doc = spacy_lemmatizer(text)
    lemmas = (token.lemma_ for token in doc)
    return " ".join(lemmas)

text = "Introducing lemmatization as an improvement over stemming"
print("Input: {}".format(text))
print("Output: {}".format(text_lemmatizer(text)))

In [None]:
# Tokenizer that splits text into runs of word characters (pattern \w+).
regexp = RegexpTokenizer(r'\w+')

In [None]:
# Clean every tweet once, then derive both the lemmatized text and the token
# list from that single result.
#
# Order matters: HTML tags and URLs are removed BEFORE punctuation.
# remove_punctuation deletes "<", ">", ":", "/" and ".", after which
# remove_html / remove_http can no longer recognise a tag or a link and
# fragments such as "httptco..." would survive as tokens.
cleaned_text = (dftrain["text"].apply(convert_to_lowercase)
                               .apply(remove_html)
                               .apply(remove_http)
                               .apply(remove_punctuation)
                               .apply(remove_whitespace)
                               .apply(remove_stopwords)
                               )

# Lemmatized, space-joined text.
dftrain['transformed_text'] = cleaned_text.apply(text_lemmatizer)

# Token lists (not lemmatized), split with the \w+ tokenizer.
dftrain['tokens'] = cleaned_text.apply(regexp.tokenize)


In [None]:
# Check the newly added transformed_text and tokens columns.
dftrain.head(2)

### Word2Vec / Vectorisation

In [None]:
from wordcloud import WordCloud
# Word-cloud settings: 500x500 canvas, black background, minimum font size 10.
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='black')

In [None]:
# Word cloud of the disaster tweets, built on its own WordCloud instance.
# generate() returns the object it was called on, so calling wc.generate(...)
# for both classes would make Disaster_wc and Nondisaster_wc the same object,
# holding whichever class was generated last.
Disaster_wc = WordCloud(width=500,height=500,min_font_size=10,background_color='black').generate(
    dftrain[dftrain['target'] == 1]['transformed_text'].str.cat(sep=" ")
)

In [None]:
# Render the disaster word cloud.
plt.figure(figsize=(15,6))
plt.imshow(Disaster_wc)

In [None]:
# Word cloud of the non-disaster tweets, built on its own WordCloud instance.
# generate() returns the object it was called on, so reusing the shared `wc`
# for both classes would leave Disaster_wc and Nondisaster_wc pointing at the
# same object.
Nondisaster_wc = WordCloud(width=500,height=500,min_font_size=10,background_color='black').generate(
    dftrain[dftrain['target'] == 0]['transformed_text'].str.cat(sep=" ")
)

In [None]:
# Render the non-disaster word cloud.
plt.figure(figsize=(15,6))
plt.imshow(Nondisaster_wc)

In [None]:
# Flat list of every token from the disaster tweets (duplicates kept)
Disaster_corpus = [
    word
    for msg in dftrain[dftrain['target'] == 1]['tokens']
    for word in msg
]

In [None]:
# Total number of tokens collected from the disaster tweets.
len(Disaster_corpus)

In [None]:
from collections import Counter

# 30 most frequent disaster tokens: column 0 holds the token, column 1 its count
top_disaster = pd.DataFrame(Counter(Disaster_corpus).most_common(30))
comDisa = top_disaster[0]
comDisa_count = top_disaster[1]

sns.barplot(x=comDisa, y=comDisa_count)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flat list of every token from the non-disaster tweets (duplicates kept)
NonDisaster_corpus = [
    word
    for msg in dftrain[dftrain['target'] == 0]['tokens']
    for word in msg
]

In [None]:
# Total number of tokens collected from the non-disaster tweets.
len(NonDisaster_corpus)

In [None]:
# 30 most frequent non-disaster tokens: column 0 holds the token, column 1 its count
top_nondisaster = pd.DataFrame(Counter(NonDisaster_corpus).most_common(30))
comnon = top_nondisaster[0]
comnon_count = top_nondisaster[1]

sns.barplot(x=comnon, y=comnon_count)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Look at the frame once more before it is reduced to the modelling columns.
dftrain.head(2)

In [None]:
# Keep only the columns needed for modelling.
# .copy() makes this an independent frame: a later cell adds a
# 'sentence_vector' column, and assigning into a bare column slice is what
# triggers pandas' SettingWithCopyWarning.
dftrain=dftrain[['tokens','target']].copy()
dftrain.head(2)

In [None]:
# Text Vectorization
# using word2vec
from gensim.models import Word2Vec

In [None]:
# Train Word2Vec embeddings on the tokenized tweets.
# Reproducibility: gensim documents that a deterministic run needs a fixed
# seed AND a single worker thread, because thread scheduling changes the
# order of updates. Full reproducibility across interpreter launches also
# requires PYTHONHASHSEED to be set.
# NOTE(review): workers=1 trades training speed for repeatable results; this
# assumes the corpus is small enough for that to be acceptable — confirm.
W2V_SEED = 42
w2v = Word2Vec(
    sentences=dftrain['tokens'].tolist(),  # Tokenized sentences as a plain list of lists
    vector_size=200,         # Size of the embedding vectors
    window=5,                # Context window size
    min_count=1,             # Minimum frequency of words
    seed=W2V_SEED,           # Fixed seed for the initial vectors / sampling
    workers=1,               # Single thread so repeated runs give the same vectors
)


In [None]:
# Display the trained model's summary representation.
w2v

In [None]:
# 2. Train Word2Vec model
# model = Word2Vec(sentences, vector_size=200, window=5, min_count=1,sg=0, workers=4)

# 3. Sentence vectors (document embeddings)
def get_sentence_vector(tokens, model):
    """Average the embeddings of the tokens that ``model`` knows.

    Tokens that are not in ``model.wv`` are ignored. When none of the tokens
    is known (or ``tokens`` is empty) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per tweet, computed from its token list with the trained w2v model.
dftrain['sentence_vector'] = dftrain['tokens'].apply(lambda x: get_sentence_vector(x, w2v))

In [None]:
# 4. Feature matrix and labels for classification
# Stack the per-tweet vectors into one 2-D array (one row per tweet).
X = np.stack(dftrain['sentence_vector'].to_list())
# Labels taken from the 'target' column.
y = dftrain['target'].to_numpy()

In [None]:
# 5. Split data into training and testing sets (70 % / 30 %)
# The seed is written as an explicit integer. The previous value was the
# boolean True, which only worked because bool is an int subclass and is
# treated as 1; spelling out 1 keeps the same split and states the intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# 6. Train a classifier (Logistic Regression with default settings)
# NOTE(review): warnings are silenced globally at the top of the notebook, so a
# convergence warning from this fit would presumably not be shown — verify.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [None]:
# 7. Predict labels for the held-out test set with the logistic-regression model
y_pred = classifier.predict(X_test)

In [None]:
# 8. Evaluate the logistic-regression predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print accuracy, the confusion matrix and the classification report.
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

In [None]:
# Random Forest Classifier, trained and evaluated on the same split as the logistic regression
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees, fixed seed
rf_classifier.fit(X_train, y_train)  # Train on the training split
y_pred_rf = rf_classifier.predict(X_test)  # Predict the held-out split


In [None]:
# Evaluate Random Forest: accuracy, confusion matrix and classification report on the test split
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

### NAIVE BAYES 

In [None]:
# Gaussian Naive Bayes on the same train/test split
nb_classifier = GaussianNB()  # Default settings
nb_classifier.fit(X_train, y_train)  # Train on the training split
y_pred_nb = nb_classifier.predict(X_test)  # Predict the held-out split

# Evaluate: accuracy, confusion matrix and classification report on the test split
print("\nNaive Bayes Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))


In [None]:
# Bernoulli Naive Bayes on the same train/test split
# NOTE(review): BernoulliNB binarizes its inputs at the `binarize` threshold
# (0.0 by default per the scikit-learn docs), so each embedding dimension is
# presumably reduced to "above zero or not" here — confirm this is intended.
bnb_classifier = BernoulliNB()  # Default settings
bnb_classifier.fit(X_train, y_train)  # Train on the training split
y_pred_bnb = bnb_classifier.predict(X_test)  # Predict the held-out split

# Evaluate: accuracy and confusion matrix on the test split
print("\nBernoulli Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))