In [137]:
# handling file data
import pandas as pd
# handling numerical data
import numpy as np
# for ploting/visualisation
import matplotlib.pyplot as plt
import seaborn as sns 

# importing Natural Language Toolkit
import nltk

# data pre-processing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# transform data
from sklearn.feature_extraction.text import TfidfVectorizer
# split into train and test
from sklearn.model_selection import train_test_split
# model building
from sklearn.linear_model import LogisticRegression
# 
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB


In [138]:
train = pd.read_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/train.csv")
test = pd.read_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/test.csv")
submission = pd.read_csv("//Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/sample_sub.csv")

# merging datasets 
data=pd.concat([train, test], axis=0)
data.shape

(10876, 5)

In [139]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [140]:
data['length'] = data['text'].apply(lambda x: len(x))
print("Mean length of text when Disaster:",data[data['target']==1].length.mean())
print("Mean length of text when No Disaster:",data[data['target']==0].length.mean())

Mean length of text when Disaster: 108.11342097217977
Mean length of text when No Disaster: 95.70681713496084


In [141]:
# lower case
data['text']=data['text'].apply(lambda x: x.lower())

In [142]:
# filter out the words that starts with #
data["#word"] = data['text'].apply(lambda x: list(word for word in x.split() if word[0]=='#'))

# count # words in the text column
data['number_of_#words']=data['#word'].apply(lambda x: len(x))

In [143]:
# check averange numbr of #words in tweet with disaster and not disaster
print("Average number of #words in tweeets from Disaster",data[data['target']==1]['number_of_#words'].mean())
print("Average number of #words in tweeets from No Disaster",data[data['target']==0]['number_of_#words'].mean())

Average number of #words in tweeets from Disaster 0.5090186487312749
Average number of #words in tweeets from No Disaster 0.38576692768309534


In [144]:
data.head(10)

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2
5,8,,,#rockyfire update => california hwy. 20 closed...,1.0,110,"[#rockyfire, #cafire, #wildfires]",3
6,10,,,#flood #disaster heavy rain causes flash flood...,1.0,95,"[#flood, #disaster]",2
7,13,,,i'm on top of the hill and i can see a fire in...,1.0,59,[],0
8,14,,,there's an emergency evacuation happening now ...,1.0,79,[],0
9,15,,,i'm afraid that the tornado is coming to our a...,1.0,52,[],0


In [145]:
# tokenisation


from nltk.tokenize import word_tokenize


data['processed_text'] = data['text'].apply(lambda x : " ".join(word_tokenize(x)))

data.head()

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words,processed_text
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1,our deeds are the reason of this # earthquake ...
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0,forest fire near la ronge sask . canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0,all residents asked to 'shelter in place ' are...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1,"13,000 people receive # wildfires evacuation o..."
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2,just got sent this photo from ruby # alaska as...


In [146]:
# removing stopwords
stop= stopwords.words('english')

data['processed_text'] = data['processed_text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))

data.head()

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words,processed_text
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1,deeds reason # earthquake may allah forgive us
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0,forest fire near la ronge sask . canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0,residents asked 'shelter place ' notified offi...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1,"13,000 people receive # wildfires evacuation o..."
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2,got sent photo ruby # alaska smoke # wildfires...


In [147]:
# lemmatization

lem = WordNetLemmatizer()

#data['processed_text'].apply(lambda x: lem.lemmatize(x))

data['processed_text'] = data['processed_text'].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
data.head()

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words,processed_text
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1,deed reason # earthquake may allah forgive u
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0,forest fire near la ronge sask . canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0,resident asked 'shelter place ' notified offic...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1,"13,000 people receive # wildfire evacuation or..."
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2,got sent photo ruby # alaska smoke # wildfire ...


from nltk.stem import PorterStemmer

stem = PorterStemmer()

data['text'].apply(lambda x: " ".join([stem.stem(word) for word in x.split()]))

In [148]:
def remove_special_characters(text):
    # Define a list of special characters to remove
    special_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"

    # Remove special characters using str.translate() method
    cleaned_text = text.translate(str.maketrans('', '', special_chars))
    return cleaned_text

data['processed_text']=data['processed_text'].apply(lambda x: remove_special_characters(x))

data.head(10)

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words,processed_text
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1,deed reason earthquake may allah forgive u
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0,forest fire near la ronge sask canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0,resident asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1,13000 people receive wildfire evacuation orde...
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2,got sent photo ruby alaska smoke wildfire po...
5,8,,,#rockyfire update => california hwy. 20 closed...,1.0,110,"[#rockyfire, #cafire, #wildfires]",3,rockyfire update california hwy 20 closed ...
6,10,,,#flood #disaster heavy rain causes flash flood...,1.0,95,"[#flood, #disaster]",2,flood disaster heavy rain cause flash floodi...
7,13,,,i'm on top of the hill and i can see a fire in...,1.0,59,[],0,m top hill see fire wood
8,14,,,there's an emergency evacuation happening now ...,1.0,79,[],0,s emergency evacuation happening building acro...
9,15,,,i'm afraid that the tornado is coming to our a...,1.0,52,[],0,m afraid tornado coming area


In [149]:
# negation handling

from nltk.sentiment.util import mark_negation


data['processed_text']=data['processed_text'].apply(lambda x: ''.join(mark_negation(x)))
data.head(20)

Unnamed: 0,id,keyword,location,text,target,length,#word,number_of_#words,processed_text
0,1,,,our deeds are the reason of this #earthquake m...,1.0,69,[#earthquake],1,deed reason earthquake may allah forgive u
1,4,,,forest fire near la ronge sask. canada,1.0,38,[],0,forest fire near la ronge sask canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,133,[],0,resident asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,65,[#wildfires],1,13000 people receive wildfire evacuation orde...
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,88,"[#alaska, #wildfires]",2,got sent photo ruby alaska smoke wildfire po...
5,8,,,#rockyfire update => california hwy. 20 closed...,1.0,110,"[#rockyfire, #cafire, #wildfires]",3,rockyfire update california hwy 20 closed ...
6,10,,,#flood #disaster heavy rain causes flash flood...,1.0,95,"[#flood, #disaster]",2,flood disaster heavy rain cause flash floodi...
7,13,,,i'm on top of the hill and i can see a fire in...,1.0,59,[],0,m top hill see fire wood
8,14,,,there's an emergency evacuation happening now ...,1.0,79,[],0,s emergency evacuation happening building acro...
9,15,,,i'm afraid that the tornado is coming to our a...,1.0,52,[],0,m afraid tornado coming area


# All till now in one function :P

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a single string
    cleaned_text = ' '.join(tokens)

    return cleaned_text



# Apply text cleaning
data['processed_text'] = data['text'].apply(clean_text)

# Display the DataFrame with cleaned text
data.head(10)


from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    corrected_words = []
    words = text.split()

    # Correct spelling of each word
    for word in words:
        corrected_word = spell.correction(word)
        corrected_words.append(corrected_word)

    corrected_text = ' '.join(corrected_words)
    return corrected_text

In [150]:
import re


def remove_repeated_characters(text):
    # Remove repeated characters using regex
    corrected_text = re.sub(r'(.)\1+', r'\1', text)
    return corrected_text

data['processed_text'].apply(remove_repeated_characters).head(20)

# not using as it removed even if a letter occurs twice contiuously in a word. 

0              ded reason earthquake may alah forgive u
1                 forest fire near la ronge sask canada
2     resident asked shelter place notified oficer e...
3     130 people receive wildfire evacuation order c...
4     got sent photo ruby alaska smoke wildfire pour...
5      rockyfire update california hwy 20 closed dir...
6      flod disaster heavy rain cause flash floding ...
7                                m top hil se fire wod 
8     s emergency evacuation hapening building acros...
9                         m afraid tornado coming area 
10                       thre people died heat wave far
11    haha south tampa geting floded hah wait second...
12     raining floding florida tampabay tampa 18 19 ...
13                        flod bago myanmar arived bago
14         damage schol bus 80 multi car crash breaking
15                                               s man 
16                                           love fruit
17                                         sumer

In [151]:
def remove_urls(text):
    pattern = r'http\S+'  # Regex pattern to match URLs
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

data['processed_text']=data['processed_text'].apply(remove_urls)

In [152]:
# split test and train

i = train.shape[0]
train = data[:i]
test = data[i:]

x=train['processed_text']
y=train['target']

In [153]:
from sklearn.model_selection import train_test_split

x_train,  x_val, y_train, y_val = train_test_split(x,y, test_size=0.2, random_state=42)

In [154]:
x_val.head()

2644           new weapon cause unimaginable destruction 
2227    f   amp   ing thing  gishwhes got soaked delug...
5448    dt  georgegalloway  rt  galloway4mayor  ûïthe...
132     aftershock back school kick great  want thank ...
6845    response trauma child addict develop defensive...
Name: processed_text, dtype: object

# text representation 

In [155]:
from sklearn.feature_extraction.text import CountVectorizer

def transformation(fe):
    print("Feature Extraction:", fe)
    fe.fit(x_train)

    x_train_fe = fe.transform(x_train).toarray()
    x_val_fe = fe.transform(x_val).toarray()

    test_fe = fe.transform(test['processed_text']).toarray()
    
    return (x_train_fe, x_val_fe, test_fe)

maxf = 3000
cv = CountVectorizer(max_features=maxf)
tfidf = TfidfVectorizer(max_features=maxf)

x_train_fe, x_val_fe, test_fe = transformation(cv)

Feature Extraction: CountVectorizer(max_features=3000)


In [156]:
test_fe

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Dimensionality Reduction Techniques:

In [157]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

def reduce_dimension(technique):
    print("Feature Reduction:", technique)
    technique.fit(x_train_fe)
    
    x_train_fe_r = technique.transform(x_train_fe)
    x_val_fe_r = technique.transform(x_val_fe)

    test_fe_r = technique.transform(test_fe)

    return( x_train_fe_r, x_val_fe_r, test_fe_r)



n=2
svd = TruncatedSVD(n_components=n)  # Specify the number of components to keep
pca = PCA(n_components=n)

x_train_fe_r, x_val_fe_r, test_fe_r = reduce_dimension(pca)


Feature Reduction: PCA(n_components=2)


# Model selection

In [158]:
# importing models

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm


In [159]:
x_train_fe

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [160]:
def modeling(model):
    print("Model: ",model)
    print()
    try:
        # Code to be executed
        model.fit(x_train_fe, y_train)

        y_train_pred = model.predict(x_train_fe)

        y_pred = model.predict(x_val_fe)

        test_pred = model.predict(test_fe)
        print("no feature reduction")

    except ValueError as ve:
        # Code to handle the ValueError exception
        print("ValueError!")
        model.fit(x_train_fe_r, y_train)

        y_train_pred = model.predict(x_train_fe_r)

        y_pred = model.predict(x_val_fe_r)

        test_pred = model.predict(test_fe_r)

    else:
        # Code to be executed if no exception occurs
        print("No exception occurred.")

    finally:
        # Code to be executed regardless of whether an exception occurred or not 
        # print("Classification Report for train set:")
        # print(classification_report(y_train_pred, y_train))

        print("Classification Report for Validation set:")
        print(classification_report(y_pred, y_val))

        # print("confusion matrix")
        # print(confusion_matrix(y_pred, y_val))

        print("\nvalue_counts of validation set predictions:\n", pd.DataFrame(y_pred).value_counts())

        print("\nvalue_counts of test predictions:\n", pd.DataFrame(test_pred).value_counts())

    return test_pred



instance_lr = LogisticRegression()
instance_dtc = DecisionTreeClassifier()
instance_rfc = RandomForestClassifier()
instance_svm = svm.SVC()
instance_mnb = MultinomialNB()


submission['target']= modeling(instance_lr)

Model:  LogisticRegression()

no feature reduction
No exception occurred.
Classification Report for Validation set:
              precision    recall  f1-score   support

         0.0       0.86      0.80      0.83       940
         1.0       0.71      0.79      0.75       583

    accuracy                           0.80      1523
   macro avg       0.79      0.80      0.79      1523
weighted avg       0.80      0.80      0.80      1523


value_counts of validation set predictions:
 0.0    940
1.0    583
dtype: int64

value_counts of test predictions:
 0.0    2080
1.0    1183
dtype: int64


In [161]:
# Features extraction
maxf = 3000
cv = CountVectorizer(max_features=maxf)
tfidf = TfidfVectorizer(max_features=maxf)
x_train_fe, x_val_fe, test_fe = transformation(tfidf)

# features reduction
n=7
svd = TruncatedSVD(n_components=n)  # Specify the number of components to keep
pca = PCA(n_components=n)

x_train_fe_r, x_val_fe_r, test_fe_r = reduce_dimension(pca)

# model train and prediction
instance_lr = LogisticRegression()
instance_dtc = DecisionTreeClassifier()
instance_rfc = RandomForestClassifier(n_estimators=500, random_state=42)
instance_svm = svm.SVC()
instance_mnb = MultinomialNB()


submission['target']= modeling(instance_lr)

Feature Extraction: TfidfVectorizer(max_features=3000)
Feature Reduction: PCA(n_components=7)
Model:  LogisticRegression()

no feature reduction
No exception occurred.
Classification Report for Validation set:
              precision    recall  f1-score   support

         0.0       0.88      0.79      0.84       972
         1.0       0.69      0.81      0.75       551

    accuracy                           0.80      1523
   macro avg       0.79      0.80      0.79      1523
weighted avg       0.81      0.80      0.80      1523


value_counts of validation set predictions:
 0.0    972
1.0    551
dtype: int64

value_counts of test predictions:
 0.0    2118
1.0    1145
dtype: int64


In [162]:
submission['target'] = submission['target'].apply(lambda x: round(x))
#submission.to_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/subission file/submission_7.csv", index=False)
print(submission.target.value_counts())
submission.head()

0    2118
1    1145
Name: target, dtype: int64


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

-# Create a pipeline with TfidfVectorizer, PCA, and RandomForestClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD()),
    ('classifier', RandomForestClassifier())
])

-# Define the parameter grid for grid search
parameters = {
    'svd__n_components': range(2, 101, 5),         # Values for n_components in TruncatedSVD
    'tfidf__max_features': range(500, 1001, 100),  # Values for max_features in TfidfVectorizer
    'classifier__n_estimators': [100, 200, 300]    # Values for n_estimators in RandomForestClassifier
}

-# Create an instance of GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, cv=3)

-# Perform grid search on the corpus
grid_search.fit(x_train, y_train)

-# Get the best parameters and the corresponding score
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_parameters)
print("Best Score:", best_score)


In [174]:
# split test and train

i = train.shape[0]
train = data[:i]
test = data[i:]

from sklearn.metrics import accuracy_score

# Split the data into features (text and length) and target
X = train[['processed_text', 'length','number_of_#words']]
y = train['target']
test = test[['processed_text','length','number_of_#words']]

# Convert text to numerical representation using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000)
vectorizer.fit(X['processed_text'])
X_text = vectorizer.transform(X['processed_text'])
test_text = vectorizer.transform(test['processed_text'])

# Combine text and length features
X_combined = pd.concat([pd.DataFrame(X_text.toarray()), X[['length','number_of_#words']]], axis=1)
test_combined = pd.concat([pd.DataFrame(test_text.toarray()), test[['length','number_of_#words']]], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Build and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)
submission['target']=model.predict(test_combined)

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)





Accuracy: 0.8076165462902167


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [164]:
submission['target']=model.predict(test_combined)



In [165]:
submission['target'] = submission['target'].apply(lambda x: round(x))
#submission.to_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/subission file/submission_9.csv", index=False)
print(submission.target.value_counts())
submission.head()

0    2087
1    1176
Name: target, dtype: int64


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
