In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhumu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bhumu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Load the dataset

In [2]:
dataset = pd.read_csv(r"C:\Users\bhumu\Downloads\nlp_project_dataset\train.csv")

In [3]:
dataset

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
df = dataset[['text','target']]

In [5]:
df

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [6]:
df["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
yes = df[df["target"] == 1]
no = df[df["target"] == 0]

In [8]:
yes.shape, no.shape

((3271, 2), (4342, 2))

In [9]:
no = no.sample(yes.shape[0])

In [10]:
yes.shape, no.shape

((3271, 2), (3271, 2))

In [11]:
dataset = no.append(yes, ignore_index = True)

  dataset = no.append(yes, ignore_index = True)


In [12]:
dataset

Unnamed: 0,text,target
0,INFANTRY Mens Lume Dial Army Analog Quartz Wri...,0
1,I PUT MY CHICKEN NUGGETS IN THE MICROWAVE FOR ...,0
2,@RockBottomRadFM As a kid I remember hearing r...,0
3,@phiddleface NOT IF THERES A CHOKING HAZARD!!!...,0
4,No way...I can't eat that shit,0
...,...,...
6537,Two giant cranes holding a bridge collapse int...,1
6538,@aria_ahrary @TheTawniest The out of control w...,1
6539,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
6540,Police investigating after an e-bike collided ...,1


In [13]:
dataset.shape

(6542, 2)

In [14]:
dataset['target'].value_counts()

0    3271
1    3271
Name: target, dtype: int64

In [15]:
lemma = WordNetLemmatizer()

#### Text Preprocessing

In [16]:
def text_preprocessing(text):
    re_punt = "[^A-Za-z\s]"
    text = re.sub(re_punt, "",text) #Removal of unwanted punctuations
    text = text.lower() #Normalising the case
    text = nlp(text) 
    
    tok_text = [] # Tokenization
    for token in text:
        tok_text.append(token.text)
        
    sw_list = stopwords.words("english") #Removal of stop words
    tok_text = [word for word in tok_text if word not in sw_list]
    
    
    lem_text = ' ' #Lemmatization
    for word in tok_text:
        lem_text += ' ' + lemma.lemmatize(word, pos='v')
    
    return lem_text


In [17]:
dataset["Text1"] = dataset["text"].apply(text_preprocessing)

In [18]:
dataset

Unnamed: 0,text,target,Text1
0,INFANTRY Mens Lume Dial Army Analog Quartz Wri...,0,infantry mens lume dial army analog quartz w...
1,I PUT MY CHICKEN NUGGETS IN THE MICROWAVE FOR ...,0,put chicken nuggets microwave minutes inte...
2,@RockBottomRadFM As a kid I remember hearing r...,0,rockbottomradfm kid remember hear rule d...
3,@phiddleface NOT IF THERES A CHOKING HAZARD!!!...,0,phiddleface choke hazard nt die get
4,No way...I can't eat that shit,0,wayi ca nt eat shit
...,...,...,...
6537,Two giant cranes holding a bridge collapse int...,1,two giant crane hold bridge collapse nearby ...
6538,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fire cal...
6539,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utckm volcano hawaii httptcozdtoydebj
6540,Police investigating after an e-bike collided ...,1,police investigate ebike collide car little ...


# <font size = 3> Vectorization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [20]:
X = tfidf.fit_transform(dataset["Text1"]).toarray()

In [21]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
X = pd.DataFrame(X, columns = tfidf.get_feature_names_out())

In [23]:
X

Unnamed: 0,aa,aaaaaaallll,aaaaaand,aaarrrgghhh,aal,aampb,aan,aannnnd,aar,aashiqui,...,zone,zonesthank,zonewolf,zotar,zouma,zourryart,zss,zurich,zxathetis,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
y = dataset['target']

In [25]:
y

0       0
1       0
2       0
3       0
4       0
       ..
6537    1
6538    1
6539    1
6540    1
6541    1
Name: target, Length: 6542, dtype: int64

#### Splitting the data into train-set and test-set

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# <font size = 3> Naive Bayes Classification

In [27]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[311 320]
 [117 561]]


0.666157372039725

# <font size = 3> Random Forest Classification

In [28]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[551  80]
 [227 451]]


0.7654698242933538

# <font size = 3> Preditions of test data

In [30]:
df2 = pd.read_csv(r"C:\Users\bhumu\Downloads\nlp_project_dataset\test.csv")

In [31]:
df2

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [32]:
# we do skip keyword, location as we didn't train model with these features
test_df = df2[['id','text']]

test_df

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Storm in RI worse than last hurricane. My city...
3260,10868,Green Line derailment in Chicago http://t.co/U...
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...


In [34]:
test_df["Test_Text1"] = test_df["text"].apply(text_preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["Test_Text1"] = test_df["text"].apply(text_preprocessing)


In [35]:
test_df

Unnamed: 0,id,text,Test_Text1
0,0,Just happened a terrible car crash,happen terrible car crash
1,2,"Heard about #earthquake is different cities, s...",hear earthquake different cities stay safe e...
2,3,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese flee across stre...
3,9,Apocalypse lighting. #Spokane #wildfires,apocalypse light spokane wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles safety faste...
3259,10865,Storm in RI worse than last hurricane. My city...,storm ri worse last hurricane cityampothers ...
3260,10868,Green Line derailment in Chicago http://t.co/U...,green line derailment chicago httptcoutbxlcbiuy
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issue hazardous weather outlook hwo http...


In [36]:
X_1 = tfidf.transform(test_df["Test_Text1"])

y_pred_1 = rfc.predict(X_1)

y_pred_1



array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [37]:
final_sub = pd.DataFrame({'id':test_df['id'], 'target':y_pred_1})

final_sub.to_csv(r"C:\Users\bhumu\Downloads\nlp_project_dataset\fourth_submission.csv", index = False)