In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
from nltk import word_tokenize,sent_tokenize
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [2]:
# Read the tab-separated training and test tweet files into DataFrames.
df = pd.read_csv('train_disaster.txt', sep='\t')
df_1 = pd.read_csv('test_disaster.txt', sep='\t')

In [3]:
# Preview the first ten training rows.
df.head(n=10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [13]:
def clean_tweets(texts, stemmer, stop_words):
    """Normalise an iterable of raw tweet strings for vectorisation.

    Each text has every character other than ASCII letters, digits and
    spaces removed, is lower-cased, split on whitespace, filtered against
    ``stop_words`` and stemmed with ``stemmer``; the surviving stems are
    re-joined with single spaces.

    Parameters
    ----------
    texts : iterable of str
        Raw tweet texts (for example a DataFrame column).
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    stop_words : container of str
        Lower-case words to drop before stemming; a ``set`` keeps the
        membership test constant-time.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    cleaned = []
    for text in texts:
        tokens = re.sub('[^A-Za-z0-9 ]', '', text).lower().split()
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
        cleaned.append(' '.join(stems))
    return cleaned


# Build the stemmer and the stop-word set once instead of once per row / per
# token: the original rebuilt set(stopwords.words('english')) for every word.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Iterate over the columns themselves so the row counts are no longer
# hard-coded (previously range(0, 7613) and range(0, 3263)).
corpus = clean_tweets(df['text'], ps, english_stopwords)
corpus_1 = clean_tweets(df_1['text'], ps, english_stopwords)


In [55]:
# Features are the cleaned training tweets, labels come from the 'target'
# column, and the cleaned test tweets are kept aside for the final predictions.
X, y, X_test = corpus, df['target'], corpus_1

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# Hold out 10% of the labelled tweets as a validation set (fixed seed).
X_train, X_tes, y_train, y_tes = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [59]:
# TF-IDF over word uni-, bi- and tri-grams that occur in at least 3 documents.
# The three switches are passed as real booleans: they were previously given
# as the integer 1, which recent scikit-learn parameter validation rejects.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=None,
)
# The vocabulary and IDF weights are learned from the full training corpus
# plus the unlabelled test corpus.
# NOTE(review): this also covers the rows later held out for validation, so
# the validation scores are not from a strictly unseen vocabulary - confirm
# this is intended.
vectorizer.fit(list(corpus) + list(corpus_1))
print('vocab length', len(vectorizer.vocabulary_))


vocab length 10583


In [62]:
# Vectorise the training texts. toarray() yields a plain ndarray; todense()
# returned np.matrix, which recent scikit-learn estimators refuse as input.
# NOTE: this rebinds X_train from texts to features, so the cell must not be
# re-run without re-running the train/validation split first.
X_train = vectorizer.transform(X_train).toarray()

In [63]:
# Vectorise the validation texts. toarray() yields a plain ndarray; todense()
# returned np.matrix, which recent scikit-learn estimators refuse as input.
# NOTE: this rebinds X_tes from texts to features, so the cell must not be
# re-run without re-running the train/validation split first.
X_tes = vectorizer.transform(X_tes).toarray()

In [64]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, trained on the TF-IDF
# features; fit() returns the estimator so construction and training chain.
MNB = MultinomialNB().fit(X_train, y_train)
# Predicted labels for the validation split.
MNB_pred = MNB.predict(X_tes)



In [65]:
from sklearn.metrics import confusion_matrix,classification_report
# Validation confusion matrix followed by the per-class report for naive Bayes.
mnb_confusion = confusion_matrix(y_tes, MNB_pred)
mnb_report = classification_report(y_tes, MNB_pred)
print(mnb_confusion, mnb_report, sep='\n')


[[390  36]
 [138 198]]
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       426
           1       0.85      0.59      0.69       336

    accuracy                           0.77       762
   macro avg       0.79      0.75      0.76       762
weighted avg       0.79      0.77      0.76       762



In [66]:
from sklearn.ensemble import RandomForestClassifier

In [67]:
# Random forest of 100 unbounded-depth trees, using all cores, seeded for
# repeatability; fit() returns the estimator so the calls are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    n_jobs=-1,
    verbose=0,
    random_state=0,
).fit(X_train, y_train)
# Predicted labels for the validation split.
rf_pred = rf.predict(X_tes)



In [68]:
from sklearn.metrics import confusion_matrix,classification_report
# Validation confusion matrix followed by the per-class report for the forest.
rf_confusion = confusion_matrix(y_tes, rf_pred)
rf_report = classification_report(y_tes, rf_pred)
print(rf_confusion, rf_report, sep='\n')

[[357  69]
 [113 223]]
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       426
           1       0.76      0.66      0.71       336

    accuracy                           0.76       762
   macro avg       0.76      0.75      0.75       762
weighted avg       0.76      0.76      0.76       762



In [69]:
from sklearn.linear_model import LogisticRegression




In [70]:
# L2-penalised logistic regression solved with L-BFGS, capped at 150
# iterations; fit() returns the estimator so the calls are chained.
lr = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=150,
    random_state=0,
).fit(X_train, y_train)
# Predicted labels for the validation split.
lr_pred = lr.predict(X_tes)


In [71]:
from sklearn.metrics import confusion_matrix,classification_report
# Validation confusion matrix followed by the per-class report for logistic
# regression.
lr_confusion = confusion_matrix(y_tes, lr_pred)
lr_report = classification_report(y_tes, lr_pred)
print(lr_confusion, lr_report, sep='\n')

[[376  50]
 [119 217]]
              precision    recall  f1-score   support

           0       0.76      0.88      0.82       426
           1       0.81      0.65      0.72       336

    accuracy                           0.78       762
   macro avg       0.79      0.76      0.77       762
weighted avg       0.78      0.78      0.77       762



In [72]:
import xgboost as xgb

In [73]:
# Import the class directly: the original `xgb = xgb.XGBClassifier(...)`
# replaced the `xgb` module alias with the model, so running this cell a
# second time raised AttributeError. The fitted model is still bound to `xgb`
# because a later cell calls `xgb.predict`.
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=100,
    n_jobs=-1,
    max_depth=15,
    min_child_weight=3,
    objective='binary:logistic',
    colsample_bytree=0.4,
)
xgb.fit(X_train, y_train)
# Predicted labels for the validation split.
xgb_pred = xgb.predict(X_tes)



In [74]:
from sklearn.metrics import confusion_matrix,classification_report
# Validation confusion matrix followed by the per-class report for XGBoost.
xgb_confusion = confusion_matrix(y_tes, xgb_pred)
xgb_report = classification_report(y_tes, xgb_pred)
print(xgb_confusion, xgb_report, sep='\n')

[[378  48]
 [127 209]]
              precision    recall  f1-score   support

           0       0.75      0.89      0.81       426
           1       0.81      0.62      0.70       336

    accuracy                           0.77       762
   macro avg       0.78      0.75      0.76       762
weighted avg       0.78      0.77      0.76       762



In [75]:
# Preview the first five rows of the test frame.
df_1.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [78]:
# Point X_test back at the cleaned test texts so the following cell can
# vectorise them.
X_test = corpus_1


In [79]:
# Vectorise the test texts. toarray() yields a plain ndarray; todense()
# returned np.matrix, which recent scikit-learn estimators refuse as input.
X_test = vectorizer.transform(X_test).toarray()


In [98]:
# Hard 0/1 label predictions on the test features from each fitted model.
lr_predictions, MNB_predictions, rf_predictions, xgb_predictions = (
    model.predict(X_test) for model in (lr, MNB, rf, xgb)
)

In [100]:
# Equal-weight average of the four models' label predictions: the fraction of
# models voting for class 1 on each test row.
vote_total = lr_predictions + MNB_predictions + rf_predictions + xgb_predictions
predictions = vote_total / 4

In [101]:
# Class 1 only when strictly more than half of the models voted for it; an
# exact 0.5 tie therefore falls to class 0.
predictions = (predictions > 0.5).astype(int)

In [102]:

# Pair the ensemble labels with the ids from the sample submission and write
# the submission CSV.
pred_1 = pd.DataFrame(predictions)
sub_df = pd.read_csv('sample_submission_disaster.csv')

# The original pd.concat(axis=1) aligned on the index and silently padded with
# NaN when the row counts differed; fail loudly instead.
if len(sub_df) != len(predictions):
    raise ValueError(
        'sample submission has %d rows but %d predictions were produced'
        % (len(sub_df), len(predictions))
    )

datasets = pd.DataFrame({'id': sub_df['id'], 'target': predictions})
datasets.to_csv('submission_disaster_NLP_1.csv', index=False)