<a href="https://colab.research.google.com/github/EsdrasGrau/NLP-with-Disaster-Tweets/blob/master/NLP_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#[Real or Not? NLP with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started)
###Predict which Tweets are about real disasters and which ones are not

###Setup

In [1]:
import pandas as pd
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
from google.colab import files
uploaded = files.upload()


Saving sample_submission.csv to sample_submission (1).csv
Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
english_vocab = set(w.lower()for w in nltk.corpus.words.words())

#Data Cleaning

In [5]:
# Function to clean up the text
def clean_up(s):
  s=re.sub(r'http\S+','',s)
  s=re.sub('\W', ' ',s)
  s=re.sub('\d', ' ',s)
  s=re.sub("\s+"," ",s)
  s=s.lower().strip()
  s=" ".join([i for i in s.split()if i in english_vocab])
  return s

In [6]:
# Function to tokenize and lemmatize
def tokenize_lemmatize(s):
  s=clean_up(s)
  s=word_tokenize(s)
  lst = []
  for token in s:
    lst.append(WordNetLemmatizer().lemmatize(token))
  return " ".join(lst)

In [7]:
# Applying fuctions to datasets
train.text = train.text.apply(lambda x: tokenize_lemmatize(x))
test.text = test.text.apply(lambda x: tokenize_lemmatize(x))

#Vectorizing

In [8]:
# To vectorize and remove words in the dataset
vectorizertfidf = TfidfVectorizer(stop_words='english')

In [9]:
X = train['text']
y = train['target']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.01, random_state=42)

In [11]:
# BOW ready to process with models
X_train_bow = vectorizertfidf.fit_transform(X_train)
X_test_bow = vectorizertfidf.transform(X_test)

In [12]:
# Confirming the size of the train and test
print(X_train_bow.shape)
print(X_test_bow.shape)

(7536, 6698)
(77, 6698)




---


# Exploring ML models

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

##GBC

In [14]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train_bow, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [15]:
# Model accuracy
accuracy_test_gbc = gbc.score(X_test_bow, y_test)
accuracy_train_gbc = gbc.score(X_train_bow, y_train)
print(accuracy_test_gbc)
print(accuracy_train_gbc)
predictions = gbc.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.7792207792207793
0.7558386411889597
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        48
           1       0.88      0.48      0.62        29

    accuracy                           0.78        77
   macro avg       0.81      0.72      0.73        77
weighted avg       0.80      0.78      0.76        77



## Multinomial NB

In [16]:
mnb = MultinomialNB()
mnb.fit(X_train_bow, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Model accuracy
accuracy_test_mnb = mnb.score(X_test_bow, y_test)
accuracy_train_mnb = mnb.score(X_train_bow, y_train)
print(accuracy_test_mnb)
print(accuracy_train_mnb)
predictions = mnb.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.8051948051948052
0.8631900212314225
              precision    recall  f1-score   support

           0       0.80      0.92      0.85        48
           1       0.82      0.62      0.71        29

    accuracy                           0.81        77
   macro avg       0.81      0.77      0.78        77
weighted avg       0.81      0.81      0.80        77



In [18]:
# Testing model with sample
#review = 'We  a fire in the middle of the forest for the bbq'
#prediction = mnb.predict(vectorizer.transform([review])[0])
#print('The sentimient predicted by the classifier is %i' % (prediction))

##XGBC

In [19]:
xgbc = XGBClassifier()
xgbc.fit(X_train_bow, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
# Model accuracy
accuracy_test_xgbc = xgbc.score(X_test_bow, y_test)
accuracy_train_xgbc = xgbc.score(X_train_bow, y_train)
print(accuracy_test_xgbc)
print(accuracy_train_xgbc)
predictions = xgbc.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.7662337662337663
0.7384554140127388
              precision    recall  f1-score   support

           0       0.75      0.94      0.83        48
           1       0.82      0.48      0.61        29

    accuracy                           0.77        77
   macro avg       0.79      0.71      0.72        77
weighted avg       0.78      0.77      0.75        77



##RFC

In [21]:
# Initializing RFC model
rfc = RandomForestClassifier()
rfc.fit(X_train_bow, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
# Model accuracy
accuracy_test_rfc= rfc.score(X_test_bow, y_test)
accuracy_train_rfc= rfc.score(X_train_bow, y_train)
print(accuracy_test_rfc)
print(accuracy_train_rfc)
predictions = rfc.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.7922077922077922
0.9783704883227177
              precision    recall  f1-score   support

           0       0.82      0.85      0.84        48
           1       0.74      0.69      0.71        29

    accuracy                           0.79        77
   macro avg       0.78      0.77      0.78        77
weighted avg       0.79      0.79      0.79        77



## SVC

In [23]:
# Initializing model
svc = SVC()
svc.fit(X_train_bow, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
# Model accuracy
accuracy_test_svc = svc.score(X_test_bow, y_test)
accuracy_train_svc = svc.score(X_train_bow, y_train)
print(accuracy_test_svc)
print(accuracy_train_svc)
predictions = svc.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.8051948051948052
0.9506369426751592
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        48
           1       0.85      0.59      0.69        29

    accuracy                           0.81        77
   macro avg       0.82      0.76      0.78        77
weighted avg       0.81      0.81      0.80        77



## Logisctic Regression

In [25]:
# Initializing model
lgr = LogisticRegression()
lgr.fit(X_train_bow,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Model accuracy
accuracy_test_lgr = lgr.score(X_test_bow, y_test)
accuracy_train_lgr = lgr.score(X_train_bow, y_train)
print(accuracy_test_lgr)
print(accuracy_train_lgr)
predictions = lgr.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.7662337662337663
0.8577494692144374
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        48
           1       0.74      0.59      0.65        29

    accuracy                           0.77        77
   macro avg       0.76      0.73      0.74        77
weighted avg       0.76      0.77      0.76        77



## SVC with Grid Searching

In [27]:
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train_bow, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.611, total=   3.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.613, total=   3.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.6s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.621, total=   3.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.612, total=   3.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.622, total=   3.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.570, total=   3.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.569, total=   3.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.570, total=   3.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.570, total=   3.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  7.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [28]:
# Model accuracy
accuracy_test_grid = grid.score(X_test_bow, y_test)
accuracy_train_grid = grid.score(X_train_bow, y_train)
print(accuracy_test_grid)
print(accuracy_train_grid)
predictions = grid.predict(X_test_bow) 
print(classification_report(y_test, predictions))

0.8051948051948052
0.9497080679405521
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        48
           1       0.85      0.59      0.69        29

    accuracy                           0.81        77
   macro avg       0.82      0.76      0.78        77
weighted avg       0.81      0.81      0.80        77



#Models Summary

In [29]:
models = {"Model": {0:"Multinomial NB", 1:"RFC", 2:"SVC", 3:"Grid SVC",
                    4:"Linear Regression", 5:"XGBC", 6:"GBC"}, 
          "Score": {0:accuracy_test_mnb, 1:accuracy_test_rfc, 2:accuracy_test_svc, 
                    3:accuracy_test_grid, 4:accuracy_test_lgr, 5:accuracy_test_xgbc, 6:accuracy_test_gbc}}


In [30]:
models = pd.DataFrame(models)

In [31]:
models = models.sort_values(by=['Score'], ascending=False)

In [32]:
models

Unnamed: 0,Model,Score
0,Multinomial NB,0.805195
2,SVC,0.805195
3,Grid SVC,0.805195
1,RFC,0.792208
6,GBC,0.779221
4,Linear Regression,0.766234
5,XGBC,0.766234


##Best Model Submission

In [35]:
# Applying model to test file
test['sentiment'] = test['text'].apply(lambda x : svc.predict(vectorizertfidf.transform([x])))

In [36]:
def number(x):
    a=0
    for i in x:
        a+=i
    return a 

In [37]:
# Preparing file for submission
test['target']=test['sentiment'].apply(lambda x: number(x))

In [38]:
submission = test[['id','target']]

In [40]:
submission.to_csv('disaster_svc.csv', index = False)

#Failed models

## H2O

In [None]:
# This model was tested but the resutls were quite low compared with the previous ones.

#pip install h2o

In [None]:
#import h2o
#from h2o.automl import H2OAutoML

In [None]:
#h2o.init()

In [None]:
#trainH2o= pd.read_csv("train.csv")
#testH2o= pd.read_csv("test.csv")

In [None]:
#def clean_up(s):
#  s=re.sub(r'http\S+','',s)
#  s=re.sub('\W', ' ',s)
#  s=re.sub('\d', ' ',s)
#  s=re.sub("\s+"," ",s)
#  s=s.lower().strip()
#  return s

In [None]:
#trainH2o.text = trainH2o.text.apply(lambda x: clean_up(x))

In [None]:
#testH2o.text = testH2o.text.apply(lambda x: clean_up(x))

In [None]:
#trainH2o.to_csv('trainH2o.csv', index = False)

In [None]:
#testH2o.to_csv('testH2o.csv', index = False)

In [None]:
# parsear datos a h2o
#train=h2o.import_file('trainH2o.csv')
#test=h2o.import_file('testH2o.csv')

In [None]:
#X=train.columns
#y='target'
#X.remove(y)

In [None]:
# factor para clasificacion binaria

#train[y]=train[y].asfactor()
#test[y]=test[y].asfactor()

In [None]:
#aml=H2OAutoML(max_models=10, seed=1)
#aml.train(x=X, y=y, training_frame=train)


In [None]:
#lb=aml.leaderboard

#lb.head(rows=lb.nrows)

In [None]:
#aml.leader  # mejor modelo

In [None]:
#pred=aml.leader.predict(test)

#pred

In [None]:
#pred_as_pandas = h2o.as_list(pred) # pandas data frame is the default


In [None]:
#pred_as_pandas

In [None]:
#pred_as_pandas.rename(columns={'predict': 'target'}, inplace=True)

In [None]:
#pred_as_pandas.rename(columns={'index': 'id'}, inplace=True)

In [None]:
#pred_as_pandas


In [None]:
#pred_as_pandas.to_csv('disaster_h2o.csv')