<a href="https://colab.research.google.com/github/Arimoro2020/Natural-Language-Processing-NLP-with-Disaster-Tweets/blob/main/03_Model_Building_%26_Pipeline_with_CountVectorization(N_gram)_%26_TfidfTransformer_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Drive**

In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths read and written
# by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Libraries**

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn import metrics
from sklearn.metrics import  classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import  Pipeline
# Cap the displayed width of each column at 70 characters so long tweets are
# truncated in DataFrame previews.
pd.set_option('display.max_colwidth', 70)

**Load Training Tweets**

In [3]:
# Load the labelled training tweets (columns: id, keyword, location, text, target)
# and preview the first rows.
train_tweets = pd.read_csv('/content/drive/MyDrive/PTDataScience/Project2/NLP/train.csv')
train_tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by of...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfire...,1


**Load Testing Tweets**

In [4]:
# Load the unlabelled test tweets (same columns as training, minus `target`)
# and preview the first rows.
# NOTE(review): this file is read from an `NLP2/` subfolder while train.csv is
# read from the parent `NLP/` folder — confirm the paths are intentional.
test_tweets = pd.read_csv('/content/drive/MyDrive/PTDataScience/Project2/NLP/NLP2/test.csv')
test_tweets.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the ..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


**Download WordNet & Stopwords from NLTK**

In [5]:
# Fetch the NLTK resources that clean_txt relies on: WordNet (used by the
# lemmatizer) and the English stopword list.
nltk.download('wordnet')
nltk.download('stopwords')
# NOTE(review): `math` is not referenced in any cell visible in this notebook.
import math

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Functions For Feature Engineering, Column Transformation, & Data Cleaning**

In [6]:
def count_punc(message):
  """Return the percentage of non-space characters in ``message`` that are punctuation.

  Punctuation is defined by ``string.punctuation``. The ratio is rounded to
  three decimals before being scaled to a percentage, so the result may carry
  small floating-point artifacts.

  Args:
    message: The raw text to inspect.

  Returns:
    A float percentage in the range 0-100. Returns 0.0 when the message is
    empty or contains only spaces, instead of dividing by zero.
  """
  non_space_chars = len(message) - message.count(' ')
  if non_space_chars == 0:
    # Nothing to measure: an empty or all-space message has no punctuation share.
    return 0.0
  punctuation_count = sum(1 for char in message if char in string.punctuation)
  return round(punctuation_count / non_space_chars, 3) * 100

def transf_ppunc(perc):
  """Return the fifth root of ``perc``.

  Despite the name, transformerr applies this to both the text length and
  the punctuation percentage.

  Args:
    perc: The numeric value to transform.

  Returns:
    ``perc`` raised to the power 1/5.
  """
  fifth_root_exponent = 1 / 5
  return perc ** fifth_root_exponent

def clean_txt(message):
  """Normalize a tweet into a space-separated string of lemmatized tokens.

  Steps, in order:
    1. Drop every character found in ``string.punctuation`` and lowercase the rest.
    2. Split on runs of non-word characters.
    3. Discard empty tokens and English stopwords (NLTK list).
    4. Lemmatize each remaining token with NLTK's WordNet lemmatizer.

  Args:
    message: The raw tweet text.

  Returns:
    The cleaned tokens joined by single spaces; an empty string when nothing
    survives the filtering.
  """
  # A set gives constant-time membership checks for every token.
  stopwords = set(nltk.corpus.stopwords.words('english'))
  lemmatizer = nltk.WordNetLemmatizer()
  no_punct = ''.join(char.lower() for char in message if char not in string.punctuation)
  # Raw string: '\W' in a normal string literal is an invalid escape sequence.
  tokens = re.split(r'\W+', no_punct)
  # re.split yields '' at the edges when the text starts/ends with whitespace;
  # skipping those avoids stray leading/trailing spaces in the result.
  return ' '.join(
      lemmatizer.lemmatize(token)
      for token in tokens
      if token and token not in stopwords
  )

def transformerr(x):
    """Add engineered text features to ``x`` in place and return the model inputs.

    The following columns are written onto ``x`` itself (callers such as
    ``vectorize`` read ``txt_new`` and ``t_%punc`` from the mutated frame):

      * ``text_len``  - number of non-space characters in ``text``
      * ``t_len``     - ``transf_ppunc`` applied to ``text_len``
      * ``%punc``     - ``count_punc`` applied to ``text``
      * ``t_%punc``   - ``transf_ppunc`` applied to ``%punc``
      * ``x_feature`` - product of ``t_len`` and ``t_%punc``
      * ``txt_new``   - ``clean_txt`` applied to ``text``

    Args:
        x: DataFrame with a string ``text`` column.

    Returns:
        A new DataFrame holding only the ``t_%punc`` and ``txt_new`` columns.
    """
    text = x['text']
    x['text_len'] = text.apply(lambda c: len(c) - c.count(' '))
    # The helpers take a single argument, so they are passed to apply directly.
    x['t_len'] = x['text_len'].apply(transf_ppunc)
    x['%punc'] = text.apply(count_punc)
    x['t_%punc'] = x['%punc'].apply(transf_ppunc)
    # Assigned once; no NaN placeholder is needed before the real values.
    x['x_feature'] = x['t_len'] * x['t_%punc']
    x['txt_new'] = text.apply(clean_txt)
    return x[['t_%punc', 'txt_new']]

**Pass the training data as argument to the Transformerr Function**

In [7]:
# Besides displaying the returned two-column frame, this call adds the
# engineered columns (including 't_%punc' and 'txt_new') to train_tweets itself.
transformerr(train_tweets)

Unnamed: 0,t_%punc,txt_new
0,1.124746,deed reason earthquake may allah forgive u
1,1.253927,forest fire near la ronge sask canada
2,1.219755,resident asked shelter place notified officer evacuation shelter p...
3,1.284735,13000 people receive wildfire evacuation order california
4,1.228660,got sent photo ruby alaska smoke wildfire pours school
...,...,...
7608,1.467242,two giant crane holding bridge collapse nearby home httptcostfmbbzfb5
7609,1.362761,ariaahrary thetawniest control wild fire california even northern ...
7610,1.801983,m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
7611,1.332447,police investigating ebike collided car little portugal ebike ride...


**Pass the testing data as argument to the Transformerr Function**

In [8]:
# Besides displaying the returned two-column frame, this call adds the
# engineered columns (including 't_%punc' and 'txt_new') to test_tweets itself.
transformerr(test_tweets)

Unnamed: 0,t_%punc,txt_new
0,0.000000,happened terrible car crash
1,1.401131,heard earthquake different city stay safe everyone
2,1.210583,forest fire spot pond goose fleeing across street cannot save
3,1.519487,apocalypse lighting spokane wildfire
4,0.000000,typhoon soudelor kill 28 china taiwan
...,...,...
3258,0.000000,earthquake safety los angeles ûò safety fastener xrwn
3259,1.338732,storm ri worse last hurricane cityamp3others hardest hit yard look...
3260,1.584893,green line derailment chicago httptcoutbxlcbiuy
3261,1.641003,meg issue hazardous weather outlook hwo httptco3x6rbqjhn3


**Instantiate a CountVectorizer and define a function that fits it on the training tweets and transforms both the training and testing features**

In [9]:
# Shared unigram + bigram vectorizer; vectorize() (re)fits it on every call.
cvng = CountVectorizer(ngram_range=(1,2))
def vectorize (y=train_tweets, x=test_tweets):
    """Fit ``cvng`` on the training text and build dense feature frames.

    Note the parameter naming: ``y`` is the *training* frame and ``x`` is the
    *testing* frame. Both must already contain the ``txt_new`` and ``t_%punc``
    columns.

    Args:
        y: Training DataFrame; its ``txt_new`` column is used to fit the
            vocabulary.
        x: Testing DataFrame, transformed with the vocabulary learned from ``y``.

    Returns:
        A ``(train, test)`` tuple of DataFrames. Each has the ``t_%punc``
        column followed by one n-gram count column per vocabulary entry,
        labelled '0', '1', ... in vocabulary order.
    """
    def _with_counts(frame):
        # NOTE(review): .toarray() materialises the full dense count matrix;
        # with a vocabulary of tens of thousands of n-grams this is very
        # memory hungry. Kept dense so the return type stays a DataFrame.
        counts = pd.DataFrame(
            cvng.transform(frame['txt_new']).toarray(),
            # Reuse the source index: concat(axis=1) aligns on index, so a
            # fresh RangeIndex would misalign rows of a filtered/shuffled frame.
            index=frame.index,
        )
        # String labels keep every column name the same type as 't_%punc';
        # mixed str/int names are rejected by newer scikit-learn versions.
        counts.columns = counts.columns.astype(str)
        return pd.concat([frame['t_%punc'], counts], axis=1)

    cvng.fit(y['txt_new'])
    return _with_counts(y), _with_counts(x)

**Fit the vectorizer on the training tweets and transform both the training and testing features by passing them to the `vectorize` function**

In [10]:
# vectorize returns (train, test): each frame is 't_%punc' plus the n-gram counts.
X_train_tweets, X_test_tweets = vectorize(train_tweets, test_tweets)

In [11]:
# Preview the vectorized features of the unlabelled test tweets.
X_test_tweets.head()

Unnamed: 0,t_%punc,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,73204,73205,73206,73207,73208,73209,73210,73211,73212,73213,73214,73215,73216,73217,73218,73219,73220,73221,73222,73223,73224,73225,73226,73227,73228,73229,73230,73231,73232,73233,73234,73235,73236,73237,73238,73239,73240,73241,73242,73243
0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.401131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.210583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.519487,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Preview the vectorized features of the labelled training tweets.
X_train_tweets.head()

Unnamed: 0,t_%punc,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,73204,73205,73206,73207,73208,73209,73210,73211,73212,73213,73214,73215,73216,73217,73218,73219,73220,73221,73222,73223,73224,73225,73226,73227,73228,73229,73230,73231,73232,73233,73234,73235,73236,73237,73238,73239,73240,73241,73242,73243
0,1.124746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.253927,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.219755,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.284735,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.22866,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Extract the labels from the training frame and check how many there are.
y = train_tweets['target']
y.shape

(7613,)

**Split the Vectorized Training Data into Train & Test (Validation) Sets**

In [14]:
# Hold out 12% of the labelled rows for validation; stratify on the target so
# both splits keep the class balance, and fix the seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(X_train_tweets, y, test_size=0.12, random_state=30, stratify=y)

**Create pipeline instance with TfidfTransformer & MultinomialNB Classifier**

In [15]:
# Two-step pipeline: re-weight the input matrix with TF-IDF, then classify with
# multinomial Naive Bayes (additive smoothing alpha=0.7).
# NOTE(review): the matrix fed to this pipeline also contains the 't_%punc'
# column, so TfidfTransformer treats it like a term count — confirm this is intended.
pipe2 = Pipeline([
                     ('tfidf', TfidfTransformer()),
                     ('Classifier', MultinomialNB(alpha=0.7))
          
                     ])
                     

**Fit pipeline on X_train & y_train**

In [16]:
# Fit both pipeline steps on the training split only.
pipe2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('Classifier',
                 MultinomialNB(alpha=0.7, class_prior=None, fit_prior=True))],
         verbose=False)

**Evaluate Model**

In [17]:
# Accuracy on the split used for fitting vs. the held-out split. "Test" here
# means the labelled hold-out from train_test_split, not the unlabelled test tweets.
print('Training accuracy:', pipe2.score(X_train, y_train))
print('Test accuracy:', pipe2.score(X_test, y_test))

Training accuracy: 0.961636065084341
Test accuracy: 0.8107221006564551


**Evaluate the Model on the Held-Out Test (Validation) Data, Including Precision and Recall**

In [18]:
# Predicted labels for the held-out split, compared against y_test in the next cell.
predictions = pipe2.predict(X_test)

In [19]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       521
           1       0.88      0.65      0.75       393

    accuracy                           0.81       914
   macro avg       0.83      0.79      0.80       914
weighted avg       0.82      0.81      0.81       914



**Make Predictions on Test Tweets Without Labels**

In [20]:
# Predict labels for the unlabelled test tweets and pair each prediction with
# the tweet's original id to form the submission table.
predics = pipe2.predict(X_test_tweets)
to_submit = pd.DataFrame({'id': test_tweets['id'], 'target': predics})
to_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [21]:
# Sanity check: distribution of predicted classes in the submission.
to_submit['target'].value_counts()

0    2257
1    1006
Name: target, dtype: int64

In [22]:
# Write the submission to Drive; index=False keeps only the id and target columns.
to_submit.to_csv('/content/drive/MyDrive/PTDataScience/Project2/NLP/submission26.csv', index=False)