**1.1 Importing the required Libraries**

In [1]:
#install tweet-preprocessor to clean tweets
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/17/9d/71bd016a9edcef8860c607e531f30bd09b13103c7951ae73dd2bf174163c/tweet_preprocessor-0.6.0-py3-none-any.whl
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt        #Visualisation
import seaborn as sns                  #Visualisation
%matplotlib inline

**1.2 Reading the train data**

In [3]:
train = pd.read_csv("train_new.csv")
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


**1.3 Entire information of Train data**

In [4]:
print("-------------------------------------------------------\nChecking Null values\n-------------------------------------------------------")
print(" " ,train.isnull().sum())
print("-------------------------------------------------------\nShape of the dataset\n-------------------------------------------------------")
print(" ",train.shape)
print("-------------------------------------------------------\nInformation of dataset\n-------------------------------------------------------")
print("", train.info())
print("-------------------------------------------------------\nDescription of the dataset\n-------------------------------------------------------")
print("",train.describe())
print("-------------------------------------------------------\nChecking duplication records\n-------------------------------------------------------")
print(sum(train.duplicated()))
print("-------------------------------------------------------\nDroping the NaN values\n-------------------------------------------------------")

print(train.dropna(axis='columns'))
print(train.dropna(how='all'))
print("-------------------------------------------------------\nChecking for Null\n-------------------------------------------------------")
print(train.isnull().sum())

-------------------------------------------------------
Checking Null values
-------------------------------------------------------
  id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
-------------------------------------------------------
Shape of the dataset
-------------------------------------------------------
  (7613, 5)
-------------------------------------------------------
Information of dataset
-------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
 None
-------------------------------------------------------
D

**1.4 Reading Test data**

In [5]:
test = pd.read_csv("test_new.csv")
test.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


**1.5 Entire information of Test Data**

In [6]:
print("-------------------------------------------------------\nChecking Null values\n-------------------------------------------------------")
print(" " ,test.isnull().sum())
print("-------------------------------------------------------\nShape of the dataset\n-------------------------------------------------------")
print(" ",test.shape)
print("-------------------------------------------------------\nInformation of dataset\n-------------------------------------------------------")
print("", test.info())
print("-------------------------------------------------------\nDescription of the dataset\n-------------------------------------------------------")
print("",test.describe())
print("-------------------------------------------------------\nChecking duplication records\n-------------------------------------------------------")
print(sum(test.duplicated()))
print("-------------------------------------------------------\nDroping the NaN values\n-------------------------------------------------------")

print(test.dropna(axis='columns'))
print(test.dropna(how='all'))
print("-------------------------------------------------------\nChecking for Null\n-------------------------------------------------------")
print(test.isnull().sum())

-------------------------------------------------------
Checking Null values
-------------------------------------------------------
  id             0
keyword       26
location    1105
text           0
dtype: int64
-------------------------------------------------------
Shape of the dataset
-------------------------------------------------------
  (3263, 4)
-------------------------------------------------------
Information of dataset
-------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
 None
-------------------------------------------------------
Description of the dataset
-----------------------------

**1.6 Function for Cleaning of Text attribute**

In [7]:
import re

# helper function
def clean_text(text):
    te = str(text).encode('ascii','ignore').decode('UTF-8')
    te = re.sub(r'@[\w]+', '', te)
    te = re.sub(r'https?://t.co/[\w]+', '', te)
    te = re.sub(r'#', '', te)
    te = re.sub(r"RT @[\w]+:",'',te)
    te = re.sub(r"RT @[\w]+:",'',te)
    te = re.sub(r" RT ",'',te)
    te = re.sub(r"https://[\w]+.[\w]+/[\w]+",'',te)
    te = re.sub(r"[][]",'',te)
    te = re.sub(r"&amp","and", te)
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", te)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

**1.7 Importing the TfidfVectorizer**

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

**1.8 Converting the Train text attribute to vectors**

In [9]:
# Transform each text into a vector of word counts
vectorizer = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))

training_features = vectorizer.fit_transform(train.text)
print(training_features)

  (0, 2974)	0.3649935688089075
  (0, 16347)	0.3649935688089075
  (0, 40813)	0.3649935688089075
  (0, 13634)	0.3649935688089075
  (0, 20169)	0.348985065600402
  (0, 2972)	0.3014500197590137
  (0, 16342)	0.24206629634942667
  (0, 40808)	0.27215697795073607
  (0, 13633)	0.348985065600402
  (1, 43393)	0.3334119066241046
  (1, 42558)	0.3334119066241046
  (1, 28701)	0.3334119066241046
  (1, 34463)	0.3334119066241046
  (1, 20129)	0.3334119066241046
  (1, 8607)	0.26879103060431747
  (1, 43392)	0.3334119066241046
  (1, 42557)	0.3334119066241046
  (1, 28695)	0.24532720123790422
  (1, 34441)	0.21388380779579141
  (1, 20117)	0.20898604405838725
  (2, 36424)	0.21312416533699777
  (2, 38190)	0.21312416533699777
  (2, 17624)	0.21312416533699777
  (2, 35860)	0.21312416533699777
  (2, 35352)	0.21312416533699777
  :	:
  (7611, 10865)	0.1441933550265417
  (7611, 6094)	0.388853838688965
  (7611, 35200)	0.17039013073576756
  (7611, 50135)	0.18345453703215392
  (7611, 26713)	0.16727279844425907
  (7611, 303

**1.9 Converting the Test text attribute to vectors**

In [10]:
# Transform each text into a vector of word counts
vectorizer2 = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))

testing_features = vectorizer2.fit_transform(test.text)
print(testing_features)

  (0, 4538)	0.3713765914940486
  (0, 25106)	0.4047884285864036
  (0, 11566)	0.4047884285864036
  (0, 13953)	0.38524375680717177
  (0, 6222)	0.252611247851494
  (0, 4530)	0.26414285848614355
  (0, 25105)	0.3379647544016936
  (0, 11559)	0.3109895432701939
  (0, 13902)	0.20357005630725447
  (1, 24038)	0.32050990799481033
  (1, 5260)	0.3367704205743454
  (1, 7467)	0.3367704205743454
  (1, 8262)	0.3367704205743454
  (1, 11816)	0.3367704205743454
  (1, 21857)	0.27645186555739765
  (1, 24034)	0.2684042967218669
  (1, 5257)	0.2865304181646671
  (1, 7466)	0.30897289071646783
  (1, 8256)	0.24247237572725436
  (1, 11815)	0.24247237572725436
  (2, 24281)	0.3004375220980004
  (2, 9939)	0.3004375220980004
  (2, 10615)	0.3004375220980004
  (2, 19550)	0.3004375220980004
  (2, 23812)	0.3004375220980004
  :	:
  (3260, 11190)	0.35666020273370136
  (3260, 5108)	0.3736306103604052
  (3260, 15128)	0.32480145339038613
  (3261, 16222)	0.3394866595626963
  (3261, 16221)	0.3230949969903448
  (3261, 18423)	0.311

### **2.0 Model Building**

**2.1 Importing the modules for model building**

In [11]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import precision_score, recall_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

**2.2 Spliting the train data**

In [12]:
# extract the labels from the train data
y = train.target.values

# use 70% for the training and 30% for the test
x_train, x_test, y_train, y_test = train_test_split(train.text.values, y, 
                                                    stratify=y, 
                                                    random_state=1, 
                                                    test_size=0.3, shuffle=True)

In [13]:
#@title
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(5329,)
(5329,)
(2284,)
(2284,)


**2.3 Special Characters in RE library**

In [14]:
# remove special characters using the regular expression library
import re

#set up punctuations we want to be replaced
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")

**2.3 Preprocessing the text**

In [15]:
import preprocessor as p

# custum function to clean the dataset (combining tweet_preprocessor and reguar expression)
def clean_tweets(df):
  tempArr = []
  for line in df:
    # send to tweet_processor
    tmpL = p.clean(line)
    # remove puctuation
    tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower()) # convert all tweets to lower cases
    tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
    tempArr.append(tmpL)
  return tempArr

**2.4 Creating seperate dataFrame for Train Data**

In [16]:
train_tweet = clean_tweets(train["text"])
train_tweet = pd.DataFrame(train_tweet)
# append cleaned tweets to the training data
train["clean_tweet"] = train_tweet
# compare the cleaned and uncleaned tweets
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this may allah for...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby as smoke fr...


**2.5 Creating seperate dataFrame for Test data**

In [17]:
# clean the test data and append the cleaned tweets to the test data
test_text = clean_tweets(test["text"])
test_text = pd.DataFrame(test_text)
# append cleaned tweets to the training data
test["clean_text"] = test_text

# compare the cleaned and uncleaned tweets
test.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard about is different cities stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills in china and taiwan


**2.6 Spliting the data**

In [18]:
from sklearn.model_selection import train_test_split

# extract the labels from the train data
y = train.target.values

# use 70% for the training and 30% for the test
x_train, x_test, y_train, y_test = train_test_split(train.clean_tweet.values, y, 
                                                    stratify=y, 
                                                    random_state=1, 
                                                    test_size=0.3, shuffle=True)

**2.7 Importing the CountVectorizer**

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

**2.8 Optional Testing the Data**

In [20]:
documents = ["This is Import Data's Youtube channel",
             "Data science is my passion and it is fun!",
             "Please subscribe to my channel"]

# initializing the countvectorizer
vectorizer = CountVectorizer()

# tokenize and make the document into a matrix
document_term_matrix = vectorizer.fit_transform(documents)

# check the result
pd.DataFrame(document_term_matrix.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,and,channel,data,fun,import,is,it,my,passion,please,science,subscribe,this,to,youtube
0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,1
1,1,0,1,1,0,2,1,1,1,0,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0


**2.9 Fitting into CV**

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorize tweets for model building
vectorizer = CountVectorizer(binary=True, stop_words='english')

# learn a vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))

# transform documents to document-term matrix
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [22]:
from sklearn import svm
# classify using support vector classifier
svm = svm.SVC(kernel = 'linear', probability=True)

# fit the SVC model based on the given training data
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)

# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)

In [23]:
print("Accuracy score for SVC is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

Accuracy score for SVC is:  75.87565674255691 %


## **2.0 Implementing Grid SVM**

**2.1 Importing the modules**

In [34]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

**2.2 Displaying the test data**

In [28]:
test.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard about is different cities stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills in china and taiwan


**2.3 Displaying Train Data**

In [30]:
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this may allah for...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby as smoke fr...


**2.4 Subsetting the Train Dataset**

In [31]:
data = [train['clean_tweet'], train["target"]]

headers = ["clean_text", "target"]

train_new = pd.concat(data, axis=1, keys=headers)
train_new.head()


Unnamed: 0,clean_text,target
0,our deeds are the reason of this may allah for...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,people receive evacuation orders in california,1
4,just got sent this photo from ruby as smoke fr...,1


**2.5 Spliting the dataset**

In [32]:
train, test = train_test_split(train_new, test_size=0.2, random_state=1)
X_train = train['clean_text'].values
X_test = test['clean_text'].values
y_train = train['target']
y_test = test['target']

**2.6 Cross validation and grid search**

In [35]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

def stem(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

**2.7 Fitting and PipeLine**

In [37]:
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [38]:
np.random.seed(1)

pipeline_svm = make_pipeline(vectorizer, 
                            SVC(probability=True, kernel="linear", class_weight="balanced"))

grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1]}, 
                    cv = kfolds,
                    scoring="roc_auc",
                    verbose=1,   
                    n_jobs=-1) 

grid_svm.fit(X_train, y_train)
grid_svm.score(X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.7min finished


0.853315221044216

**2.8 Calculating Accuracy, F1-Score , Precision and Recall**

In [39]:
def report_results(model, X, y):
    pred_proba = model.predict_proba(X)[:, 1]
    pred = model.predict(X)        

    auc = roc_auc_score(y, pred_proba)
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'Accuracy': auc, 'F1-Score': f1, 'Accuracy': acc, 'Precision': prec, 'Recall': rec}
    return result
    
report_results(grid_svm.best_estimator_, X_test, y_test)

{'Accuracy': 0.7905449770190414,
 'F1-Score': 0.733500417710944,
 'Precision': 0.789568345323741,
 'Recall': 0.6848673946957878}

**2.9 User Engagement**

In [51]:
user_input = input("Enter the string: ")
res = grid_svm.predict([user_input])
if(res[0] == 1):
  print("\n Positive \n")
else:
  print("\n Negative \n")

Enter the string: forest fire near la ronge sask canada

 Positive 

