In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 
import re

In [2]:
dataset = pd.read_csv('train.csv')

In [3]:
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
dataset['text'][68]

'Accident on I-24 W #NashvilleTraffic. Traffic moving 8m slower than usual. https://t.co/0GHk693EgJ'

#### Checking a missing values

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#### Droping duplicates

In [6]:
np.sum(dataset.duplicated())

0

In [7]:
dataset = dataset.drop_duplicates()

In [8]:
np.sum(dataset.duplicated())

0

#### Droping the id

In [9]:
dataset = dataset.drop(['id'], axis=1)

In [10]:
dataset.head()

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Function for text normalization

In [11]:
dataset['text'][68]

'Accident on I-24 W #NashvilleTraffic. Traffic moving 8m slower than usual. https://t.co/0GHk693EgJ'

In [12]:
dataset['text'][31]

'@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C'

In [13]:
dataset['text'][9]

"I'm afraid that the tornado is coming to our area..."

In [14]:
dataset['text'] = dataset['text'].apply(lambda x:x.lower())

In [15]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [16]:
import re
def text_normalize(text):
    
    text = str(text)
    text = re.sub(r'http\S+', '', text) #remove urls
    text = re.sub('#[^\s]+','',text) #remove hashtags
    text = re.sub('@[^\s]+','',text) #remove tags
    text = re.sub(r'[0-9]+','', text) #remove numbers
    text = re.sub('\s*\\b([a-z]|[a-z]{2})\\b', '', text) #remove single letters
    text = re.sub(r'[^\w\s]', '', text)
    doc = nlp(text)
    res=[]
    for token in doc:
        if(token.is_stop or token.is_space): #remove stop_words
            pass
        else:
            res.append(token.lemma_)#lemmatization
    return " ".join(res)

In [17]:
dataset["text"] = dataset["text"].apply(text_normalize)

In [18]:
dataset['text'][68]

'accident traffic move slow usual'

In [19]:
dataset['text'][9]

'afraid tornado come area'

In [20]:
X = dataset.drop('target', axis =1)
y = dataset['target'].values.reshape(-1,1)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer 
vectorize = TfidfVectorizer(ngram_range=(1,2))
tf_idf_vectorizer=pd.DataFrame(vectorize.fit_transform(X['text']).toarray(),columns=vectorize.get_feature_names(),index=None)
tf_idf_vectorizer.head()



Unnamed: 0,aaaa,aaaa lemme,aaaaaaallllûªm,aaaaaaallllûªm season,aaaaaand,aaaaaand thunder,aaarrrgghhh,aal,aan,aan den,...,ûó wallybaiter,ûó want,ûóher,ûóher upper,ûókody,ûókody vine,ûû,ûû lose,ûûªs,ûûªs freakiest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
nominal_features = X[['keyword', 'location']]

In [23]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse=False)
array = one_hot_encoder.fit_transform(nominal_features)
feature_names = one_hot_encoder.get_feature_names()
encoded_nominal = pd.DataFrame(array, columns = feature_names)
encoded_nominal = pd.concat([nominal_features.reset_index(drop=True), encoded_nominal.reset_index(drop=True)], axis = 1)
encoded_nominal.drop(nominal_features, axis = 1, inplace = True)
encoded_nominal



Unnamed: 0,x0_ablaze,x0_accident,x0_aftershock,x0_airplane%20accident,x0_ambulance,x0_annihilated,x0_annihilation,x0_apocalypse,x0_armageddon,x0_army,...,"x1_ÌÏT: 43.631838,-79.55807","x1_ÌÏT: 6.4682,3.18287","x1_ÌÏT: 6.488400524109015,3.352798039832285","x1_ÌøåÀå_T: 40.736324,-73.990062",x1_å_: ?? ÌÑ ? : ?,x1_å_å_Los Mina Cityã¢,x1_å¡å¡Midwest Û¢Û¢,x1_åÊ(?Û¢`?Û¢å«)??,x1_åø\_(?)_/åø,x1_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
new_X = pd.concat([encoded_nominal, tf_idf_vectorizer],axis=1)

In [25]:
from sklearn.model_selection import train_test_split
new_X_train, new_X_test, y_train, y_test = train_test_split(new_X, y, test_size = 0.25, random_state = 0)

In [26]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(new_X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [27]:
y_pred = classifier.predict(new_X_test)

In [28]:
y_pred

array([0, 0, 0, ..., 1, 1, 0])