In [17]:
import os
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.metrics import f1_score

import gensim.downloader as api

In [2]:
print(os.listdir('data'))
os.listdir()

['nlp-getting-started.zip', 'sample_submission.csv', 'test.csv', 'train.csv']


['.ipynb_checkpoints',
 'data',
 'disaster_tweets.ipynb',
 'get_dataset_from_kaggle.py']

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
train.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [9]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sqrte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Preprocessing

In [11]:
def text_preprocess(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'\d+', '', sentence)
    sentence = sentence.translate(str.maketrans("","", string.punctuation))
    sentence = word_tokenize(sentence)
    
    stop_words = set(stopwords.words('english'))
    sentence = [word for word in sentence if word not in stop_words]
    
    stemmer = PorterStemmer()
    sentence = [stemmer.stem(word) for word in sentence]
    
    # lemmatizer = WordNetLemmatizer()
    # sentence = [lemmatizer.lemmatize(word) for word in sentence]
    return ' '.join(sentence)
    

train_prep = train.copy()
train_prep['text'] = train_prep['text'].apply(text_preprocess)
train_prep.replace(np.nan, 'notspecified', inplace=True)

In [12]:
train_prep.text

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    ariaahrari thetawniest control wild fire calif...
7610                utckm volcano hawaii httptcozdtoydebj
7611    polic investig ebik collid car littl portug eb...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object

In [13]:
n_features = 1000
vectorizer = CountVectorizer(max_features=n_features)
X = vectorizer.fit_transform(train_prep.text).toarray()

In [14]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(train_prep.text).toarray()

In [15]:
X = np.hstack([X, pd.get_dummies(train_prep[['keyword', 'location']])])
y = train_prep.target

In [None]:
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [None]:
model = api.load('fasttext-wiki-news-subwords-300')

In [48]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])
        
train_data = list(create_tagged_document(train_prep))

# Init the Doc2Vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
# Build the Volabulary
model.build_vocab(train_data)
# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [49]:
def sent2vec(sentence):
    sentence = sentence.split(' ')
    vec = model.infer_vector(sentence)
    return vec

t = train_prep['text'].apply(sent2vec)

In [50]:
t

0       [0.008554739, 0.008918406, 0.00113351, 0.00547...
1       [-0.007037535, -0.0066475165, -0.005103788, 0....
2       [-0.0034955828, -0.006101159, -0.00051764265, ...
3       [0.005369737, -8.416798e-05, 0.0019173216, 0.0...
4       [0.0018455514, 0.0069972407, -0.0051825983, -0...
                              ...                        
7608    [-0.009516171, 0.008940686, 0.003917992, -0.00...
7609    [0.0036270337, -0.004818424, 0.0013614105, -0....
7610    [0.008374924, -0.0030101107, 0.0048486115, -0....
7611    [-0.0084608635, -0.003890377, 0.0019195833, 0....
7612    [-0.0030570908, -0.009407557, 0.0037999274, 0....
Name: text, Length: 7613, dtype: object

In [51]:
x_train, x_test, y_train, y_test = train_test_split(t, y, test_size=0.2, random_state=42)

model = XGBClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_test)

  return f(**kwargs)


IndexError: tuple index out of range

In [46]:
x_train

array([[array([-9.2963278e-03,  7.0536546e-03, -1.9054184e-03, -3.5246655e-03,
        3.5808985e-03,  1.8847714e-03,  5.6802719e-03, -2.8058714e-03,
        8.7723834e-03, -4.9404348e-03, -1.7816613e-03, -3.5676758e-03,
       -7.6963012e-03,  2.8734813e-03, -8.9858808e-03,  7.9818284e-03,
       -6.2274914e-03,  8.1235380e-04,  6.9958814e-03, -4.1256733e-03,
       -5.9249122e-03, -9.5416950e-03, -2.3893090e-03,  9.0189949e-03,
       -8.0283983e-03, -8.9227819e-05, -2.7838359e-03,  8.6536100e-03,
        9.9012395e-03,  3.2648473e-05, -5.3528822e-03, -7.9186969e-03,
       -9.2713432e-03,  2.9375460e-03,  3.3426567e-03,  3.1763371e-03,
       -2.0706092e-03, -5.6415698e-03, -2.3462554e-03, -5.6773592e-03,
       -8.8509284e-03, -1.6691546e-03,  7.0873299e-03,  5.6847911e-03,
       -9.3508046e-03, -3.3446113e-03,  5.5040186e-03, -4.4696635e-04,
        7.1080229e-03,  2.3768949e-03], dtype=float32)],
       [array([ 0.00874881,  0.00989354, -0.00822412, -0.00523411,  0.00139255,
   

In [114]:
f1_score(pred, y_test)

0.6217008797653959

In [40]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64