In [381]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer

# Load Data

In [449]:
path = "data/train.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Data explo

## Shape, size, info

In [6]:
df.shape

(7613, 5)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


## Balanced?

In [5]:
df['target'].value_counts(1)

0    0.57034
1    0.42966
Name: target, dtype: float64

## Nan number ?

In [9]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Lets drop location and id columns and explore keyword column

## keyword column

In [40]:
df_copy = df.copy()

In [155]:
df_copy.drop(columns=['location', 'id'], inplace=True)

KeyError: "['location' 'id'] not found in axis"

In [46]:
df_copy.dropna(inplace=True)

In [47]:
df_copy.shape

(5080, 5)

In [51]:
df_copy['keyword'].unique()

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'derailed

For the moment i will not use the keyword column, because of the nan number, but i think it is an interesting data

# Data preprocess

## Clean data

In [None]:
# approche de base : on se concentre sur les commentaires uniquement
df.drop(columns=['location', 'id', 'keyword'], inplace=True)

In [451]:
def clean_data(text, remove_punctuations=False, lower_case=False,remove_numb=False, remove_symbol=False):
    for index, sentence in enumerate(text) : 
        if remove_punctuations == True :
            for punctuation in string.punctuation:
                sentence = sentence.replace(punctuation, '')
        if lower_case == True :
            sentence = sentence.lower()
        if remove_numb == True :
            sentence = ''.join(word for word in sentence if not word.isdigit())
        if remove_symbol == True :
            sentence = ''.join(char if char not in ['#', '@', "&", "\(", "\)", "_", "$", "£", "%", "/", "\\"] else ' ' for char in sentence)
        text[index] = sentence
    return text

In [452]:
df['text'] = clean_data(df['text'], remove_punctuations=True, lower_case=True,remove_numb=True, remove_symbol=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[index] = sentence


In [453]:
stop_words = set(stopwords.words('english'))
for index, com in enumerate(df['text']) :
    df['text'][index] = word_tokenize(df['text'][index])
    df['text'][index] = [t for t in df['text'][index] if t not in stop_words]
    
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][index] = word_tokenize(df['text'][index])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][index] = [t for t in df['text'][index] if t not in stop_words]


Unnamed: 0,text,target
0,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[residents, asked, shelter, place, notified, o...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


In [454]:
lemmatizer = WordNetLemmatizer()
for index, com in enumerate(df['text']) :
    df['text'][index] = [lemmatizer.lemmatize(word) for word in df['text'][index]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][index] = [lemmatizer.lemmatize(word) for word in df['text'][index]]


In [455]:
df['text']

0       [deed, reason, earthquake, may, allah, forgive...
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, asked, shelter, place, notified, of...
3       [people, receive, wildfire, evacuation, order,...
4       [got, sent, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, crane, holding, bridge, collapse,...
7609    [ariaahrary, thetawniest, control, wild, fire,...
7610           [utckm, volcano, hawaii, httptcozdtoydebj]
7611    [police, investigating, ebike, collided, car, ...
7612    [latest, home, razed, northern, california, wi...
Name: text, Length: 7613, dtype: object

## Vocab Size

In [456]:
list_all_words = []
for sentence in df['text'] :
    for word in sentence :
        list_all_words.append(word)

In [457]:
vocab_size = len(set(list_all_words))
vocab_size

20395

In [458]:
# je passe les listes de mots en string
for index, com in enumerate(df['text']) :
    df['text'][index] = ' '.join(df['text'][index])   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][index] = ' '.join(df['text'][index])


In [459]:
df['text']

0              deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    ariaahrary thetawniest control wild fire calif...
7610                utckm volcano hawaii httptcozdtoydebj
7611    police investigating ebike collided car little...
7612    latest home razed northern california wildfire...
Name: text, Length: 7613, dtype: object

In [460]:
# je fais une copie pour une autre approche
df_copy_word2vec = df.copy()
df_copy_word2vec

Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,people receive wildfire evacuation order calif...,1
4,got sent photo ruby alaska smoke wildfire pour...,1
...,...,...
7608,two giant crane holding bridge collapse nearby...,1
7609,ariaahrary thetawniest control wild fire calif...,1
7610,utckm volcano hawaii httptcozdtoydebj,1
7611,police investigating ebike collided car little...,1


In [461]:
# je transform les mots en int
tk = Tokenizer()
tk.fit_on_texts(df['text'])
df['text'] = tk.texts_to_sequences(df['text'])


In [462]:
X = df['text']
X

0                     [4018, 452, 156, 69, 1399, 4019, 6]
1                    [107, 1, 149, 504, 5984, 5985, 1067]
2       [1530, 1400, 1879, 453, 5986, 319, 162, 1879, ...
3                            [10, 4020, 76, 162, 361, 32]
4          [31, 1068, 111, 5987, 1692, 187, 76, 5988, 97]
                              ...                        
7608        [55, 692, 1047, 940, 255, 83, 563, 22, 20394]
7609    [5959, 5960, 715, 214, 1, 32, 92, 116, 294, 12...
7610                              [3755, 446, 1388, 5954]
7611    [21, 991, 2838, 259, 43, 195, 4704, 2838, 1701...
7612          [131, 22, 451, 116, 32, 76, 522, 11, 20395]
Name: text, Length: 7613, dtype: object

In [463]:
# je pad
X_pad = pad_sequences(X, dtype='float32', padding='post')
X_pad

array([[4.018e+03, 4.520e+02, 1.560e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.070e+02, 1.000e+00, 1.490e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.530e+03, 1.400e+03, 1.879e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [3.755e+03, 4.460e+02, 1.388e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.100e+01, 9.910e+02, 2.838e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.310e+02, 2.200e+01, 4.510e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00]], dtype=float32)

In [464]:
# calcul sequence max
Max_sentence_length = max([len(com) for com in X_pad])
Max_sentence_length

23

In [465]:
# spit data
X_pad_train = X_pad[:5000]
X_pad_test = X_pad[5000:]
y_train = df['target'][:5000]
y_test = df['target'][5000:]

# Model personnal embedding

In [498]:
#Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=True)
from tensorflow.keras import layers, Sequential
# Size of your embedding space = size to represent each word
embedding_size = 100
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size+1,
input_length=Max_sentence_length, 
output_dim=embedding_size,
mask_zero=True))
model.add(layers.LSTM(500))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 23, 100)           2039600   
_________________________________________________________________
lstm_29 (LSTM)               (None, 500)               1202000   
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 501       
Total params: 3,242,101
Trainable params: 3,242,101
Non-trainable params: 0
_________________________________________________________________


In [520]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy
es = EarlyStopping(patience=6, restore_best_weights=True)
loss=BinaryCrossentropy(from_logits=True) # default from_logits=False
#metrics=[keras.metrics.BinaryAccuracy()]
model.compile(loss=loss,
              optimizer='rmsprop',
              metrics=['accuracy', 'Recall'])
model.fit(X_pad_train, y_train, epochs=20, batch_size=16, verbose=1, callbacks=[es], validation_split=0.3)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<tensorflow.python.keras.callbacks.History at 0x1f8fb81f0>

In [521]:
model.evaluate(X_pad_test, y_test)



[0.5177033543586731, 0.7386146187782288, 0.7383177280426025]

# challenge kaggle

In [652]:
path_test = 'data/test.csv'
test_keras = pd.read_csv(path_test)
test_keras.head()


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [653]:
test_keras.tail()

Unnamed: 0,id,keyword,location,text
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...
3262,10875,,,#CityofCalgary has activated its Municipal Eme...


In [642]:
test_keras.drop(columns=['location', 'keyword'], inplace=True)

In [643]:
test_keras['text'] = clean_data(test_keras['text'], remove_punctuations=True, lower_case=True,remove_numb=True, remove_symbol=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text[index] = sentence


In [644]:
for index, com in enumerate(test_keras['text']) :
    test_keras['text'][index] = word_tokenize(test_keras['text'][index])
    test_keras['text'][index] = [t for t in test_keras['text'][index] if t not in stop_words]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keras['text'][index] = word_tokenize(test_keras['text'][index])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keras['text'][index] = [t for t in test_keras['text'][index] if t not in stop_words]


In [645]:
for index, com in enumerate(test_keras['text']) :
    test_keras['text'][index] = [lemmatizer.lemmatize(word) for word in test_keras['text'][index]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keras['text'][index] = [lemmatizer.lemmatize(word) for word in test_keras['text'][index]]


In [646]:
for index, com in enumerate(test_keras['text']) :
    test_keras['text'][index] = ' '.join(test_keras['text'][index])   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keras['text'][index] = ' '.join(test_keras['text'][index])


In [647]:
#tk.fit_on_texts(df['text'])
test_keras['text'] = tk.texts_to_sequences(test_keras['text'])

In [648]:
X_kaggle_test = test_keras['text']
X_kaggle_test_pad = pad_sequences(X_kaggle_test, dtype='float32', padding='post', maxlen=23)

In [649]:
res_test = model.predict(X_kaggle_test_pad)
len(res_test)

3263

In [650]:
res_binary = []
for num in res_test : 
    if num< 0.5:
        res_binary.append(0)
    else :
        res_binary.append(1)
len(res_binary)

3263

In [654]:
res_test = pd.DataFrame(res_binary, columns=['target'])
res_test.shape

(3263, 1)

In [656]:
res_test['id']= test_keras['id']
res_test = res_test[['id', 'target']]
res_test
#res_test[res_test['id']== 2275]

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [657]:
res_test.to_csv('challenge_kaggle.csv', index=False)

# approche avec un autre modele 


for index, word in enumerate (df_copy_word2vec['text']) :
    df_copy_word2vec['text'][index] = df_copy_word2vec['text'][index].split()
X_word2vec = df_copy_word2vec['text']  
X_word2vec

In [379]:
'''from gensim.models import Word2Vec

# This line trains an entire embedding for the words in your train set
word2vec = Word2Vec(sentences=X_word2vec, vector_size=100)
word2vec'''

'from gensim.models import Word2Vec\n\n# This line trains an entire embedding for the words in your train set\nword2vec = Word2Vec(sentences=X_word2vec, vector_size=100)\nword2vec'

In [479]:
X_train_2 = df_copy_word2vec['text'][:5000]
X_test_2 = df_copy_word2vec['text'][5000:]
y_train_2 = df_copy_word2vec['target'][:5000]
y_test_2 = df_copy_word2vec['target'][5000:]

In [480]:
word2vec_transfer = gensim.downloader.load('glove-wiki-gigaword-50')
len(word2vec_transfer)

400000

In [481]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return np.array(embed)

# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train_2)
X_test_embed_2 = embedding(word2vec_transfer, X_test_2)

  return np.array(embed)


In [482]:
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', value=0.)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', value=0.)

In [483]:
X_train_pad_2.shape

(5000, 117, 50)

In [485]:
model = Sequential()
model.add(layers.Masking(mask_value=0., input_shape=(117,50)))
model.add(layers.LSTM(80))
model.add(layers.Dense(60, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy', 'Recall'])
es = EarlyStopping(patience=4, restore_best_weights=True)
model.fit(X_train_pad_2, y_train_2, epochs=100, batch_size=16, verbose=1, callbacks=[es], validation_split=0.3)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<tensorflow.python.keras.callbacks.History at 0x1f1018a60>

In [486]:
model.evaluate(X_test_pad_2, y_test_2)



[0.654558002948761, 0.6218905448913574, 0.39507222175598145]