
github url:  https://github.com/Bessonica/NNprojects

# The problem:
  We must build a neural network (NN) that determines which tweets are about a real disaster and which are not.

# Data:
 * train.csv
 * test.csv
 * sample_submission.csv
 
# Each sample of data consists of:
* text of tweet
* Key word for tweet
* location of tweet
 
# Columns:
 * id - a unique identifier for each tweet
 * text - the text of the tweet
 * location - the location the tweet was sent from (may be blank)
 * keyword - a particular keyword from the tweet (may be blank)
 * target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)


# Import all needed libraries

**matplotlib and seaborn for plotting**

In [69]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

# seaborn =  data visualization library
import seaborn as sns
%matplotlib inline


import os
# Collect the full path of every file under the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

**NN packages**

In [70]:


import nltk 
from nltk.corpus import stopwords

import re 
from sklearn.model_selection import train_test_split


from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, Embedding,GRU, LSTM, RNN
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K


# EDA
# Inspect, visualize, clean



In [71]:

# Load the competition data sets.
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Only the last bare expression of a cell is rendered, so a mid-cell
# `train_data.head()` is silently discarded. `display` (provided by
# IPython/Jupyter) shows both previews from the same cell.
display(train_data.head())
display(test_data.head())

# Visualize (What data we have?)

How many tweets are about a real disaster, and how many are not



In [72]:
# values: 0 = not disaster, 1 = real disaster

# Count the rows per class once and reuse the result for the title.
target_counts = train_data['target'].value_counts()

# Pass the column by keyword: current seaborn releases do not interpret a
# lone positional argument as `x`.
sns.countplot(x=train_data['target'])

# .get(label, 0) keeps the title working even if one class is absent.
plt.title('No disaster:'+str(target_counts.get(0, 0))+'\n'+
         'Real disaster:'+str(target_counts.get(1, 0)))
plt.show()

Number of words per tweet

In [73]:
# Split every tweet on whitespace and plot how many words each one contains.

def words_len(arr, text):
    """Plot the distribution of word counts over a collection of texts.

    Args:
        arr: iterable of strings (e.g. a list of tweets or a pandas Series).
        text: title displayed above the plot.
    """
    # split() without an argument splits on any run of whitespace, so an
    # empty string counts as 0 words and repeated spaces or newlines do not
    # inflate the count (split(' ') reported 1 word for an empty string).
    word_len = [len(tweet.split()) for tweet in arr]

    plt.figure(figsize=(12,6))
    # Pass the values by keyword: current seaborn releases do not interpret a
    # lone positional argument as `x`.
    sns.countplot(x=word_len)
    # The x axis shows the number of words in a tweet, not the length of words.
    plt.xlabel("Number of words:")
    plt.ylabel('How many:')
    plt.title(text)

    plt.show()

words_len(train_data['text'],"length of words in train data set")

# Clean data   
Our NN implementation does not use the keyword and location columns, so we drop them.

In [74]:
# Drop the columns the model does not use. Rebinding the name (instead of
# inplace=True) together with errors='ignore' makes this cell safe to re-run:
# a second execution no longer raises KeyError for already-removed columns.
train_data = train_data.drop(columns=['keyword', 'location'], errors='ignore')
test_data = test_data.drop(columns=['keyword', 'location'], errors='ignore')

# Clean data     

Let's think about which data is useless:
* short words
* special symbols and URLs (e.g. `http://...`)
* stopwords
* numbers
* non-English letters

Then normalize the data:
* make everything lower case



In [75]:
# We use nltk to get the English stop words: very common words that carry
# little meaning on their own, so they are removed from the tweets.
stopWords=set(stopwords.words('english'))

def clear_txt(text):
    """Normalize one tweet into a space-separated string of informative words.

    Steps: lower-case the text, remove URLs, remove {...} and (...) groups,
    replace every non-letter character with a space, then drop stop words and
    words shorter than 4 characters.

    Args:
        text: the raw tweet. Non-string values (e.g. NaN from pandas) are
            treated as empty text.

    Returns:
        The cleaned tweet as a single string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # to lower case
    h_str = text.lower()

    # Remove URLs together with their scheme. The scheme must include the
    # ':' - without it only the '//host/path' part was removed and a stray
    # 'http' token survived the cleaning. Scheme-less '//...' is still removed.
    h_str = re.sub(r'(?:https?:)?//[\w./?=&%]*', '', h_str)

    # Remove {...} groups (the class must exclude '}', not ')') and (...) groups.
    h_str = re.sub(r'\{[^}]*\}', '', h_str)
    h_str = re.sub(r'\([^)]*\)', '', h_str)

    # Anything that is not an ASCII letter becomes a separator.
    h_str = re.sub('[^a-zA-Z]', ' ', h_str)

    # Keep only words of at least 4 characters that are not stop words.
    kept = [w for w in h_str.split() if len(w) >= 4 and w not in stopWords]
    return " ".join(kept)

In [76]:
# Run the text cleaner over every training tweet, preserving row order.
clean_trainData = [clear_txt(tweet) for tweet in train_data['text']]

**Let's look at some examples of our work**

In [77]:
# Show the first three tweets before and after cleaning, separated by a blank line.
for row in range(3):
    raw_tweet = train_data['text'][row]
    cleaned_tweet = clean_trainData[row]
    print("Before: ", raw_tweet)
    print("After: ", cleaned_tweet)
    print()

# print("Before: ", train_data['text'][7])
# print("After: ", clean_trainData[7])

In [78]:
# Run the text cleaner over every test tweet, preserving row order.
clean_testData = [clear_txt(tweet) for tweet in test_data['text']]

# Visualise Result

We kept only the words that are important to us. Their word-count distribution looks like this:

In [79]:
# Plot the word-count distribution of the cleaned train and test tweets.
for cleaned_texts, plot_title in (
    (clean_trainData, "Train data set"),
    (clean_testData, "Test data set"),
):
    words_len(cleaned_texts, plot_title)

# Prepare data 

Divide the data into training and validation sets (4:1).

In [80]:
# train_test_split shuffles the samples and splits them into train and
# validation subsets; 20% is held out and the seed is fixed.
split_parts = train_test_split(
    clean_trainData,
    train_data['target'],
    test_size=0.2,
    random_state=40,
)
X_train, X_valid, y_train, y_valid = split_parts

# Sanity check: features and labels of each subset must have equal length.
print("Check data, did we do it right?")
print("It should be same size")
print(f"Train size: {len(X_train)}, {len(y_train)}")
print(f"Validation size: {len(X_valid)}, {len(y_valid)}")

# Time to tokenize tweets and create the vocabulary
Tokenize: separate text into words.

In [81]:
# The Tokenizer maps each word to an integer id; words not seen while
# fitting are mapped to the '<OOV>' token.

# The word-count plot above showed a maximum length of 20, so every
# sequence is padded to that length.
max_len = 20

tokenizer = Tokenizer(oov_token='<OOV>')
# The vocabulary is fitted on the training tweets only.
tokenizer.fit_on_texts(X_train)


def _to_padded(texts):
    """Convert texts to id sequences and post-pad them to max_len."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post')


X_train = _to_padded(X_train)
X_valid = _to_padded(X_valid)
X_test = _to_padded(clean_testData)

# Number of distinct ids: every known word plus one extra slot.
vocabluary = len(tokenizer.word_index) + 1

print("Vocabluary size ")
print(vocabluary)

# categorizing data

In [82]:


# One-hot encode the 0/1 labels into two columns for both subsets.
y_train, y_valid = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_valid)
)

print("data shape")
for encoded in (y_train, y_valid):
    print(encoded.shape)

# Model architecture

After preparing the data we can start designing our model architecture.

We are going to create a **multi-layer LSTM**.



Let's add 3 LSTM layers with sizes 350, 150 and 50.
The batch size is now 200. The loss is higher, but the quality of training should improve.

Other hyperparameters are the same.

In [83]:


# Stacked LSTM classifier: embedding -> 3 LSTM layers -> dense head.
model=Sequential()
# mask_zero=True makes the recurrent layers skip the zero padding.
model.add(Embedding(vocabluary,100,input_length=max_len,trainable=True,mask_zero=True))
# The first two LSTM layers return the full sequence so the next LSTM can
# consume it; the last one returns only its final state.
model.add(LSTM(350,dropout=0.1,recurrent_dropout=0.2,return_sequences=True))
model.add(LSTM(150,dropout=0.1,recurrent_dropout=0.2,return_sequences=True))
model.add(LSTM(50,dropout=0.1,recurrent_dropout=0.2))
model.add(Dense(64,activation='relu'))
# categorical_crossentropy expects the two outputs to form a probability
# distribution over the classes, so the output layer uses softmax; two
# independent sigmoid units do not sum to 1.
model.add(Dense(2,activation='softmax'))
model.summary()

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])


In [84]:
# Train for 30 epochs with batches of 200, evaluating on the validation
# subset after every epoch; `h` keeps the per-epoch metric history.
train_inputs, train_targets = np.array(X_train), np.array(y_train)
validation_pair = (np.array(X_valid), np.array(y_valid))

h = model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=200,
    epochs=30,
    validation_data=validation_pair,
)



# Let's visualize the results


accuracy is 0.9
validation loss is around 1.2



In [85]:
# Training vs. validation loss per epoch. Axis labels and a title let the
# figure stand on its own; plt.show() avoids printing the Legend repr.
fig, ax = plt.subplots()
ax.plot(h.history['val_loss'],'r',label='val_loss')
ax.plot(h.history['loss'],'g',label='train_loss')
ax.set(xlabel='Epoch', ylabel='Loss', title='Training and validation loss')
ax.legend()
plt.show()

In [86]:
# Training vs. validation accuracy per epoch. Axis labels and a title let the
# figure stand on its own; plt.show() avoids printing the Legend repr.
fig, ax = plt.subplots()
ax.plot(h.history['val_acc'],'b',label='val_acc')
ax.plot(h.history['acc'],'y',label='train_acc')
ax.set(xlabel='Epoch', ylabel='Accuracy', title='Training and validation accuracy')
ax.legend()
plt.show()

# Final submission


In [87]:
# Per-class scores for every test tweet: column 0 = no disaster, column 1 = real disaster.
predict_help = model.predict(X_test)

# Pick the class with the higher score. Thresholding only column 0 at 0.5
# ignores column 1 and gives the wrong label whenever the two scores do not
# sum to 1 (e.g. [0.6, 0.9]). On a tie argmax returns 0, matching the old rule.
prediction = np.argmax(predict_help, axis=1)
test_data['target'] = prediction

# The submission file contains only the tweet id and the predicted label.
sub = test_data[['id', 'target']]
print("Starting to write csv")
sub.to_csv('Submission.csv', index=False)
print("Ended writing csv")