<a href="https://colab.research.google.com/github/CoGian/NLP-with-Disaster-Tweets/blob/master/Prepare_Data_for_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np 
import pandas as pd
import tensorflow as tf 
import random
import pickle 
import gc
import os
from sklearn import metrics
from sklearn.model_selection import train_test_split
import sys

In [0]:
MAX_LEN = 30

In [0]:
TARGET_COLUMN = 'target'
LOCATION_COLUMN = 'location'
KEYWORD_COLUMN = 'keyword'
METADADATA_COLUMNS = [LOCATION_COLUMN] + [KEYWORD_COLUMN]
TEXT = 'text'

TRAIN_DATASET_PATH = '/content/drive/My Drive/NLP with Disaster Tweets/train_cleared.csv'
TEST_DATASET_PATH = '/content/drive/My Drive/NLP with Disaster Tweets/test_cleared.csv'
GLOVE_PATH = "/content/drive/My Drive/Glove/glove.840B.300d.pkl"
CRAWL_PATH = "/content/drive/My Drive/Crawl/crawl-300d-2M.pkl"

In [0]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df =  pd.read_csv(TEST_DATASET_PATH)

# Prepare Data for training

## Split Data

In [0]:
train_df , val_df = train_test_split(train_df,test_size = 0.15,shuffle = True)

In [0]:
x_train = train_df[TEXT].astype(str)
y_train = train_df[TARGET_COLUMN].astype(int).values

x_val = val_df[TEXT].astype(str)
y_val = val_df[TARGET_COLUMN].astype(int).values

x_test = test_df[TEXT].astype(str)

In [0]:
#create tokenizer for our data
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False)
tokenizer.fit_on_texts(list(x_train) + list(x_test) + list(x_val))

In [0]:
#convert text data to numerical indexes
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)


#pad data up to MAX_LEN (note that we truncate if there are more than MAX_LEN tokens)
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=MAX_LEN)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=MAX_LEN)

## Build matrix with embeddings

In [0]:
def load_embeddings(path):
    with open(path,'rb') as f:
        embedding_index = pickle.load(f)
    return embedding_index

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words 

In [32]:
glove_embedding_matrix,unknown_words = build_matrix(tokenizer.word_index,GLOVE_PATH)
print('n unknown words (glove): ', len(unknown_words))
print('n known words (glove): ', len(glove_embedding_matrix))

n unknown words (glove):  5634
n known words (glove):  27282


In [33]:
crawl_embedding_matrix,unknown_words = build_matrix(tokenizer.word_index,CRAWL_PATH)
print('n unknown words (crawl): ', len(unknown_words))
print('n known words (crawl): ', len(crawl_embedding_matrix))

n unknown words (crawl):  5689
n known words (crawl):  27282


In [34]:
embedding_matrix = np.concatenate([glove_embedding_matrix, crawl_embedding_matrix], axis=-1)
embedding_matrix.shape

(27282, 600)

In [0]:
np.save(os.path.join('/content/drive/My Drive/NLP with Disaster Tweets/', 'embedding_matrix'),embedding_matrix)

In [36]:
del crawl_embedding_matrix
del glove_embedding_matrix
gc.collect()

104

## Save sets

In [39]:
train_set = pd.DataFrame(y_train,columns=[TARGET_COLUMN])
seq = pd.DataFrame(x_train)
train_set = pd.concat([train_set,seq],axis=1)
len(train_set)

6471

In [40]:
val_set = pd.DataFrame(y_val,columns=[TARGET_COLUMN])  
seq = pd.DataFrame(x_val)
val_set = pd.concat([val_set,seq],axis=1)
len(val_set)

1142

In [41]:
test_set = pd.DataFrame(x_test)
len(test_set)

3263

In [0]:
train_set.to_pickle("/content/drive/My Drive/NLP with Disaster Tweets/train_set.pkl")
test_set.to_pickle("/content/drive/My Drive/NLP with Disaster Tweets/test_set.pkl")
val_set.to_pickle("/content/drive/My Drive/NLP with Disaster Tweets/val_set.pkl")