In [0]:
# Colab-only: opens the browser file picker so a local file can be uploaded into the runtime.
# Presumably used to supply 'scrap_context.csv', which is read further down — confirm.
from google.colab import files
files.upload()

# Importing Libraries

In [0]:
import collections

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers import Embedding, CuDNNLSTM, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNGRU
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import helper

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
import re

# Dataset

In [0]:
# Load the scraped title/response pairs; the CSV is read from the current working directory.
df = pd.read_csv('scrap_context.csv')

In [33]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

(19227, 2)

In [34]:
# A pair is unusable as soon as either side is the '[deleted]' placeholder.
is_deleted = (df['title'] == '[deleted]') | (df['response'] == '[deleted]')
df = df[~is_deleted]
df.shape

(18758, 2)

# Pre-Processing 1

In [0]:
# Lookup table: contraction -> expanded form, consumed by clean_contractions().
# NOTE(review): the capitalised "I'..." keys cannot match in this notebook because the text is
# lower-cased before clean_contractions() is applied; the lower-case "i'..." entries do the work.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

In [0]:
def clean_contractions(text, mapping):
    """Expand contractions in ``text`` using ``mapping``.

    Typographic apostrophe variants are first normalised to ``'``. The text
    is then split on single spaces and every token that is a key of
    ``mapping`` is swapped for its mapped value. Tokens are matched verbatim,
    so a contraction with punctuation attached (e.g. ``"don't,"``) is left
    untouched.

    :param text: input string
    :param mapping: dict of contraction -> expansion
    :return: the string with known contractions expanded
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")
    expanded_tokens = (mapping.get(token, token) for token in text.split(" "))
    return ' '.join(expanded_tokens)

In [0]:
# Lower-case both text columns so the later dictionary-based clean-ups see a single casing.
for column in ('title', 'response'):
    df[column] = df[column].apply(lambda text: text.lower())

In [41]:
# Expand contractions on both sides of every pair (one progress bar per column).
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(lambda text: clean_contractions(text, contraction_mapping))

100%|██████████| 18758/18758 [00:00<00:00, 40194.44it/s]
100%|██████████| 18758/18758 [00:00<00:00, 33356.80it/s]


In [0]:
# Characters that clean_special_chars() surrounds with spaces so that they end up as separate tokens.
punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

In [0]:
# Character -> replacement substitutions that clean_special_chars() applies before its spacing pass.
# NOTE(review): the key '“' is listed twice with the same value, so the second entry is redundant.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

In [0]:
def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text``.

    Three passes run in this fixed order:
      1. every key of ``mapping`` is replaced by its value;
      2. every entry of ``punct`` is padded with one space on each side;
      3. a fixed set of leftovers is rewritten: zero-width space -> space,
         ellipsis -> ' ... ', byte-order mark and two Devanagari words removed.

    :param text: input string
    :param punct: iterable of strings to isolate with surrounding spaces
    :param mapping: dict of substring -> replacement
    :return: the cleaned string
    """
    for source, target in mapping.items():
        text = text.replace(source, target)

    for symbol in punct:
        text = text.replace(symbol, ' ' + symbol + ' ')

    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for source, target in leftovers:
        text = text.replace(source, target)

    return text

In [46]:
# Map odd characters to plain equivalents and space-pad punctuation in both columns.
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(lambda text: clean_special_chars(text, punct, punct_mapping))

100%|██████████| 18758/18758 [00:02<00:00, 9289.78it/s]
100%|██████████| 18758/18758 [00:01<00:00, 10724.49it/s]


In [0]:
# Misspelling/variant -> replacement table; correct_spelling() applies it as plain substring replacement.
# NOTE(review): keys containing capitals ('Qoura', 'Whta', 'doI', 'theBest', 'Etherium') cannot match in
# this notebook because the text was lower-cased earlier. Lower-casing the keys is not a safe fix while
# matching is substring-based (e.g. 'doi' would also hit 'doing').
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

In [0]:
def correct_spelling(x, dic):
    """Apply every ``wrong -> right`` pair of ``dic`` to ``x``.

    Replacement is plain substring substitution, performed one pair after
    another in the dictionary's iteration order, so a later pair also sees
    the output of earlier ones.

    :param x: input string
    :param dic: dict of substring -> replacement
    :return: the corrected string
    """
    fixed = x
    for wrong, right in dic.items():
        fixed = fixed.replace(wrong, right)
    return fixed

In [49]:
# Normalise known misspellings / spelling variants in both columns.
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(lambda text: correct_spelling(text, mispell_dict))

100%|██████████| 18758/18758 [00:00<00:00, 35159.70it/s]
100%|██████████| 18758/18758 [00:00<00:00, 28474.27it/s]


In [50]:
import re
# Remove any line that starts with an http(s) URL ('^' + MULTILINE anchors at line starts only).
# NOTE(review): by this point clean_special_chars() has already padded ':' and '/' with spaces,
# so 'https://' no longer occurs contiguously and this pattern very likely never matches.
# URL removal probably needs to run before the punctuation pass — confirm and reorder.
url_line = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(lambda text: url_line.sub('', text))

100%|██████████| 18758/18758 [00:00<00:00, 82587.87it/s]
100%|██████████| 18758/18758 [00:00<00:00, 78057.06it/s]


In [51]:
# Quick sanity check of the subreddit-reference pattern on a hand-written sample.
check = 'hi this is /r/news yoo'
re.sub(r'\/r\/[a-z]+ ', '', check, flags=re.MULTILINE)

'hi this is yoo'

In [52]:
# Remove subreddit references of the form '/r/<name> ' from both columns.
# NOTE(review): clean_special_chars() ran earlier and pads '/' with spaces, so '/r/name'
# has already become ' / r / name' and this pattern very likely never matches the real data.
subreddit_ref = re.compile(r'\/r\/[a-z]+ ', flags=re.MULTILINE)
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(lambda text: subreddit_ref.sub('', text))

100%|██████████| 18758/18758 [00:00<00:00, 150831.84it/s]
100%|██████████| 18758/18758 [00:00<00:00, 136079.64it/s]


In [0]:
def preprocess_sentence(w):
    """Normalise one sentence and wrap it in start/end markers.

    Steps:
      1. pad each of ``? . ! , ¿`` with a space on both sides
         (e.g. "he is a boy." -> "he is a boy . "), see
         https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
      2. collapse runs of double quotes / spaces into one space;
      3. replace every run of characters other than a-z, A-Z and
         ``? . ! , ¿`` with one space (digits are dropped as well);
      4. trim surrounding whitespace;
      5. add '<start> ' / ' <end>' so the model knows where a sequence
         begins and where to stop predicting.

    :param w: input string
    :return: the normalised, marker-wrapped string
    """
    spaced = re.sub(r"([?.!,¿])", r" \1 ", w)
    squeezed = re.sub(r'[" "]+', " ", spaced)
    letters_only = re.sub(r"[^a-zA-Z?.!,¿]+", " ", squeezed)
    return '<start> ' + letters_only.strip() + ' <end>'

In [66]:
# Final sentence-level normalisation: punctuation spacing, letters only, <start>/<end> markers.
for column in ('title', 'response'):
    df[column] = df[column].progress_apply(preprocess_sentence)

100%|██████████| 18758/18758 [00:00<00:00, 22329.07it/s]
100%|██████████| 18758/18758 [00:01<00:00, 15155.92it/s]


# Pre-Processing 2

In [0]:
# title list
import string
# Plain Python list of cleaned titles (lower-cased, surrounding whitespace removed).
titles = df['title'].values
title = [raw_title.lower().strip() for raw_title in titles]

In [0]:
# response list
# Plain Python list of cleaned responses (lower-cased, surrounding whitespace removed).
responses = df['response'].values
response = [raw_response.lower().strip() for raw_response in responses]

In [69]:
# view title response pair
# Print the first two title/response pairs as a two-speaker exchange for a quick visual check.
for i in range(2):
  print('Person 1 : ', title[i])
  print('Person 2 : ', response[i])

Person 1 :  <start> it kills me to see others in worse positions than i am . you matter . you all matter and it does not matter what your depression or suicidal thoughts tell you . <end>
Person 2 :  <start> i do not know who you are , but i needed this and i thank you . i have been stuck ina runt for a couple months now . wanting to just die and let it all be over with . but after reading it , i have a different mindset . once again thank you kind stranger <end>
Person 1 :  <start> i do not know who you are , but i needed this and i thank you . i have been stuck ina runt for a couple months now . wanting to just die and let it all be over with . but after reading it , i have a different mindset . once again thank you kind stranger <end>
Person 2 :  <start> its no problem . i just know what its like . i have been in this rut for a year , and i hate how many people are in the same , if not a worse position . do not kill yourself . all the things that made you happy in the past , that can

In [70]:
# Word-frequency tables for each side of the conversation (tokens = whitespace-split words).
title_counter = collections.Counter(word for sentence in title for word in sentence.split())
response_counter = collections.Counter(word for sentence in response for word in sentence.split())

print('{} Title sentences.'.format(len(title)))
# The counter already saw every token, so its total replaces a second pass over the corpus.
print('{} Title words.'.format(sum(title_counter.values())))
print('{} unique Title words.'.format(len(title_counter)))
print('10 Most common words in the Title:')
# Iterating the (word, count) pairs directly also copes with an empty counter.
print('"' + '" "'.join(word for word, _count in title_counter.most_common(10)) + '"')
print()
print('{} Response sentences.'.format(len(response)))
print('{} Response words.'.format(sum(response_counter.values())))
print('{} unique Response words.'.format(len(response_counter)))
print('10 Most common words in the Response dataset:')
print('"' + '" "'.join(word for word, _count in response_counter.most_common(10)) + '"')

18758 Title sentences.
965914 Title words.
20970 unique Title words.
10 Most common words in the Title:
"." "i" "," "to" "you" "the" "and" "<start>" "<end>" "a"

18758 Response sentences.
1496799 Response words.
24906 unique Response words.
10 Most common words in the Response dataset:
"." "i" "," "to" "you" "and" "the" "a" "is" "it"


In [72]:
def tokenize(x, filters=None):
    """Fit a Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    :param x: list of sentences/strings to be tokenized
    :param filters: optional string of characters the tokenizer removes
        before splitting. ``None`` (default) keeps Keras' built-in filter
        set, which strips punctuation such as ``. , ? ! < >``. With that
        default, markers like ``<start>`` are indexed as ``start`` and
        stand-alone punctuation tokens are dropped. Pass ``''`` to keep
        every character.
    :return: tuple ``(sequences, tokenizer)``: the integer-encoded
        sentences and the fitted tokenizer
    """
    # Only forward `filters` when given, so the default call is exactly Tokenizer().
    tok = Tokenizer() if filters is None else Tokenizer(filters=filters)
    tok.fit_on_texts(x)
    return tok.texts_to_sequences(x), tok


# Tokenize Example output: fit on three toy sentences and show the word index and encodings.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sequence_no, (raw_sentence, encoded) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(sequence_no))
    print('  Input:  {}'.format(raw_sentence))
    print('  Output: {}'.format(encoded))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [74]:
def pad(x, length=None):
    """Pad the sequences in ``x`` at the end ("post") to a common length.

    :param x: list of integer sequences
    :param length: target length; when ``None`` the length of the longest
        sequence in ``x`` is used
    :return: the result of ``pad_sequences`` for ``x``
    """
    target_len = max(len(sequence) for sequence in x) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding="post")

# Pad Tokenized output: show each toy sequence before and after padding.
test_pad = pad(text_tokenized)
for sequence_no, (encoded, padded) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(sequence_no))
    print('  Input:  {}'.format(np.array(encoded)))
    print('  Output: {}'.format(padded))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


# Pre-Process Pipeline

In [0]:
def preprocess(x, y):
    """Tokenize and pad the feature and label sentences.

    Each side is tokenized with its own tokenizer and padded to its own
    longest sequence. The labels get an extra trailing axis of size 1.

    :param x: list of feature sentences
    :param y: list of label sentences
    :return: tuple ``(preprocessed x, preprocessed y, x tokenizer, y tokenizer)``
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

In [81]:
# Tokenize + pad both sides; each side gets its own tokenizer and is padded to its own longest sequence.
preproc_title, preproc_response, title_tokenizer, response_tokenizer =\
    preprocess(title, response)
    
# Padded widths and vocabulary sizes derived from the arrays / fitted tokenizers.
# NOTE(review): word_index ids start at 1 and padding uses 0, so len(word_index) does not count
# the padding id — layers sized from these values presumably need vocab_size + 1; confirm where
# the model is built.
max_title_sequence_length = preproc_title.shape[1]
max_response_sequence_length = preproc_response.shape[1]
title_vocab_size = len(title_tokenizer.word_index)
response_vocab_size = len(response_tokenizer.word_index)

print('Data Preprocessed')
print("Max Title sentence length:", max_title_sequence_length)
print("Max Response sentence length:", max_response_sequence_length)
print("Title vocabulary size:", title_vocab_size)
print("Response vocabulary size:", response_vocab_size)

Data Preprocessed
Max Title sentence length: 1617
Max Response sentence length: 1617
Title vocabulary size: 20965
Response vocabulary size: 24901
