In [1]:
from __future__ import print_function
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.datasets import imdb

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
# pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deban\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data=pd.read_csv("train.csv",sep="~",usecols=['Description','Is_Response'])

In [4]:
from string import punctuation
from nltk.corpus import stopwords

def sentence_to_words(sentence):
    
    sentence = sentence.lower()
    
    sentence = ''.join([c for c in sentence if c not in punctuation])    
    sentence_split = sentence.split('\n')
    sentence = ' '.join(sentence_split)
    words = sentence.split()
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    return words

In [5]:
import pickle
def cleanup_data(data,cache_file="preprocessed_data.pkl"):
    cache_data=None
    if cache_file is not None:
        try:
            with open(cache_file,'rb') as f:
                cache_data=pickle.load(f)
            print("Read cache data")
        except:
            pass
        
    #Create cache if not present
    if cache_data is None:
        words_train=[sentence_to_words(sentence) for sentence in data]
        #Write to cache file
        if cache_file is not None:
            cache_data=words_train
            with open(cache_file, "wb") as f:
                pickle.dump(cache_data,f)
            print("Wrote preprocessed data to: ",cache_file)
    else:
        words_train=cache_data
    return words_train

In [6]:
data.Description=cleanup_data(data.Description)

Read cache data


In [7]:
import numpy as np
from collections import Counter

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    flattened_data = [y for x in data for y in x]
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur
    word_count=Counter(flattened_data)
    sorted_words = [word for word, _ in word_count.most_common()]
    
    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size-2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [8]:
word_dict=build_dict(data.Description)

In [9]:
def convert_and_pad(word_dict, sentence, pad=200):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence

def convert_and_pad_data(word_dict, data, pad=200):
    result = []
      
    for sentence in data:
        converted= convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        
        
    return result

In [10]:
data.Description=convert_and_pad_data(word_dict,data.Description)

In [11]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data.Is_Response=le.fit_transform(data.Is_Response)

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data.Description, data.Is_Response, test_size=0.33, random_state=42)

In [13]:
max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 80
batch_size = 256

print('Loading data...')

Loading data...


In [None]:
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

Pad sequences (samples x time)
x_train shape: (20215, 80)
x_test shape: (9957, 80)
Build model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [None]:

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=30,
          validation_data=(x_test, y_test))


Train...
Train on 20215 samples, validate on 9957 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
 1792/20215 [=>............................] - ETA: 8s - loss: 0.5601 - acc: 0.7349

In [None]:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)