In [23]:
import pandas as pd
import numpy as np
import nltk,re
import tensorflow as tf

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.models import Sequential, Model
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional, Embedding, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam 
from nltk import FreqDist

In [2]:
def read_from_file(file_name):
    X = []
    with open(file_name, encoding="utf8") as f:             #StackOverflow.txt
        for x in f:
            X.append(x)
    return X

In [3]:
train_data_X = pd.Series(read_from_file('StackOverflow.txt'))
train_data_Y = pd.Series(read_from_file('StackOverflow_gnd.txt'))

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chaitanyasudarsan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def clean_text(text):
    text = text.split()
    
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if w not in stop_words]
    
    text = " ".join(text)
    
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", "  ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", "  ", text)
    text = re.sub(r"\:", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatize_words = [lemmatizer.lemmatize(word) for word in text]  
    text = " ".join(lemmatize_words)
    
    
    return text

In [6]:
train_data_X = train_data_X.apply(lambda x: clean_text(x))
train_data_Y = train_data_Y.apply(lambda x: clean_text(x))

In [7]:
print(train_data_X)
print(train_data_Y)


0        how i fill dataset datatable linq query resultset
1                                 how page collection linq
2                best subversion client window vista 64bit
3        best practice collaborative environment bin di...
4        visual studio setup project per user registry ...
5        how i elegantly express left join aggregate sq...
6                        net xml comment api documentation
7        modify address bar url ajax app match current ...
8        integrating visual studio test project cruise ...
9        what longtime window user know starting use linux
10                   folder project visual studio solution
11                                 how i create branch svn
12                 add custom tag visual studio validation
13         how i turn line number default textwrangler mac
14               how tab focus onto dropdown field mac osx
15                           how tab button osx dialog box
16                                 progressive enhanceme

In [26]:
sen_len=[]
for text in train_data_X:
    word=word_tokenize(text)
    l=len(word)
    sen_len.append(l)
    
max_sen_len=np.max(sen_len)
max_sen_len

28

In [9]:
def get_no_unique_words(texts_1):
    all_words = ' '.join(texts_1)
    all_words = word_tokenize(all_words)
    dist = FreqDist(all_words)
    num_unique_word = len(dist)
    return num_unique_word

In [10]:
num_unique_word = get_no_unique_words(train_data_X)
num_unique_word

10033

In [11]:
tokenizer = Tokenizer(num_words=10033)
tokenizer.fit_on_texts(train_data_X)

train_data_X_sequences = tokenizer.texts_to_sequences(train_data_X)

In [12]:
print(train_data_X[0:10])
print(train_data_X_sequences[0:10])

0    how i fill dataset datatable linq query resultset
1                             how page collection linq
2            best subversion client window vista 64bit
3    best practice collaborative environment bin di...
4    visual studio setup project per user registry ...
5    how i elegantly express left join aggregate sq...
6                    net xml comment api documentation
7    modify address bar url ajax app match current ...
8    integrating visual studio test project cruise ...
9    what longtime window user know starting use linux
dtype: object
[[1, 2, 1177, 969, 1178, 10, 37, 1502], [1, 30, 138, 10], [50, 63, 240, 57, 1306, 1861], [50, 279, 2570, 329, 1244, 90, 24], [18, 20, 408, 74, 449, 51, 2571, 163], [1, 2, 2979, 707, 450, 183, 1307, 38, 10, 37], [62, 79, 342, 217, 869], [560, 477, 518, 85, 15, 112, 343, 263, 351], [895, 18, 20, 188, 74, 2980, 109], [17, 4959, 57, 51, 413, 836, 27, 299]]


In [13]:
train_data_X_sequences = pad_sequences(train_data_X_sequences, maxlen = max_sen_len)

#labels = np.array(labels)

In [14]:
print(train_data_X_sequences[0:10])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    1    2 1177  969 1178   10   37 1502]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    1   30  138   10]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   50   63  240   57 1306 1861]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   50  279 2570  329 1244   90   24]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   18   20  408   74  449   51 2571  163]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    1    2 2979  707  450  183 1307   38   10   37]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0   62   79  

In [15]:
print(train_data_X_sequences.shape)
print(train_data_Y.shape)
train_data_Y = to_categorical(train_data_Y)
print(train_data_Y.shape)

(20000, 28)
(20000,)
(20000, 21)


In [16]:
model1=Sequential()
model1.add(Embedding(num_unique_word,100,mask_zero=True))
model1.add(Bidirectional(LSTM(64,dropout=0.4, recurrent_dropout=0.4),merge_mode='concat'))
#model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(21,activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1003300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_1 (Dense)              (None, 21)                2709      
Total params: 1,090,489
Trainable params: 1,090,489
Non-trainable params: 0
_________________________________________________________________


In [17]:
batch_size = 128
epochs = 3
num_classes=21
history1=model1.fit(train_data_X_sequences, train_data_Y,epochs=epochs, batch_size=batch_size, verbose=1)



Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
target_layer = model1.layers[-2]
target_layer.return_sequences = True
target_layer.forward_layer.return_sequences = True
target_layer.backward_layer.return_sequences = True

In [20]:
outputs = target_layer.get_input_at(target_layer.input)
m = Model(model1.input, outputs)

TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.