In [1]:
import pandas as pd
import numpy as np
import nltk,re

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional, Embedding, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.initializers import Constant 

from nltk import FreqDist

import gensim
import os

Using TensorFlow backend.


In [2]:
def read_from_file(file_name):
    X = []
    with open(file_name, encoding="utf8") as f:             #StackOverflow.txt
        for x in f:
            X.append(x)
    return X

In [3]:
train_data_X = pd.Series(read_from_file('StackOverflow.txt'))
train_data_Y = pd.Series(read_from_file('StackOverflow_gnd.txt'))

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chaitanyasudarsan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def clean_text(text):
    text = text.split()
    sentence_list  = list()
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if w not in stop_words]
    
    text = " ".join(text)
    
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", "  ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", "  ", text)
    text = re.sub(r"\:", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatize_words = [lemmatizer.lemmatize(word) for word in text]  
    text = " ".join(lemmatize_words)
    
    sentence_list.append(text)
    return text

In [6]:
train_data_X = train_data_X.apply(lambda x: clean_text(x))
train_data_Y = train_data_Y.apply(lambda x: clean_text(x))

In [7]:
print(train_data_X)
print(train_data_Y)


0        how i fill dataset datatable linq query resultset
1                                 how page collection linq
2                best subversion client window vista 64bit
3        best practice collaborative environment bin di...
4        visual studio setup project per user registry ...
5        how i elegantly express left join aggregate sq...
6                        net xml comment api documentation
7        modify address bar url ajax app match current ...
8        integrating visual studio test project cruise ...
9        what longtime window user know starting use linux
10                   folder project visual studio solution
11                                 how i create branch svn
12                 add custom tag visual studio validation
13         how i turn line number default textwrangler mac
14               how tab focus onto dropdown field mac osx
15                           how tab button osx dialog box
16                                 progressive enhanceme

In [8]:
sen_len=[]
word_list = []
for text in train_data_X:
    word=word_tokenize(text)
    l=len(word)
    sen_len.append(l)
    word_list.append(word)
    
max_sen_len=np.max(sen_len)


In [9]:
def get_no_unique_words(texts_1):
    all_words = ' '.join(texts_1)
    all_words = word_tokenize(all_words)
    dist = FreqDist(all_words)
    num_unique_word = len(dist)
    return num_unique_word

In [10]:
num_unique_word = get_no_unique_words(train_data_X)
num_unique_word

10033

In [11]:


EMBEDDING_DIM = 32

model = gensim.models.Word2Vec(sentences = word_list, size = EMBEDDING_DIM, min_count=1,workers=4,window=5)

words = list(model.wv.vocab)

print('Vocabulary size %d' %len(words))

Vocabulary size 10033


In [12]:
print(model.wv.most_similar('exceeds'))

[('mediawiki', 0.8815551996231079), ('replacecharactersinrange', 0.8763396143913269), ('asks', 0.8749945759773254), ('transactioninterceptor', 0.8728892207145691), ('outofmemoryerror', 0.8720981478691101), ('upsert', 0.8712644577026367), ('vexing', 0.8710616827011108), ('calendarmonthview', 0.8709455132484436), ('webrick', 0.8689325451850891), ('past', 0.8661694526672363)]


In [13]:
filename = 'word_embedding.txt'
model.wv.save_word2vec_format(filename, binary= False)

In [14]:
embedding_index = {}
f = open(os.path.join('','word_embedding.txt'),encoding = 'utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:])
    embedding_index[word] = coeffs
f.close()    

In [15]:
tokenizer = Tokenizer(num_words=10033)
tokenizer.fit_on_texts(word_list)

train_data_X_sequences = tokenizer.texts_to_sequences(word_list)

word_index = tokenizer.word_index
print(word_index)



In [16]:
print(train_data_X[0:10])
print(train_data_X_sequences[0:10])

print(word_list[0:10])


0    how i fill dataset datatable linq query resultset
1                             how page collection linq
2            best subversion client window vista 64bit
3    best practice collaborative environment bin di...
4    visual studio setup project per user registry ...
5    how i elegantly express left join aggregate sq...
6                    net xml comment api documentation
7    modify address bar url ajax app match current ...
8    integrating visual studio test project cruise ...
9    what longtime window user know starting use linux
dtype: object
[[1, 2, 1179, 971, 1180, 10, 38, 1504], [1, 30, 138, 10], [51, 64, 241, 58, 1308, 1863], [51, 280, 2572, 330, 1246, 91, 24], [18, 20, 409, 75, 451, 52, 2573, 163], [1, 2, 2981, 709, 452, 182, 1309, 39, 10, 38], [63, 80, 343, 217, 871], [562, 479, 520, 86, 15, 113, 344, 264, 352], [897, 18, 20, 188, 75, 2982, 110], [17, 4962, 58, 52, 414, 838, 27, 300]]
[['how', 'i', 'fill', 'dataset', 'datatable', 'linq', 'query', 'resultset'], ['ho

In [17]:
train_data_X_sequences = pad_sequences(train_data_X_sequences, maxlen = max_sen_len)

#labels = np.array(labels)

In [18]:
print(train_data_X_sequences[0:10])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    1    2 1179  971 1180   10   38 1504]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    1   30  138   10]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   51   64  241   58 1308 1863]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   51  280 2572  330 1246   91   24]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   18   20  409   75  451   52 2573  163]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    1    2 2981  709  452  182 1309   39   10   38]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0   63   80  

In [19]:
num_words = len(word_index) + 1

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
print(num_words)

10034


In [21]:
#print(train_data_X_sequences.shape)
print(train_data_Y.shape)
train_data_Y = to_categorical(train_data_Y)
print(train_data_Y.shape)

(20000,)
(20000, 21)


In [30]:
model1=Sequential()
model1.add(Embedding(num_words,EMBEDDING_DIM,embeddings_initializer= Constant(embedding_matrix),input_length = 28,
                     mask_zero=True, trainable = False))
model1.add(LSTM(64,return_sequences=True))
model1.add(LSTM(64,return_sequences=False))
model1.add(Dense(21,activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 28, 32)            321088    
_________________________________________________________________
lstm_7 (LSTM)                (None, 28, 64)            24832     
_________________________________________________________________
lstm_8 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_4 (Dense)              (None, 21)                1365      
Total params: 380,309
Trainable params: 59,221
Non-trainable params: 321,088
_________________________________________________________________


In [31]:
batch_size = 128
epochs = 3
num_classes=21
history1=model1.fit(train_data_X_sequences, train_data_Y,epochs=epochs, batch_size=batch_size, verbose=1)



Epoch 1/3
Epoch 2/3
Epoch 3/3
