In [49]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn.utils import shuffle

In [50]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
# NOTE(review): the recorded output of this cell reports a different
# TensorFlow version than the one named above, so the "tested with"
# statement may be stale — confirm which version is actually supported.
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.13.0-rc2


In [51]:
# Load the Stack Overflow posts and shuffle the rows, so that the positional
# train/test split performed later is not biased by the file's row order.
# A fixed random_state makes the shuffle -- and therefore the split and every
# downstream metric -- reproducible across "Restart Kernel -> Run All".
# NOTE(review): the absolute path is machine-specific; consider reading it
# from a configurable data directory.
data = pd.read_csv("/home/adithya/Desktop/stack-overflow-data.csv")
data = shuffle(data, random_state=42)
data.head()



Unnamed: 0,post,tags
31282,stack overflow when overloading << operator <p...,c++
37974,export gridview to ms word preserving formatti...,asp.net
4379,paperclip - attaching notes to images am new ...,ruby-on-rails
9625,how can one ellicit a toast from a service to ...,android
7633,create new class on an existing winform and a...,c#


In [52]:
# Reserve 80% of the rows for training; whatever is left is the test set.
total_rows = len(data)
train_size = int(total_rows * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (total_rows - train_size))

Train size: 33600
Test size: 8401


In [53]:
# Positional split: the first `train_size` rows go to training, the rest to
# testing. `.iloc` makes the positional (not label-based) slicing explicit.
train_posts = data['post'].iloc[:train_size]
test_posts = data['post'].iloc[train_size:]

train_tags = data['tags'].iloc[:train_size]
test_tags = data['tags'].iloc[train_size:]

In [54]:
# Word-level tokenizer (char_level=False) whose vocabulary is capped at
# `max_words` entries; `max_words` is also the model's input width.
max_words = 10000
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

In [55]:




# Build the vocabulary from the training posts only (the test posts must not
# influence it), then vectorize both splits with that same vocabulary.
tokenize.fit_on_texts(train_posts)
x_train, x_test = (
    tokenize.texts_to_matrix(posts) for posts in (train_posts, test_posts)
)

print(x_test)

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [56]:
# Show the dimensions of the vectorized test set.
print(x_test.shape)

(8401, 10000)


In [57]:
# Integer-encode the tags. The encoder is fitted on the training tags only;
# any test tag it has not seen is replaced by the '<unknown>' placeholder,
# which is appended to the encoder's classes so it can be transformed as well.
encoder = LabelEncoder()
encoder.fit(train_tags)

seen_tags = set(encoder.classes_)
test_tags = test_tags.map(lambda tag: tag if tag in seen_tags else '<unknown>')
encoder.classes_ = np.append(encoder.classes_, '<unknown>')

y_train, y_test = encoder.transform(train_tags), encoder.transform(test_tags)

In [58]:
# One-hot encode the labels.
#
# The class count is read directly from the encoder (the training tags plus
# the appended '<unknown>' placeholder) instead of being inferred from the
# largest training label plus a magic offset.
#
# The integer labels are re-derived from the tag Series on every run, so
# re-executing this cell never one-hot encodes data that is already one-hot
# (which previously produced a wrong `num_classes` and wrongly shaped arrays).
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(encoder.transform(train_tags), num_classes)
y_test = utils.to_categorical(encoder.transform(test_tags), num_classes)

In [59]:
# Sanity check: print the dimensions of every feature and label array.
for label, array in (('x_train shape:', x_train),
                     ('x_test shape:', x_test),
                     ('y_train shape:', y_train),
                     ('y_test shape:', y_test)):
    print(label, array.shape)

x_train shape: (33600, 10000)
x_test shape: (8401, 10000)
y_train shape: (33600, 22)
y_test shape: (8401, 22)


In [60]:
# Training hyperparameters.
# The model trains very quickly on this dataset: 2 epochs are already more
# than enough, and training for longer will likely lead to overfitting.
# Try tweaking these values when using this model with your own data.
epochs = 2
batch_size = 32

In [61]:
# Feed-forward classifier over the vectorized posts: one hidden layer of 512
# ReLU units with 50% dropout, followed by a softmax over the tag classes.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [62]:

# Train the model.
# `validation_split` tells Keras which fraction of the training data to hold
# back as a validation set. Watch val_loss while this runs: it decreases only
# slowly, and once it stops decreasing we stop training to prevent overfitting.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

Train on 30240 samples, validate on 3360 samples
Epoch 1/2
Epoch 2/2


In [63]:
# Evaluate on the held-out test set; `score` holds the loss first and the
# accuracy metric second.
score = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)
test_loss, test_accuracy = score[0], score[1]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 0.5380956962014675
Test accuracy: 0.8368051422662557


In [64]:
# Spot-check the model on the first few test posts, printing the actual and
# the predicted tag side by side.
text_labels = encoder.classes_

# Slicing (instead of indexing 0..29) keeps this cell working when the test
# set holds fewer than 30 rows, and a single batched predict call replaces
# one model.predict call per sample.
preview_rows = x_test[:30]
preview_predictions = model.predict(preview_rows)

for i, prediction in enumerate(preview_predictions):
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts.iloc[i][:50], "...")
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

set locale during app startup  i ve managed to cha ...
Actual label:android
Predicted label: android

how do i vertically and horizontally centre a div  ...
Actual label:css
Predicted label: css

why for with float step dont do last iteration   w ...
Actual label:php
Predicted label: php

how do you group select_tag and text_field_tag   i ...
Actual label:ruby-on-rails
Predicted label: ruby-on-rails

re sizing array <pre><code>int oldlength = numbers ...
Actual label:c#
Predicted label: c#

sql count of related products via foreign key  we  ...
Actual label:sql
Predicted label: sql

mysql query - how to count foreign key occurence f ...
Actual label:mysql
Predicted label: mysql

refreshing a max7129 led display  i am new to pyth ...
Actual label:python
Predicted label: python

center website content no matter what the browser  ...
Actual label:css
Predicted label: css

report generation design patterns in rails   i am  ...
Actual label:ruby-on-rails
Predicted label: ruby-on-rails

pass

In [65]:

# Class probabilities for every test post.
y_softmax = model.predict(x_test)

# Collapse the one-hot ground truth to class indices: the position of the
# first non-zero entry in each row.
y_test_1d = [np.nonzero(one_hot_row)[0].item(0) for one_hot_row in y_test]

# Collapse the predicted probabilities to the most likely class index.
y_pred_1d = [np.argmax(class_probs) for class_probs in y_softmax]

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, used for both axes in the order given.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    normalize : bool, default True
        When True (the behaviour this function always had), every row is
        divided by its sum, so each cell shows the fraction of that true
        class. When False, the raw values are plotted.

    Notes
    -----
    A row that sums to zero (a class with no true samples) is shown as zeros
    instead of being divided by zero and rendered as NaN.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Only divide rows with a non-zero total; the others keep the zeros
        # pre-filled in `out`.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        fmt = '.2f'
    else:
        # Raw counts are integers; fall back to two decimals for anything else.
        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    # Cells darker than half the maximum get white text so the annotation
    # stays readable against the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [67]:
import pickle

# Persist the fitted tokenizer, the trained model and the label array so they
# can be reloaded later without retraining.
# Each file is written inside a `with` block so the handle is flushed and
# closed even if pickling fails (the handles were previously never closed).
# NOTE(review): only unpickle these files from a trusted location --
# pickle.load can execute arbitrary code.
filename = '/home/adithya/Desktop/gtokenize'
with open(filename, 'wb') as handle:
    pickle.dump(tokenize, handle)

filename = '/home/adithya/Desktop/gNmodel'
with open(filename, 'wb') as handle:
    pickle.dump(model, handle)

filename = '/home/adithya/Desktop/gtext_labels'
with open(filename, 'wb') as handle:
    pickle.dump(text_labels, handle)

In [66]:
# Classify one ad-hoc question with the trained model.
# The input list is deliberately not named `text`: that name is the
# keras.preprocessing `text` module imported at the top of the notebook, and
# rebinding it to a list breaks any later re-run of the tokenizer cell.
sample_posts = ['what is the uses of __init__ function?']
sample_matrix = tokenize.texts_to_matrix(sample_posts)
prediction = model.predict(sample_matrix)
predicted_label = text_labels[np.argmax(prediction)]
print(predicted_label)


python
