# Importing Data

The first step is to take the data and put it into a pandas dataframe. 

In [1]:
import pandas as pd
import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors #For word2vec static vectors
from sklearn.preprocessing import LabelEncoder

#CONSTANTS
BASE_DIR = '../../Dataset/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
MAX_NB_WORDS = 200000
EMBEDDING_DIMENSIONS = 300
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 50
WEIGHTS_PATH = 'saved_models/weights.best.tweet_classification.hdf5'
NUM_OF_RANKS = 25

#Input Data, replace empty with NaN, and then drop these NaN fields so we don't train off non-categorized data.
df = pd.read_csv("data2.csv")
df.replace(r'^\s+$', np.nan, regex=True)
df.dropna(axis=0, how="any", subset=['Category'])
data = df.drop(columns=['Date', 'Twitter Handle', 'Link', 'Reach', 'Customer Experience', 'Sentiment', 'Type', 'Total Outbound Tweets'])
data[50:60]


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Unnamed: 0,TweetText,Category
50,@azuresupport #azTechHelp,Other - MS PG
51,@azuresupport #azhelp:,Other - MS PG
52,"@AzureSupport, trying to activate a subscripti...",Subscription
53,"@justinchronicle Hi Justin, our friends @Azure...",Other - MS PG
54,@azuresupport #azTechHelp,Other - MS PG
55,@azuresupport #azTechHelp,Other - MS PG
56,@azuresupport #AHD:PT41-JL8,Other - MS PG
57,ã²ã¨ç›®ã§åˆ†ã‹ã‚‹Azure Active Directory ç¬...,Other - MS PG
58,"@AzureSupport just a heads up, but this button...",Support
59,@AzureSupport I am getting Write DomainService...,AAD


In [2]:
from sklearn import preprocessing

le = preprocessing.LabelBinarizer()


data['Category'] = pd.Categorical(data['Category'])
data['coded'] = data['Category'].cat.codes
label = np_utils.to_categorical(data['coded'].as_matrix())

coded = dict(enumerate(data['Category'].cat.categories))

#I need a list of categories to make TreeMap in d3.js
unique_categories = [x for x in coded.values()]
cat_tokens = {}
cat_values = {}

cats = le.fit_transform(data['coded'])

In [3]:


flat = df['TweetText'].tolist()
data_1 = data['TweetText'].tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(flat)
seq = tokenizer.texts_to_sequences(data_1)
print(data_1[12])
print(seq[12])

#Initialize Word2Vec as a KeyedVector as I won't be using it as an object
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
X_train, X_test, y_train, y_test = train_test_split(seq, label, test_size=0.2)
print(X_train[12])



@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!
[1, 57, 68, 320, 3, 385, 897, 9, 5, 36, 355, 60, 4, 41, 148, 5, 556, 556, 8, 1078, 339, 46, 4, 132, 6, 2185, 28, 1115, 12, 1966, 160, 52, 1404]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    1   42   22  250   24    6 2735   75  768   79  954
  159    5 2117  335   65   24  208  187]


In [4]:
word_count = tokenizer.word_counts

In [5]:
word_index = tokenizer.word_index

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIMENSIONS))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

nb_words = min(MAX_NB_WORDS, len(word_index))+1

Null word embeddings: 12101


In [12]:
from keras.layers import Input, Dense, Conv2D, GlobalMaxPooling2D, Embedding, Reshape, Activation, MaxPooling2D
from keras.models import Model
from keras import layers
import keras


input_tweet = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIMENSIONS,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)(input_tweet)
dat = Reshape((EMBEDDING_DIMENSIONS,MAX_SEQUENCE_LENGTH,1))(embedding_layer)

tower_1 = Conv2D(256, (2, 300), padding='same', activation='relu')(dat)
pool_1 = MaxPooling2D(pool_size=(4,40))(tower_1)

tower_2 = Conv2D(256, (4, 300), padding='same', activation='relu')(dat)
pool_2 = MaxPooling2D(pool_size=(4,40))(tower_2)

tower_3 = Conv2D(256, (5, 300), padding='same', activation='relu')(dat)
pool_3 = MaxPooling2D(pool_size=(4,40))(tower_3)

cat = keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)
pool = GlobalMaxPooling2D()(cat)
dense = Dense(98)(pool)
out = Activation('softmax')(dense)
model_functional = Model(inputs=input_tweet, outputs=out)
model_functional.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy'])
model_functional.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_5 (Embedding)          (None, 50, 300)       6340200     input_5[0][0]                    
____________________________________________________________________________________________________
reshape_5 (Reshape)              (None, 300, 50, 1)    0           embedding_5[0][0]                
____________________________________________________________________________________________________
conv2d_13 (Conv2D)               (None, 300, 50, 256)  153856      reshape_5[0][0]                  
___________________________________________________________________________________________

In [13]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.tweet_classification.hdf5', 
                    verbose=1, save_best_only=True)

stopper = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=1, verbose=0, mode='auto')

#X_train = np.expand_dims(X_train, axis=2)
model_functional.fit(X_train, y_train, validation_split=.2, batch_size=50, callbacks=[checkpointer, stopper], verbose=1)

Train on 12324 samples, validate on 3081 samples
Epoch 1/1

KeyboardInterrupt: 

In [None]:
model_functional.save('saved_models/classification_model.h5')

def make_cat(pred):
    #Takes output prediction Matrix, finds 3 highest values and turns them into the matching category from the previous one-hot encoding
    ind = np.argpartition(pred[0], -3)[-3:]
    #arr = pred[0][ind]
    labels = [coded[int(x)] for x in ind[::-1]]
    return labels

tweet = ['@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!']
pred = pad_sequences(tokenizer.texts_to_sequences(tweet), maxlen=MAX_SEQUENCE_LENGTH)
categ = model_functional.predict(np.array(pred))
print("The Top 3 Categories are {0}".format(make_cat(categ)))


    


In [None]:
#from keras.models import load_model

#trained_model = load_model('saved_models/classification_model.h5')
#trained_model.summary()
#trained_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy'])