# Importing Data

The first step is to take the data and put it into a pandas dataframe. We will then remove irrelevant columsn from the dataframe. 

In [10]:
import pandas as pd
import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors #For word2vec static vectors
from sklearn.preprocessing import LabelEncoder

#CONSTANTS
BASE_DIR = '../../Dataset/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
MAX_NB_WORDS = 200000
EMBEDDING_DIMENSIONS = 300
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 50
WEIGHTS_PATH = 'saved_models/weights.best.tweet_classification.hdf5'
NUM_OF_RANKS = 25
MIN_FREQ = 50 #Drop any category with less than 50 samples

#Input Data, replace empty with NaN, and then drop these NaN fields so we don't train off non-categorized data.
df = pd.read_csv("data2.csv")
df.replace(r'^\s+$', np.nan, regex=True)
df.dropna(axis=0, how="any", subset=['Category'])
df = df.drop(columns=['Date', 'Twitter Handle', 'Link', 'Reach', 'Customer Experience', 'Sentiment', 'Type', 'Total Outbound Tweets'])
df[50:60]


Unnamed: 0,TweetText,Category
50,@azuresupport #azTechHelp,Other - MS PG
51,@azuresupport #azhelp:,Other - MS PG
52,"@AzureSupport, trying to activate a subscripti...",Subscription
53,"@justinchronicle Hi Justin, our friends @Azure...",Other - MS PG
54,@azuresupport #azTechHelp,Other - MS PG
55,@azuresupport #azTechHelp,Other - MS PG
56,@azuresupport #AHD:PT41-JL8,Other - MS PG
57,ã²ã¨ç›®ã§åˆ†ã‹ã‚‹Azure Active Directory ç¬...,Other - MS PG
58,"@AzureSupport just a heads up, but this button...",Support
59,@AzureSupport I am getting Write DomainService...,AAD


In [16]:
from sklearn import preprocessing

le = preprocessing.LabelBinarizer()

values = df['Category'].value_counts()
tuples = [tuple((x, y)) for x, y in values.items()]

cat_to_remove = []
#Making a list of each category that has less than my defined minimum samples
for a,b in tuples:
    if b < MIN_FREQ:
        cat_to_remove.append(a)
    
print("Categories to Remove: {}".format(cat_to_remove))

df = df[~df.Category.isin(cat_to_remove)]

df['Category'] = pd.Categorical(df['Category'])
df['coded'] = df['Category'].cat.codes
#label = np_utils.to_categorical(data['coded'].as_matrix())


coded = dict(enumerate(df['Category'].cat.categories))

#I need a list of categories to make TreeMap in d3.js
unique_categories = [x for x in coded.values()]
num_of_cat = len(unique_categories)
print("Number of Categories: {}".format(num_of_cat))



df = df[~df.Category.isin(cat_to_remove)]
print(df[20:30])

label = np_utils.to_categorical(df['coded'].as_matrix())

cats = le.fit_transform(df['coded'])

Categories to Remove: []
Number of Categories: 42
                                            TweetText  \
20  @azuresupport #azTechHelp we have no connectio...   
21  @AzureSupport Hi! Migrated a Kali Linux VM fro...   
22  @azuresupport #azhelp:\nWhats happening with c...   
23  @AzureSupport @andrewwatt BTW, its been logged...   
24  @azuresupport #azTechHelp The portal is having...   
25  @AzureSupport trying to upload my local window...   
26                          @azuresupport #azTechHelp   
27  @azuresupport #azTechHelp Estou utilizando um ...   
28  @TheRegister are you aware of an Azure VM outa...   
29  .@Azure Maybe raise awareness on this? All App...   

                       Category  coded  
20                           VM     36  
21                           VM     36  
22                           VM     36  
23                      Support     35  
24                       Backup      7  
25                           VM     36  
26                Other - MS PG    

In [17]:


flat = df['TweetText'].tolist()
data_1 = df['TweetText'].tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(flat)
seq = tokenizer.texts_to_sequences(data_1)
print(data_1[12])
print(seq[12])

#Initialize Word2Vec as a KeyedVector as I won't be using it as an object
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
X_train, X_test, y_train, y_test = train_test_split(seq, label, test_size=0.2)



@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!
[1, 59, 69, 323, 3, 462, 909, 9, 5, 36, 342, 61, 4, 42, 153, 5, 600, 600, 8, 1073, 345, 48, 4, 131, 6, 2101, 28, 1074, 12, 1890, 160, 52, 1326]


In [18]:
word_count = tokenizer.word_counts

In [19]:
word_index = tokenizer.word_index

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIMENSIONS))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

nb_words = min(MAX_NB_WORDS, len(word_index))+1

Null word embeddings: 11441


In [30]:
from keras.layers import Input, Dense, Conv2D, GlobalMaxPooling2D, Embedding, Reshape, Activation, MaxPooling2D, average
from keras.models import Model
from keras import layers
from keras.metrics import top_k_categorical_accuracy
import keras

#Define Metric for matching accuracy of top 3
inTop3 = lambda x, y: top_k_categorical_accuracy(x, y, k=3)

input_tweet = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIMENSIONS,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)(input_tweet)
dat = Reshape((EMBEDDING_DIMENSIONS,MAX_SEQUENCE_LENGTH,1))(embedding_layer)

tower_1 = Conv2D(512, (300, 2), strides=1, padding='same', activation='relu')(dat)
pool_1 = MaxPooling2D(pool_size=(10,2))(tower_1)

tower_2 = Conv2D(512, (300, 4), strides=1, padding='same', activation='relu')(dat)
pool_2 = MaxPooling2D(pool_size=(10,2))(tower_2)

tower_3 = Conv2D(512, (300, 5), strides=1, padding='same', activation='relu')(dat)
pool_3 = MaxPooling2D(pool_size=(10,2))(tower_3)

#cat = keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)
cat = keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)
pool = GlobalMaxPooling2D()(cat)
d_1 = Dense(256)(pool)
dense = Dense(num_of_cat)(pool)
out = Activation('softmax')(dense)
model_functional = Model(inputs=input_tweet, outputs=out)
model_functional.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy', inTop3])
model_functional.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_9 (Embedding)          (None, 50, 300)       6071100     input_9[0][0]                    
____________________________________________________________________________________________________
reshape_9 (Reshape)              (None, 300, 50, 1)    0           embedding_9[0][0]                
____________________________________________________________________________________________________
conv2d_25 (Conv2D)               (None, 300, 50, 512)  307712      reshape_9[0][0]                  
___________________________________________________________________________________________

In [28]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.tweet_classification_clean.hdf5', 
                    verbose=1, save_best_only=True)

stopper = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3, verbose=0, mode='auto')

#X_train = np.expand_dims(X_train, axis=2)
model_functional.fit(X_train, y_train, validation_split=.2, batch_size=25, callbacks=[checkpointer, stopper], verbose=1)

Train on 11592 samples, validate on 2898 samples
Epoch 1/1


<keras.callbacks.History at 0x1640b66f208>

In [None]:
model_functional.save('saved_models/classification_model.h5')

def make_cat(pred):
    #Takes output prediction Matrix, finds 3 highest values and turns them into the matching category from the previous one-hot encoding
    ind = np.argpartition(pred[0], -3)[-3:]
    #arr = pred[0][ind]
    labels = [coded[int(x)] for x in ind[::-1]]
    return labels

tweet = ['@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!']
pred = pad_sequences(tokenizer.texts_to_sequences(tweet), maxlen=MAX_SEQUENCE_LENGTH)
categ = model_functional.predict(np.array(pred))
print("The Top 3 Categories are {0}".format(make_cat(categ)))


    


In [None]:
#from keras.models import load_model

#trained_model = load_model('saved_models/classification_model.h5')
#trained_model.summary()
#trained_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy'])