# Data Pre-Processing

The first step is to take the data and put it into a pandas dataframe. Once the data is into Jupyter, we can begin the process of cleaning and transforming the data into the format our CNN will require. 

In [4]:
import pandas as pd
import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors #For word2vec static vectors
from sklearn.preprocessing import LabelEncoder

#CONSTANTS
BASE_DIR = '../../../Dataset/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
MAX_NB_WORDS = 200000
EMBEDDING_DIMENSIONS = 300
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 50
WEIGHTS_PATH = 'saved_models/weights.best.tweet_classification.hdf5'
NUM_OF_RANKS = 25
MIN_SAMPLES = 100 #Drop any category with less than 50 samples

#Input Data, replace empty with NaN, and then drop these NaN fields so we don't train off non-categorized data.
df = pd.read_csv("data2.csv")
df.replace(r'^\s+$', np.nan, regex=True)
df.dropna(axis=0, how="any", subset=['Category'])
df = df.drop(columns=['Date', 'Twitter Handle', 'Link', 'Reach', 'Customer Experience', 'Sentiment', 'Type', 'Total Outbound Tweets'])
df[50:60]


Unnamed: 0,TweetText,Category
50,@azuresupport #azTechHelp,Other - MS PG
51,@azuresupport #azhelp:,Other - MS PG
52,"@AzureSupport, trying to activate a subscripti...",Subscription
53,"@justinchronicle Hi Justin, our friends @Azure...",Other - MS PG
54,@azuresupport #azTechHelp,Other - MS PG
55,@azuresupport #azTechHelp,Other - MS PG
56,@azuresupport #AHD:PT41-JL8,Other - MS PG
57,ã²ã¨ç›®ã§åˆ†ã‹ã‚‹Azure Active Directory ç¬...,Other - MS PG
58,"@AzureSupport just a heads up, but this button...",Support
59,@AzureSupport I am getting Write DomainService...,AAD


In [5]:
from sklearn import preprocessing

le = preprocessing.LabelBinarizer()

values = df['Category'].value_counts()
tuples = [tuple((x, y)) for x, y in values.items()]

cat_to_remove = []
#Making a list of each category that has less than my defined minimum samples
for a,b in tuples:
    if b < MIN_SAMPLES:
        cat_to_remove.append(a)
    
print("Categories to Remove: {}".format(cat_to_remove))

df = df[~df.Category.isin(cat_to_remove)]

df['Category'] = pd.Categorical(df['Category'])
df['coded'] = df['Category'].cat.codes
#label = np_utils.to_categorical(data['coded'].as_matrix())


coded = dict(enumerate(df['Category'].cat.categories))

#I need a list of categories to make TreeMap in d3.js
unique_categories = [x for x in coded.values()]
num_of_cat = len(unique_categories)
print("Number of Categories: {}".format(num_of_cat))



df = df[~df.Category.isin(cat_to_remove)]
print(df[20:30])

label = np_utils.to_categorical(df['coded'].as_matrix())

cats = le.fit_transform(df['coded'])

Categories to Remove: ['Bot Framework', 'CDN', 'Azure Resources Manager', 'IOT Hub', 'VPN Gateway', 'HDInsight', 'SR Complaint', 'Container Service', 'MySQL', 'Site Recovery', 'Notification Hubs', 'Logic Apps', 'Marketplace', 'Media Services', 'Machine Learning', 'Visual Studio Application', 'Data Factory', 'Redis Cache', 'Service Fabric', 'Power BI Embedded', 'PowerShell', 'DreamSpark', 'Free Trial', 'Outage', 'Application Gateway', 'Automation', 'CosmosDB', 'Clear DB', 'Azure Stack', 'Marketing', 'Key Vault', 'Mobile Apps', 'Mobile Services', 'Azure Security Center', 'Powershell', 'Log Analytics', 'Multi-Factor Authentication', 'Imagine', 'Remote App', 'Azure Search', 'Azure DB', 'Load Balancer', 'WordPress', 'Data Lake', 'Stream Analytics', 'Traffic Manager', 'Event Hubs', 'Sign Up Issue', 'Ibizia', 'ExpressRoute', 'Azure Machine Learning', 'Azure Analysis', 'Monitoring', 'SQL Data Warehouse', 'Azure DevTest Labs', 'Compliance', 'Scheduler', 'Operational Insights', 'MSDN', 'Batch', 

In [6]:


flat = df['TweetText'].tolist()
data_1 = df['TweetText'].tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(flat)
seq = tokenizer.texts_to_sequences(data_1)
print(data_1[12])
print(seq[12])

#Initialize Word2Vec as a KeyedVector as I won't be using it as an object
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
X_train, X_test, y_train, y_test = train_test_split(seq, label, test_size=0.2)



@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!
[1, 59, 70, 317, 3, 439, 844, 9, 5, 36, 354, 60, 4, 42, 152, 5, 650, 650, 8, 1013, 339, 50, 4, 127, 6, 2006, 29, 1159, 12, 1900, 166, 54, 1450]


In [7]:
word_count = tokenizer.word_counts

In [8]:
word_index = tokenizer.word_index

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIMENSIONS))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

nb_words = min(MAX_NB_WORDS, len(word_index))+1

Null word embeddings: 10752


In [9]:
from keras.layers import Input, Dense, Conv2D, GlobalMaxPooling2D, Embedding, Reshape, Activation, MaxPooling2D, average, Dropout
from keras.models import Model
from keras import layers
from keras.metrics import top_k_categorical_accuracy
import keras

#Define Metric for matching accuracy of top 3
inTop3 = lambda x, y: top_k_categorical_accuracy(x, y, k=3)

input_tweet = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIMENSIONS,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)(input_tweet)
dat = Reshape((EMBEDDING_DIMENSIONS,MAX_SEQUENCE_LENGTH,1))(embedding_layer)

tower_1 = Conv2D(512, (300, 2), strides=1, padding='same', activation='relu')(dat)
pool_1 = MaxPooling2D(pool_size=(10,2))(tower_1)

tower_2 = Conv2D(512, (300, 4), strides=1, padding='same', activation='relu')(dat)
pool_2 = MaxPooling2D(pool_size=(10,2))(tower_2)

tower_3 = Conv2D(512, (300, 5), strides=1, padding='same', activation='relu')(dat)
pool_3 = MaxPooling2D(pool_size=(10,2))(tower_3)

tower_4 = Conv2D(512, (300,1), strides=1, padding='same', activation='relu')(dat)
pool_4 = MaxPooling2D(pool_size=(10,2))(tower_3)

#cat = keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)
cat = keras.layers.concatenate([pool_1, pool_2, pool_3, pool_4], axis=2)
conv = Conv2D(512, (1,1), strides=1, padding='same', activation='relu')(cat)
pool = GlobalMaxPooling2D()(conv)
d_1 = Dense(256)(pool)
drop = Dropout(.2)(d_1)
dense = Dense(num_of_cat)(drop)
out = Activation('softmax')(dense)
model_functional = Model(inputs=input_tweet, outputs=out)
model_functional.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy', inTop3, 'sparse_categorical_accuracy'])
model_functional.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 50)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 50, 300)       5781900     input_1[0][0]                    
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 300, 50, 1)    0           embedding_1[0][0]                
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 300, 50, 512)  307712      reshape_1[0][0]                  
___________________________________________________________________________________________

In [10]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.tweet_classification_clean.hdf5', 
                    verbose=1, save_best_only=True)

tbCall = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=True, write_images=True, embeddings_freq=10, embeddings_layer_names=None, embeddings_metadata=None)
model_functional.fit(X_train, y_train, validation_split=.2, batch_size=20, epochs=20, callbacks=[checkpointer, tbCall], verbose=1)

TypeError: __init__() got an unexpected keyword argument 'embeddings_data'

In [None]:
##### model_functional.save('saved_models/classification_model.h5')

def make_cat(pred):
    #Takes output prediction Matrix, finds 3 highest values and turns them into the matching category from the previous one-hot encoding
    ind = np.argpartition(pred[0], -3)[-3:]
    #arr = pred[0][ind]
    labels = [coded[int(x)] for x in ind[::-1]]
    return labels

tweet = ['@AzureSupport Im getting different Azure Search results in the portal vs what I get via the SDK. SDK is returning nothing when I add a filter. Any tips on debugging would be appreciated!']
pred = pad_sequences(tokenizer.texts_to_sequences(tweet), maxlen=MAX_SEQUENCE_LENGTH)
categ = model_functional.predict(np.array(pred))
print("The Top 3 Categories are {0}".format(make_cat(categ)))


    


In [None]:
#from keras.models import load_model

#trained_model = load_model('saved_models/classification_model.h5')
#trained_model.summary()
#trained_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy', 'categorical_accuracy'])