# This script tries to predict the top 5 categories in GEAR

Imports for libraries and data

In [34]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# CSV file that is loaded into `df` below.
input_file = "service_now_sample.csv"
# NOTE: the yes -> 1 conversion could be done directly in read_csv.
# The first row of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer=input_file, header=0)

We only need `short_description`, `U_Category_Match`, and `u_category_gear` to get the categories.

In [37]:
# Keep the description text and its GEAR category, replacing NaN with ''.
gear_df = df[['short_description', 'u_category_gear']].replace(
    np.nan, '', regex=True
)

# Indicator column name -> exact `u_category_gear` value it flags.
category_labels = {
    'eOffer/eMod': 'eOffer/eMod - Electronic Offers/Electronic Modifications',
    'VCSS': 'VCSS - Vendor Customer Self Service',
    'AnyConnectWindows': 'Cisco AnyConnect Windows Client 3.1',
    'EASi': 'EASi - Electronic Acquisition System Integration',
    'google_email': 'Google Email',
}

# Add one 0/1 column per application: 1 where the row's category matches
# the application's full category name exactly, otherwise 0.
for column, category in category_labels.items():
    gear_df[column] = np.where(gear_df['u_category_gear'] == category, 1, 0)


In [41]:
# Vectorize the descriptions into a term-count matrix, ignoring the
# built-in English stop-word list.
descriptions = gear_df['short_description']
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(descriptions)
# Displayed as the cell output: (number of rows, vocabulary size).
X_train_counts.shape

(83933, 21967)

In [42]:
# Re-weight the raw term counts with TF-IDF.
# NOTE(review): X_train_tfidf is not referenced by the training loop later
# in this file, which fits on X_train_counts -- confirm which is intended.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
# Displayed as the cell output; same shape as the count matrix.
X_train_tfidf.shape


(83933, 21967)

In [43]:
# The most frequently appearing apps in the data; each name is also the
# indicator column in gear_df that serves as the binary target.
top_apps = ['eOffer/eMod', 'VCSS', 'AnyConnectWindows', 'EASi', 'google_email']

# Train one binary Naive Bayes classifier per application on the raw term
# counts, report its accuracy, and store its predictions.
# NOTE(review): the model is scored on the same rows it was fitted on, so
# the printed figure is training accuracy, not held-out accuracy.
for app in top_apps:
    # 1-D array of the 0/1 labels for this application.
    labels = gear_df[[app]].values.ravel()

    text_clf = MultinomialNB()
    text_clf.fit(X_train_counts, labels)
    predicted = text_clf.predict(X_train_counts)

    # Fraction of rows where the prediction equals the label.
    print(app, np.mean(predicted == labels))

    # NOTE(review): this overwrites the true indicator column with the
    # model's predictions; the original labels are lost from gear_df.
    gear_df[app] = predicted




eOffer/eMod 0.983784685403834
VCSS 0.9886099627083507
AnyConnectWindows 0.9766003836393313
EASi 0.9895273611094564
google_email 0.979173864868407


In [44]:
# Write the descriptions, categories and per-application predictions to
# disk; the DataFrame index is included as the first column (pandas default).
gear_df.to_csv(path_or_buf='category_predictions.csv')