#### Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle

#### Import Data

In [2]:
qn_df = pd.read_csv('Question_Classification_Dataset.csv')
qn_df = qn_df.iloc[:,1:]
qn_df.head()

Unnamed: 0,Questions,Category0,Category1,Category2
0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


### Category0 Analysis

In [3]:
qn_df1 = qn_df[['Questions', 'Category0']]
qn_df1.head()

Unnamed: 0,Questions,Category0
0,How did serfdom develop in and then leave Russ...,DESCRIPTION
1,What films featured the character Popeye Doyle ?,ENTITY
2,How can I find a list of celebrities ' real na...,DESCRIPTION
3,What fowl grabs the spotlight after the Chines...,ENTITY
4,What is the full form of .com ?,ABBREVIATION


#### Vectorization

In [4]:
qn_df1['Category Vectors'] = pd.factorize(qn_df1['Category0'])[0]
qn_df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Questions,Category0,Category Vectors
0,How did serfdom develop in and then leave Russ...,DESCRIPTION,0
1,What films featured the character Popeye Doyle ?,ENTITY,1
2,How can I find a list of celebrities ' real na...,DESCRIPTION,0
3,What fowl grabs the spotlight after the Chines...,ENTITY,1
4,What is the full form of .com ?,ABBREVIATION,2


In [5]:
vect = TfidfVectorizer(ngram_range = (1,2)).fit(qn_df1['Questions'])

In [6]:
pickle.dump(vect, open("tfidf.pickle", "wb"))

#### Train Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(qn_df1['Questions'], qn_df1['Category Vectors'], test_size=0.2, random_state=0)

In [8]:
train_vector = vect.transform(X_train)

In [9]:
test_vector = vect.transform(X_test)

#### SVM

In [10]:
model1 = SVC(kernel='linear', probability = True)

In [11]:
model1.fit(train_vector, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
pred1 = model1.predict(test_vector)

In [13]:
accuracy_score(pred1, y_test)

0.85059578368469291

#### Save Model

In [14]:
pkl_filename = "model1.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(model1, file)

#### Apply Threshold

In [15]:
max_prob, max_prob_args = [],[]

prob = model1.predict_proba(test_vector)
for i in range(len(prob)):
    max_prob.append(prob[i].max())
    if prob[i].max() > 0.8:
        max_prob_args.append(prob[i].argmax())
    else:
        max_prob_args.append(-1)

In [16]:
a = pd.DataFrame(X_test)
a['pred'] = max_prob_args
a['actual'] = y_test
a['max_prob'] = max_prob

In [17]:
b = a[a['pred'] != -1]   ### 809 out of 1091 datapoints

In [18]:
accuracy_score(b['pred'], b['actual'])

0.957286432160804

### Category2 Analysis

In [19]:
qn_df2 = qn_df[['Questions', 'Category2']]
qn_df2.head()

Unnamed: 0,Questions,Category2
0,How did serfdom develop in and then leave Russ...,manner
1,What films featured the character Popeye Doyle ?,cremat
2,How can I find a list of celebrities ' real na...,manner
3,What fowl grabs the spotlight after the Chines...,animal
4,What is the full form of .com ?,exp


#### Vectorization

In [20]:
qn_df2['Category Vectors'] = pd.factorize(qn_df2['Category2'])[0]
qn_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Questions,Category2,Category Vectors
0,How did serfdom develop in and then leave Russ...,manner,0
1,What films featured the character Popeye Doyle ?,cremat,1
2,How can I find a list of celebrities ' real na...,manner,0
3,What fowl grabs the spotlight after the Chines...,animal,2
4,What is the full form of .com ?,exp,3


#### Train Test Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(qn_df2['Questions'], qn_df2['Category Vectors'], test_size=0.2, random_state=0)

In [22]:
train_vector = vect.transform(X_train)

In [23]:
test_vector = vect.transform(X_test)

#### SVM

In [24]:
model2 = SVC(kernel='linear', probability = True)

In [25]:
model2.fit(train_vector, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
pred2 = model2.predict(test_vector)

In [27]:
accuracy_score(pred2, y_test)

0.70577451879010078

#### Save Model

In [28]:
pkl_filename = "model2.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(model2, file)

#### Apply Threshold

In [29]:
max_prob, max_prob_args = [],[]

prob = model2.predict_proba(test_vector)
for i in range(len(prob)):
    max_prob.append(prob[i].max())
    if prob[i].max() > 0.8:
        max_prob_args.append(prob[i].argmax())
    else:
        max_prob_args.append(-1)

In [30]:
a = pd.DataFrame(X_test)
a['pred'] = max_prob_args
a['actual'] = y_test
a['max_prob'] = max_prob

In [31]:
b = a[a['pred'] != -1]   ### 521 out of 1091 datapoints

In [32]:
accuracy_score(b['pred'], b['actual'])

0.962890625

### Create Reference Dictionary

In [33]:
dict_cat0 = {}

for val in qn_df1['Category0'].unique():
    dict_cat0[val] = qn_df1[qn_df1['Category0'] == val]['Category Vectors'].unique()[0]

In [34]:
dict_cat1 = {}

for val in qn_df2['Category2'].unique():
    dict_cat1[val] = qn_df2[qn_df2['Category2'] == val]['Category Vectors'].unique()[0]

#### Save Reference Dictionary

In [35]:
pkl_filename = "dict_cat0.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(dict_cat0, file)

In [36]:
pkl_filename = "dict_cat1.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(dict_cat1, file)