### Baseline of the classification model using TF-IDF vectorization

In [20]:
import os
import pickle
from sklearn.svm import SVC
from sklearn.metrics import *
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from utility.utils import json_2_dataframe
from utility.utils import train_test_spliter
from utility.utils import corpus_entity_info
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from utility.feature_utility import featurized_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Base estimator for every supported model name; train_model() wraps the chosen one
# in a GridSearchCV over the matching PARAMETER_MAPPING entry.
MODEL_MAPPING = {
    'random_forest': RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
    # solver is pinned to 'liblinear' because the grid below searches the 'l1' penalty,
    # which the newer default solver ('lbfgs') rejects -- every l1 candidate would fail to fit.
    'logistic': LogisticRegression(C=10, solver='liblinear', random_state=42),
    'svm': SVC(C=10),
    'knn': KNeighborsClassifier(),
    # Seeded like the other estimators so tie-breaking between equally good splits
    # is reproducible across runs.
    'decision_tree': DecisionTreeClassifier(random_state=42),
}

# Hyper-parameter grids, keyed by the same names as MODEL_MAPPING.
# An empty grid (knn) makes GridSearchCV cross-validate the estimator's own settings only.
PARAMETER_MAPPING = {
    'random_forest': {'n_estimators': list(range(10, 20)), 'max_depth': [3]},
    'logistic': {'penalty': ('l1', 'l2'), 'C': [5, 10]},
    'svm': {'kernel': ('linear', 'rbf'), 'C': [5, 10]},
    'knn': {},
    'decision_tree': {'max_depth': [3], 'min_samples_leaf': [2, 3, 4, 5]},
}

## Read the data and split it into train and test sets

In [3]:
# Load the chatbot corpus from JSON and split it into train/test partitions.
# NOTE(review): both helpers live in utility.utils (not visible here); a later cell reads
# `splited_data.test.text`, so the split object is expected to expose a `.test` frame.
dataset = json_2_dataframe('../data/ChatbotCorpus.json')
splited_data  = train_test_spliter(dataset)

### Preparing text data for classification

In [4]:
# Turn the split into feature matrices and label vectors using the 'tf-idf' option.
# NOTE(review): presumably this also fits/persists the vectorizer that tf_idf_reader()
# later loads from 'model/tfidf.pkl' -- confirm in utility.feature_utility.
X_train, X_test, y_train, y_test = featurized_data(splited_data, 'tf-idf')

In [11]:
def train_model(model_name, X_train, y_train):
    """Grid-search the estimator registered under ``model_name`` and return the fitted search.

    Parameters:
        model_name: key into ``MODEL_MAPPING`` / ``PARAMETER_MAPPING``.
        X_train: training features, passed unchanged to ``GridSearchCV.fit``.
        y_train: training labels, passed unchanged to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object (5-fold cross-validation), or ``None`` when
        ``model_name`` is not registered; in that case a hint listing the valid names
        is printed.
    """
    if model_name not in MODEL_MAPPING:
        # The message previously referenced an undefined lowercase `model_mapping`,
        # so an unknown name raised NameError instead of printing this hint.
        print(f"please pass the model name one of these : {list(MODEL_MAPPING.keys())}")
        return None

    clf = GridSearchCV(MODEL_MAPPING[model_name], PARAMETER_MAPPING[model_name], cv=5)
    clf.fit(X_train, y_train)
    return clf
        
def get_classfication_report(model, X_test, y_test):
    """Print a header followed by ``classification_report`` of ``model``'s predictions.

    ``model`` only needs a ``predict`` method; ``y_test`` holds the true labels for
    ``X_test``. Nothing is returned.
    """
    print('\n Classification Report :\n')
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)
    
def tf_idf_reader(path='model/tfidf.pkl'):
    """Load and return the object pickled at ``path``.

    Parameters:
        path: location of the pickle file. Defaults to 'model/tfidf.pkl', the
            location this notebook has always read from.

    Returns:
        The unpickled object. predict_sample() calls ``.transform`` on it, so it is
        expected to be a fitted vectorizer.

    Raises:
        OSError: if the file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the file;
    # only point this at artifacts produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def predict_sample(model, sample, feature='tf-idf'):
    """Vectorize ``sample`` with the stored vectorizer and return ``model``'s prediction.

    ``sample`` may be a single string or a collection of strings; a single string is
    wrapped in a one-element list before transformation. ``feature`` is accepted
    but not used by this function.
    """
    vectorizer = tf_idf_reader()
    documents = [sample] if isinstance(sample, str) else sample
    return model.predict(vectorizer.transform(documents))

def get_confusion_matrix(model,X_test, y_test):
    """Print a header followed by ``confusion_matrix`` of ``model``'s predictions.

    ``model`` only needs a ``predict`` method; ``y_test`` holds the true labels for
    ``X_test``. Nothing is returned.
    """
    print('Confusion Matrix :\n')
    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)
    
def get_false_positive(model, data, true_lable,
                       actual_label='FindConnection', predicted_label='DepartureTime'):
    """Print every sample whose true label is ``actual_label`` but is predicted as ``predicted_label``.

    Parameters:
        model: fitted classifier, forwarded to predict_sample().
        data: the raw sentences, in the same order as ``true_lable``.
        true_lable: the true labels for ``data``.
        actual_label: true class to inspect (default 'FindConnection').
        predicted_label: wrong class the model assigned (default 'DepartureTime').

    Returns:
        None; matching samples are printed.
    """
    model_pred = predict_sample(model, data)
    print("false positive sample")
    # Iterate the sentences positionally with zip instead of indexing `data[count]`:
    # indexing a pandas Series looks up by label and breaks on a non-default index.
    for sentence, actual, predict in zip(data, true_lable, model_pred):
        if actual == actual_label and predict == predicted_label:
            print(f"sentence : {sentence}\nActual Label : {actual}\tPredict Label : {predict}\n\n")
    

### LogisticRegression 

In [6]:
# Grid-search the 'logistic' entry on the training data, then print the confusion
# matrix and classification report for the held-out test data.
clf_model = train_model('logistic',X_train, y_train)
get_confusion_matrix(clf_model,X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)

Confusion Matrix :

[[29  6]
 [ 0 71]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       1.00      0.83      0.91        35
FindConnection       0.92      1.00      0.96        71

     micro avg       0.94      0.94      0.94       106
     macro avg       0.96      0.91      0.93       106
  weighted avg       0.95      0.94      0.94       106





#### Observation:
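    - Precision for DepartureTime is 100% and recall for FindConnection is 100%.
    - The model generalized properly for DepartureTime in the sense that every DepartureTime prediction is correct, although it misses 6 of the 35 DepartureTime samples (recall 0.83).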

### DecisionTree

In [7]:
# Same procedure for the 'decision_tree' entry; note that `clf_model` is rebound here,
# so the previously trained model is no longer reachable.
clf_model = train_model('decision_tree',X_train, y_train)
get_confusion_matrix(clf_model,X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)

Confusion Matrix :

[[33  2]
 [ 0 71]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       1.00      0.94      0.97        35
FindConnection       0.97      1.00      0.99        71

     micro avg       0.98      0.98      0.98       106
     macro avg       0.99      0.97      0.98       106
  weighted avg       0.98      0.98      0.98       106





#### Observation:
    - The decision tree classifier performs better than logistic regression.
    - It actually reduces the false positives: DepartureTime samples predicted as FindConnection drop from 6 to 2.



### knn

In [8]:
# Same procedure for the 'knn' entry. Its parameter grid is empty, so the search only
# cross-validates the estimator as configured in MODEL_MAPPING.
clf_model = train_model('knn',X_train, y_train)
get_confusion_matrix(clf_model,X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)

Confusion Matrix :

[[35  0]
 [ 3 68]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.92      1.00      0.96        35
FindConnection       1.00      0.96      0.98        71

     micro avg       0.97      0.97      0.97       106
     macro avg       0.96      0.98      0.97       106
  weighted avg       0.97      0.97      0.97       106



#### Observation:
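    - Precision for FindConnection is 100% and recall for DepartureTime is 100%.
    - The model generalized properly for FindConnection in the sense that every FindConnection prediction is correct, although it misses 3 of the 71 FindConnection samples (recall 0.96).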

### Random Forest

In [9]:
# Same procedure for the 'random_forest' entry (grid over n_estimators 10..19, max_depth 3).
clf_model = train_model('random_forest',X_train, y_train)
get_confusion_matrix(clf_model,X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)

Confusion Matrix :

[[33  2]
 [ 3 68]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.92      0.94      0.93        35
FindConnection       0.97      0.96      0.96        71

     micro avg       0.95      0.95      0.95       106
     macro avg       0.94      0.95      0.95       106
  weighted avg       0.95      0.95      0.95       106





#### Observation
    - Random forest does not work well with this small amount of data.
    - The error rate is higher when predicting the DepartureTime category.

### SVC

In [10]:
# Same procedure for the 'svm' entry (grid over linear/rbf kernels and C in {5, 10}).
# `clf_model` keeps this SVC search; the cells below reuse it.
clf_model = train_model('svm',X_train, y_train)
get_confusion_matrix(clf_model,X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)

Confusion Matrix :

[[35  0]
 [ 1 70]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.97      1.00      0.99        35
FindConnection       1.00      0.99      0.99        71

     micro avg       0.99      0.99      0.99       106
     macro avg       0.99      0.99      0.99       106
  weighted avg       0.99      0.99      0.99       106





In [14]:
# Print the test sentences whose true label is FindConnection but which the current
# `clf_model` (the SVC search trained above) predicts as DepartureTime.
# `.values` passes a plain array so the helper can index sentences by position.
get_false_positive(clf_model, splited_data.test.text.values, y_test)

false positive sample
sentence : when is the next rocket from winterstraße 12 to kieferngarte
Actual Label : FindConnection	Predict Label : DepartureTime




In [19]:
# Classify one ad-hoc sentence with the current `clf_model`; the bare last expression
# displays the predicted label array.
predict_sample(clf_model,['when is it going'])

array(['DepartureTime'], dtype=object)

#### Observation
    - The results seem pretty decent.
    - If the vocabulary size or the dataset grows, the training data becomes larger and sparser,
      and SVC takes longer to train.