### Baseline of the classification model using TF-IDF vectorization

In [1]:
import os
import pickle
from sklearn.svm import SVC
from sklearn.metrics import *
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from utility.utils import json_2_dataframe
from utility.utils import train_test_spliter
from utility.feature_utility import featurized_data
from classification.train import train_model, save_model

[MLENS] backend: threading


In [2]:
from classification.eval import get_confusion_matrix
from classification.eval import get_classfication_report

## Read the data and split it into train and test sets

In [3]:
# Load the chatbot corpus JSON into a DataFrame, then split it into train and test parts.
dataset = json_2_dataframe('../data/ChatbotCorpus.json')
splited_data = train_test_spliter(dataset)

### Preparing text data for classification

In [4]:
# Convert the train/test split into feature matrices and label vectors using the 'tf-idf' option.
# NOTE(review): presumably featurized_data also fits and saves the vectorizer that
# tf_idf_reader() later loads from 'model/tfidf.pkl' -- confirm.
X_train, X_test, y_train, y_test = featurized_data(splited_data, 'tf-idf')

In [5]:
def tf_idf_reader(path='model/tfidf.pkl'):
    """Load the pickled TF-IDF vectorizer from disk.

    Parameters
    ----------
    path : str, optional
        Location of the pickle file. Defaults to ``'model/tfidf.pkl'``, the
        path this function previously hard-coded, so existing zero-argument
        calls keep working.

    Returns
    -------
    object
        The object stored in the pickle file. ``predict_sample`` calls
        ``.transform`` on it.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def predict_sample(model, sample, feature='tf-idf'):
    """Vectorize one or more sentences and return ``model.predict`` for them.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    sample : str or sequence of str
        A single sentence or a collection of sentences.
    feature : str, optional
        ``'tf-idf'`` (default) vectorizes with the object returned by
        ``tf_idf_reader()``; any other value goes through ``use_vectorizer``.

    Returns
    -------
    The value returned by ``model.predict`` on the vectorized sentences.
    """
    # A lone string is wrapped so the vectorizer always receives a collection.
    sentences = [sample] if isinstance(sample, str) else sample

    if feature != 'tf-idf':
        # NOTE(review): use_vectorizer is neither defined nor imported in the
        # visible notebook cells -- confirm it is in scope before using this path.
        return model.predict(use_vectorizer(sentences))

    vectorizer = tf_idf_reader()
    return model.predict(vectorizer.transform(sentences))

    
def _print_misclassified(model, data, true_lable, feature,
                         actual_label, predicted_label, header):
    """Print every sample whose true label is ``actual_label`` but which the
    model predicted as ``predicted_label``.

    Sentences, true labels and predictions are paired positionally with
    ``zip``. Indexing ``data[count]`` instead would be a label lookup when
    ``data`` is a pandas Series with a non-contiguous index, which can raise
    ``KeyError`` or print the wrong sentence.
    """
    model_pred = predict_sample(model, data, feature)
    print(header)
    for sentence, actual, predict in zip(data, true_lable, model_pred):
        if actual == actual_label and predict == predicted_label:
            print(f"sentence : {sentence}\nActual Label : {actual}\tPredict Label : {predict}\n\n")


def get_false_positive(model, data, true_lable, feature='tf-idf'):
    """Print samples labelled 'FindConnection' that the model predicted as 'DepartureTime'.

    Parameters
    ----------
    model : object
        Fitted estimator passed through to ``predict_sample``.
    data : sequence of str
        Raw sentences, in the same order as ``true_lable``.
    true_lable : sequence
        Ground-truth labels for ``data``.
    feature : str, optional
        Vectorization option forwarded to ``predict_sample``
        (default ``'tf-idf'``).
    """
    _print_misclassified(model, data, true_lable, feature,
                         actual_label='FindConnection',
                         predicted_label='DepartureTime',
                         header="false positive sample")


def get_false_negative(model, data, true_lable, feature='tf-idf'):
    """Print samples labelled 'DepartureTime' that the model predicted as 'FindConnection'.

    Parameters
    ----------
    model : object
        Fitted estimator passed through to ``predict_sample``.
    data : sequence of str
        Raw sentences, in the same order as ``true_lable``.
    true_lable : sequence
        Ground-truth labels for ``data``.
    feature : str, optional
        Vectorization option forwarded to ``predict_sample``
        (default ``'tf-idf'``).
    """
    _print_misclassified(model, data, true_lable, feature,
                         actual_label='DepartureTime',
                         predicted_label='FindConnection',
                         header="false negative sample")

### LogisticRegression 

In [10]:
# Fit the 'logistic' model on the TF-IDF training features, print its
# confusion matrix and classification report on the test split, then save it.
clf_model = train_model('logistic', X_train, y_train)
get_confusion_matrix(clf_model, X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)
save_model(model=clf_model, filepath='model/classification/logistic_tfidf.sav')

Confusion Matrix :

[[31  4]
 [ 0 71]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       1.00      0.89      0.94        35
FindConnection       0.95      1.00      0.97        71

     micro avg       0.96      0.96      0.96       106
     macro avg       0.97      0.94      0.96       106
  weighted avg       0.96      0.96      0.96       106





#### Observation:
    - Precision for DepartureTime is 100% and recall for FindConnection is 100%.
    - The model is reliable whenever it predicts DepartureTime, but it misses 4 of the 35 DepartureTime samples (recall 0.89).

### DecisionTree

In [19]:
# Fit the 'decision_tree' model on the TF-IDF training features, print its
# confusion matrix and classification report on the test split, then save it.
clf_model = train_model('decision_tree', X_train, y_train)
get_confusion_matrix(clf_model, X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)
save_model(model=clf_model, filepath='model/classification/decision_tree_tfidf.sav')

Confusion Matrix :

[[33  2]
 [ 3 68]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.92      0.94      0.93        35
FindConnection       0.97      0.96      0.96        71

     micro avg       0.95      0.95      0.95       106
     macro avg       0.94      0.95      0.95       106
  weighted avg       0.95      0.95      0.95       106





#### Observation:
    - The decision tree performs on par with logistic regression overall (accuracy 0.95 vs. 0.96).
    - It reduces the DepartureTime samples misclassified as FindConnection (4 → 2), but at the same time it introduces 3 FindConnection samples misclassified as DepartureTime.



### KNN

In [12]:
# Fit the 'knn' model on the TF-IDF training features, print its
# confusion matrix and classification report on the test split, then save it.
clf_model = train_model('knn', X_train, y_train)
get_confusion_matrix(clf_model, X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)
save_model(model=clf_model, filepath='model/classification/knn_tfidf.sav')

Confusion Matrix :

[[35  0]
 [ 3 68]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.92      1.00      0.96        35
FindConnection       1.00      0.96      0.98        71

     micro avg       0.97      0.97      0.97       106
     macro avg       0.96      0.98      0.97       106
  weighted avg       0.97      0.97      0.97       106





#### Observation:
    - Precision for FindConnection is 100% and recall for DepartureTime is 100%.
    - The model is reliable whenever it predicts FindConnection, but it misses 3 of the 71 FindConnection samples (recall 0.96).

### Random Forest

In [13]:
# Fit the 'random_forest' model on the TF-IDF training features, print its
# confusion matrix and classification report on the test split, then save it.
clf_model = train_model('random_forest', X_train, y_train)
get_confusion_matrix(clf_model, X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)
save_model(model=clf_model, filepath='model/classification/random_forest_tfidf.sav')

Confusion Matrix :

[[31  4]
 [ 3 68]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.91      0.89      0.90        35
FindConnection       0.94      0.96      0.95        71

     micro avg       0.93      0.93      0.93       106
     macro avg       0.93      0.92      0.92       106
  weighted avg       0.93      0.93      0.93       106





#### Observation
    - Random forest does not work well with this small amount of data.
    - The error rate is higher for the DepartureTime category.

### SVC

In [14]:
# Fit the 'svm' model on the TF-IDF training features, print its
# confusion matrix and classification report on the test split, then save it.
clf_model = train_model('svm', X_train, y_train)
get_confusion_matrix(clf_model, X_test, y_test)
get_classfication_report(clf_model, X_test, y_test)
save_model(model=clf_model, filepath='model/classification/svc_tfidf.sav')

Confusion Matrix :

[[35  0]
 [ 1 70]]

 Classification Report :

                precision    recall  f1-score   support

 DepartureTime       0.97      1.00      0.99        35
FindConnection       1.00      0.99      0.99        71

     micro avg       0.99      0.99      0.99       106
     macro avg       0.99      0.99      0.99       106
  weighted avg       0.99      0.99      0.99       106





In [16]:
# List the test sentences labelled FindConnection that clf_model predicted as DepartureTime.
# clf_model is whichever model the most recently executed training cell assigned.
get_false_positive(clf_model, splited_data.test.text.values, y_test, feature='tf-idf')

false positive sample
sentence : when is the next rocket from winterstraße 12 to kieferngarte
Actual Label : FindConnection	Predict Label : DepartureTime




In [17]:
# Spot-check the current clf_model on a hand-written sentence (default 'tf-idf' features).
predict_sample(clf_model, ['when is it going'])

array(['DepartureTime'], dtype=object)

#### Observation
    - The results look good: only one test sample is misclassified.
    - If the vocabulary or the dataset grows, the TF-IDF feature matrix becomes larger and sparser,
      and the SVC takes longer to train.