In [1]:
import os
import pickle
from sklearn.svm import SVC
from sklearn.metrics import *
from ner.train import custom_train
from ner.eval import ner_model_evalution
from sklearn.metrics import accuracy_score
from utility.utils import json_2_dataframe
from utility.utils import train_test_spliter
from utility.utils import spacy_data_conversion
from utility.feature_utility import featurized_data
from utility.data_augmentation_utility import fake_data_generation

In [2]:
# Read the chatbot corpus JSON through the project helper and split it.
data = json_2_dataframe('../data/ChatbotCorpus.json')
splited_data = train_test_spliter(data)

# The split result exposes its two partitions as `.train` and `.test`.
train_data = splited_data.train
test_data = splited_data.test

In [3]:
import pandas as pd
from sklearn.utils import shuffle

### Generating the fake data

In [5]:
def _entity_table(frame):
    """Concatenate one DataFrame per value of ``frame.entities`` into a single table."""
    return pd.concat([pd.DataFrame(row_entities) for row_entities in frame.entities.values])


# Generate fake samples from each partition; `training` flags which partition
# is being augmented.
train_fake_data = fake_data_generation(train_data, training=True)
test_fake_data = fake_data_generation(test_data, training=False)

# Entity tables for the actual and the fake training samples, kept separate so
# they can be compared side by side.
actual_train_ner = _entity_table(train_data)
fake_train_ner = _entity_table(train_fake_data)

### Entity counts: actual vs. fake (augmented) training samples

In [6]:
from utility.utils import corpus_entity_info

# Compare the entity tables of the actual and the fake training samples;
# col1/col2 are the labels passed for the first and second table respectively.
ent_infos = corpus_entity_info(
    actual_train_ner,
    fake_train_ner,
    col1="actual data",
    col2="augmented data",
)
ent_infos

Unnamed: 0_level_0,actual data,augmented data
entity,Unnamed: 1_level_1,Unnamed: 2_level_1
Criterion,50,309
Line,2,11
StationDest,57,331
StationStart,91,547
TimeEndTime,2,8
TimeStartTime,5,24
Vehicle,50,303


### Merge fake data with actual data

In [7]:
# Fixed seed so the shuffled row order -- and with it the order in which the
# examples are handed to the trainer -- is identical on every
# Restart Kernel -> Run All. Without it each run shuffles differently.
SHUFFLE_SEED = 42

# Stack the actual and the fake training samples, then shuffle the rows so the
# two sources are mixed instead of appearing as two consecutive runs.
trainging_merge_data = pd.concat([train_data, train_fake_data])
trainging_merge_data = shuffle(trainging_merge_data, random_state=SHUFFLE_SEED)

# Convert the merged frame with the project helper in 'train' mode; the result
# is what the training cell passes to custom_train.
formated_training_data = spacy_data_conversion(trainging_merge_data, mode='train')

In [8]:
# Build one table out of the entity records attached to every merged training row.
train_ner = pd.concat(
    [pd.DataFrame(row_entities) for row_entities in trainging_merge_data["entities"].values]
)

# Distinct values of the `entity` column, i.e. the labels handed to the trainer.
labels = train_ner["entity"].unique()

### Training the small spaCy NER model with data augmentation

In [9]:
# Train on the merged (actual + fake) data with 'en_core_web_sm' as the model
# argument, 20 iterations, and per-iteration logging turned off.
small_ner = custom_train(
    formated_training_data,
    labels,
    model='en_core_web_sm',
    output_dir='model/ner_model/small_augment_ner',
    n_iter=20,
    verbose=False,
)

Loaded model 'en_core_web_sm'
model is trained


In [10]:
# Score the trained model twice, converting each frame with the same
# 'evalution' mode first:
#   * on the merged (actual + fake) training data, and
#   * on the original test split (test_fake_data is not used here).
train_data_scorecard = ner_model_evalution(
    small_ner,
    spacy_data_conversion(trainging_merge_data, mode='evalution'),
)
test_data_scorecard = ner_model_evalution(
    small_ner,
    spacy_data_conversion(test_data, mode='evalution'),
)

### Small NER model evaluation

In [11]:
# Display the scorecard computed on the merged (actual + fake) training data.
train_data_scorecard

index,f1_score,precision_score,recall_score
overall,94.692264,94.852941,94.532131
Criterion,98.351648,97.01897,99.721448
StationStart,94.611727,94.462025,94.761905
Vehicle,92.090395,91.830986,92.351275
StationDest,93.691275,95.616438,91.842105
TimeStartTime,100.0,100.0,100.0
TimeEndTime,100.0,100.0,100.0
Line,76.190476,100.0,61.538462


In [12]:
# Display the scorecard computed on the original test split.
# NOTE(review): in the recorded output the `Line` row scores 0 on every metric,
# and the entity-count table lists only 2 actual `Line` samples in training --
# presumably too few examples to learn from; confirm before relying on this label.
test_data_scorecard

index,f1_score,precision_score,recall_score
overall,93.991416,98.206278,90.123457
StationStart,92.307692,96.774194,88.235294
StationDest,94.814815,100.0,90.140845
Criterion,96.969697,100.0,94.117647
Vehicle,95.652174,97.058824,94.285714
Line,0.0,0.0,0.0
