<a href="https://colab.research.google.com/github/EleonoraBaim/NPS_Dialogue_system/blob/main/NB_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and importing libraries

In [3]:
import os
import json
import numpy as np
import pandas as pd
import re
import pickle
import time
from sklearn.multioutput import MultiOutputRegressor
from sklearn.naive_bayes import MultinomialNB
from catboost import Pool, CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, hamming_loss, accuracy_score

project_path = "/content/drive/MyDrive/Colab_Notebooks/NPS_dialogue_system/"
#project_path = "/NPS/"

min_df = 1/1000
max_df = 1/20
dataset_folder = "ready_datasets/"
model_path = 'model_parts/'

In [2]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 72 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [4]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


# Functions

In [5]:
def tf_idf_transform (data, tf_idf_model):
  import tqdm

  feature_names_text = tf_idf_model.get_feature_names()
  data_text = pd.DataFrame(tf_idf_model.transform(data).toarray())
  
  data_text_names = pd.DataFrame()
  for column in tqdm.tqdm(data_text.columns):
    data_text_names[str('T ' + feature_names_text[column])] = data_text[column]
    
  data = data_text_names
  data = data.fillna('NaN')
  return data

# 1.0 Data Importing

In [6]:
#IMPORTING DATA
print('Importing data...', '\n')

train_df = pd.read_csv(str(project_path + dataset_folder + 'train_dataset.csv'))
train_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

test_df = pd.read_csv(str(project_path + dataset_folder + 'test_dataset.csv'))
test_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

val_df = pd.read_csv(str(project_path + dataset_folder + 'val_dataset.csv'))
val_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

text = 'CONTEXT'
norm_text = 'normalized'
target_list = train_df.columns[(train_df.columns!='CONTEXT')&(train_df.columns!='normalized')]

Importing data... 



# 2.0 Data Vectorization

In [7]:
print('Text Vectorization: ')

# Создаем векторизатор
tfidf_text = TfidfVectorizer(ngram_range=(1, 2), min_df=min_df, max_df=max_df)
tfidf_text.fit(train_df[norm_text])

print('Vocab size text: ' + str(len(tfidf_text.vocabulary_)))

with open(str(project_path + model_path +  'tfidf_vectorizer_text.pickle'), 'wb') as handle:
    pickle.dump(tfidf_text, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Text vectoriztor saved to ' + str(project_path + model_path +'/'+'tfidf_vectorizer_text.pickle'))

train_data = tf_idf_transform(train_df[norm_text], tfidf_text)
test_data = tf_idf_transform(test_df[norm_text], tfidf_text)
val_data = tf_idf_transform(val_df[norm_text], tfidf_text)

tfdf_cols = train_data.columns

train_data = train_data.join(train_df)
val_data = val_data.join(val_df)
test_data = test_data.join(test_df)

Text Vectorization: 
Vocab size text: 2175
Text vectoriztor saved to /content/drive/MyDrive/Colab_Notebooks/NPS_dialogue_system/model_parts//tfidf_vectorizer_text.pickle


100%|██████████| 2175/2175 [00:07<00:00, 310.32it/s]
100%|██████████| 2175/2175 [00:02<00:00, 907.54it/s]
100%|██████████| 2175/2175 [00:02<00:00, 753.54it/s]


# 3.0 Model training


## 3.1 NB

In [8]:
model_NB = MultiOutputRegressor(MultinomialNB())

start_time = time.time()
model_NB.fit(train_data[tfdf_cols], train_data[target_list])
print('Time of training: ', "--- %s seconds ---" % (time.time() - start_time))

preds_NB = model_NB.predict(test_data[tfdf_cols])

print('Classification report: \n', classification_report(test_data[target_list], preds_NB))
print('Hamming Loss: \n', hamming_loss(test_data[target_list], preds_NB))

Time of training:  --- 12.89266037940979 seconds ---
Classification report: 
               precision    recall  f1-score   support

           0       0.62      0.33      0.43       146
           1       0.83      0.39      0.53       284
           2       0.70      0.33      0.45       482
           3       0.64      0.05      0.10       133
           4       0.77      0.20      0.32       458
           5       0.93      0.29      0.45        95
           6       0.80      0.56      0.66       321
           7       0.74      0.47      0.58       525
           8       0.88      0.30      0.45       184
           9       0.81      0.27      0.40       160
          10       0.96      0.27      0.42       250
          11       0.88      0.24      0.38       123
          12       0.90      0.13      0.22       218
          13       0.86      0.23      0.36       502
          14       0.75      0.55      0.63       498
          15       0.86      0.17      0.29       181
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3.2 Cat Boost

In [9]:
task_type = 'CPU'

model_CB = CatBoostClassifier(iterations=3000, random_state=1, learning_rate=0.05, verbose=1000,
                           od_type="Iter", od_wait=200, eval_metric='HammingLoss', use_best_model=True, depth=4, loss_function='MultiCrossEntropy',task_type=task_type)

p_train = Pool(train_data[tfdf_cols], train_data[target_list])
p_val = Pool(val_data[tfdf_cols], val_data[target_list])

start_time = time.time()
model_CB.fit(p_train, eval_set=p_val)
print('Time of training: ', "--- %s seconds ---" % (time.time() - start_time))

0:	learn: 0.0403743	test: 0.0390511	best: 0.0390511 (0)	total: 4.09s	remaining: 3h 24m 33s
1000:	learn: 0.0283614	test: 0.0283666	best: 0.0283604 (998)	total: 54m	remaining: 1h 47m 50s
2000:	learn: 0.0242994	test: 0.0256099	best: 0.0256099 (2000)	total: 1h 46m 16s	remaining: 53m 3s
2999:	learn: 0.0217886	test: 0.0241444	best: 0.0241444 (2999)	total: 2h 38m 32s	remaining: 0us

bestTest = 0.02414444351
bestIteration = 2999

Time of training:  --- 9514.871947288513 seconds ---


In [10]:
model_CB.save_model(str(project_path+model_path+'cat_boost_v2.cbm'),
           format="cbm",
           export_parameters=None,
           pool=None)

In [11]:
preds_CB = model_CB.predict(test_data[tfdf_cols])

print('Classification report: ',classification_report(test_data[target_list], preds_CB))
print("Hamming Loss: ", hamming_loss(test_data[target_list], preds_CB))

Classification report:                precision    recall  f1-score   support

           0       0.79      0.53      0.63       146
           1       0.81      0.85      0.83       284
           2       0.78      0.37      0.50       482
           3       0.76      0.20      0.31       133
           4       0.83      0.32      0.46       458
           5       0.87      0.42      0.57        95
           6       0.85      0.58      0.69       321
           7       0.79      0.50      0.61       525
           8       0.85      0.86      0.86       184
           9       0.78      0.61      0.69       160
          10       0.82      0.77      0.79       250
          11       0.83      0.57      0.68       123
          12       0.74      0.44      0.55       218
          13       0.80      0.34      0.47       502
          14       0.81      0.54      0.65       498
          15       0.82      0.88      0.85       181
          16       0.56      0.07      0.13       196
   

  _warn_prf(average, modifier, msg_start, len(result))
