# libs

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from pycaret.classification import *

# product metric

- **продукт**: чат-бот для психологической поддержки (можно взять готовую модель bloom https://colab.research.google.com/drive/1Ervk6HPNS6AYVr3xVdQnY5a-TjjmLCdQ). каждый ответ человека классифицируется по эмоциям. в итоге, мы можем посчитать какая эмоция превалировала в процентном соотношении за каждый разговор/ день/ неделю. эту информацию можно как превращать в столбчатые диаграммы с изменяемым периодом, чтобы человек сам видел как он себя чувствует, как его настроение меняется в каких ситуациях, так и для бизнес-метрики (ретеншен), т к она вероятнее всего будет коррелировать с тем, меняется ли настроение человека к концу разговора с ботом относительно начала. например, любой разговор (коммуникация, перерывы в которой не превышают 1 час) будем делить на 4 равные части из сообщений, по первой замерять преобладающее настроение человека в момент обращения к боту, а по последней - настроение после "работы" с ботом. если процент "хороших" эмоций повышается, то значит что бот помогает справляться с тяжелым состоянием, и ожидается, что такие люди будут с большей вероятностью возвращаться к использованию продукта. также это в целом полезный показатель удовлетворенности клиента в случае с таким нетиповым продуктом. 
- **задача ML**: многоклассовая классификация
- **целевая переменная**: метки эмоций
- **бизнесовая метрика**: выдвигаем гипотезу, что на RR влияет процент "хороших" эмоций к концу диалога с ботом относительно начала диалога
- **ML-метрики**: средняя F-мера, индивидуальные AUC-PRC и AUC-ROC

# data

labels:
 - 0 > sadness
 - 1 > joy
 - 2 > love
 - 3 > anger
 - 4 > fear
 - 5 > surprise

In [2]:
df_train = pd.read_csv('data/training.csv')
df_test = pd.read_csv('data/test.csv')
df_val = pd.read_csv('data/validation.csv')

In [8]:
df_train.label.value_counts()

1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: label, dtype: int64

In [6]:
df_test.label.value_counts()

1    695
0    581
3    275
4    224
2    159
5     66
Name: label, dtype: int64

In [7]:
df_val.label.value_counts()

1    704
0    550
3    275
4    212
2    178
5     81
Name: label, dtype: int64

In [3]:
df_train.label.value_counts()

1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: label, dtype: int64

In [9]:
vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)

X_train = vect.fit_transform(df_train.text.values)
X_test = vect.transform(df_test.text.values)
X_val = vect.transform(df_val.text.values)

In [5]:
X_train.shape

(16000, 15186)

In [13]:
tsvd = TruncatedSVD(n_components=10)
X_train_pca = tsvd.fit_transform(X_train)
X_test_pca = tsvd.transform(X_test)
X_val_pca = tsvd.transform(X_val)

In [14]:
train = pd.concat([pd.DataFrame(X_train_pca),df_train.label],axis=1)
test = pd.concat([pd.DataFrame(X_test_pca),df_test.label],axis=1)
val = pd.concat([pd.DataFrame(X_val_pca),df_val.label],axis=1)

# baseline

In [32]:
pd.concat([train[:100],test[:20]])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,0.064295,-0.063298,0.055273,0.018179,-0.041933,-0.002546,0.040155,0.043644,-0.018218,0.002119,0
1,0.188749,0.025860,0.079407,0.096769,-0.071298,0.026928,-0.126673,-0.029908,-0.012344,-0.038522,0
2,0.094186,0.042580,0.114990,-0.099052,0.010857,-0.003603,-0.041074,0.035312,0.033833,-0.015461,3
3,0.229839,0.064610,-0.054677,0.078330,0.079268,-0.009854,0.165113,-0.179847,0.051899,0.010193,2
4,0.103607,0.250481,-0.025022,0.230799,0.158933,-0.158567,0.062794,-0.022759,-0.030968,-0.056590,3
...,...,...,...,...,...,...,...,...,...,...,...
15,0.136640,0.231361,-0.022152,-0.039700,-0.012256,0.198636,0.025453,0.066841,-0.045357,-0.027142,0
16,0.085233,0.050516,0.102207,0.006409,-0.047244,-0.038810,0.070670,0.070791,0.015270,0.049842,1
17,0.218618,-0.077997,0.027761,-0.021665,0.018544,-0.042400,-0.038324,0.159485,0.004024,-0.040020,0
18,0.127300,-0.043211,0.007999,-0.012350,-0.002555,0.018594,0.007719,-0.013600,0.071187,-0.013248,3


In [38]:
tte = test[:20]

In [41]:
tte.index = range(100,120)
tte

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
100,0.16388,0.27235,0.242446,-0.020444,-0.068234,-0.041986,-0.070683,-0.051176,-0.020481,0.017386,0
101,0.114175,0.061004,0.103031,-0.11121,-0.051528,-0.030802,-0.00528,0.04151,0.034446,-0.027666,0
102,0.189592,-0.114625,0.05865,-0.033518,0.066533,0.032073,-0.038949,0.10918,-0.058682,-0.031581,0
103,0.147417,0.072203,-0.079598,-0.011048,-0.04391,-0.075704,0.001773,0.103432,0.023868,-0.017379,1
104,0.169207,0.18644,-0.078553,0.041872,-0.020063,0.158771,0.076677,0.048596,-0.098654,0.10239,0
105,0.087093,-0.036787,0.053327,-0.010109,0.049951,-0.029737,0.014871,-0.015254,-0.006525,-0.018181,4
106,0.105015,-0.006621,-0.116554,-0.05405,-0.071017,-0.045596,0.024406,-0.052568,-0.006811,-0.040433,3
107,0.192887,0.036153,-0.143755,-0.012908,-0.049269,-0.000688,-0.047508,0.008819,-0.026253,0.046439,1
108,0.207263,0.005394,-0.035449,-0.048071,0.106426,0.019772,-0.066214,0.032219,0.031674,0.080094,1
109,0.223096,0.07312,0.006107,0.01324,-0.00885,-0.063909,0.053235,-0.025978,0.079963,0.017468,3


In [40]:
tte

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,0.16388,0.27235,0.242446,-0.020444,-0.068234,-0.041986,-0.070683,-0.051176,-0.020481,0.017386,0
1,0.114175,0.061004,0.103031,-0.11121,-0.051528,-0.030802,-0.00528,0.04151,0.034446,-0.027666,0
2,0.189592,-0.114625,0.05865,-0.033518,0.066533,0.032073,-0.038949,0.10918,-0.058682,-0.031581,0
3,0.147417,0.072203,-0.079598,-0.011048,-0.04391,-0.075704,0.001773,0.103432,0.023868,-0.017379,1
4,0.169207,0.18644,-0.078553,0.041872,-0.020063,0.158771,0.076677,0.048596,-0.098654,0.10239,0
5,0.087093,-0.036787,0.053327,-0.010109,0.049951,-0.029737,0.014871,-0.015254,-0.006525,-0.018181,4
6,0.105015,-0.006621,-0.116554,-0.05405,-0.071017,-0.045596,0.024406,-0.052568,-0.006811,-0.040433,3
7,0.192887,0.036153,-0.143755,-0.012908,-0.049269,-0.000688,-0.047508,0.008819,-0.026253,0.046439,1
8,0.207263,0.005394,-0.035449,-0.048071,0.106426,0.019772,-0.066214,0.032219,0.031674,0.080094,1
9,0.223096,0.07312,0.006107,0.01324,-0.00885,-0.063909,0.053235,-0.025978,0.079963,0.017468,3


In [42]:
clf = setup(data=train[:100], test_data=tte, target='label', fold=3)

Unnamed: 0,Description,Value
0,Session id,4610
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(120, 11)"
4,Transformed data shape,"(120, 11)"
5,Transformed train set shape,"(100, 11)"
6,Transformed test set shape,"(20, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


In [43]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.4201,0.0,0.4201,0.2886,0.3139,0.1324,0.1843,1.38
ada,Ada Boost Classifier,0.4002,0.5422,0.4002,0.2918,0.3183,0.1437,0.1632,1.68
qda,Quadratic Discriminant Analysis,0.399,0.5499,0.399,0.2941,0.3287,0.1343,0.1434,1.1033
lightgbm,Light Gradient Boosting Machine,0.3696,0.5719,0.3696,0.3273,0.3396,0.132,0.1384,4.52
knn,K Neighbors Classifier,0.3607,0.5599,0.3607,0.2563,0.2957,0.0939,0.0986,5.6233
et,Extra Trees Classifier,0.3506,0.5681,0.3506,0.2793,0.3083,0.0923,0.0959,2.1467
rf,Random Forest Classifier,0.3503,0.558,0.3503,0.2824,0.3071,0.0825,0.0884,2.1633
lr,Logistic Regression,0.35,0.5897,0.35,0.1226,0.1816,0.0,0.0,15.7367
dummy,Dummy Classifier,0.35,0.5,0.35,0.1226,0.1816,0.0,0.0,1.2
lda,Linear Discriminant Analysis,0.3307,0.5511,0.3307,0.2808,0.2967,0.0785,0.0815,1.4967


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize='deprecated', positive=False,
                random_state=4610, solver='auto', tol=0.001)

In [20]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.3917,0.0,0.3917,0.2457,0.2899,0.0945,0.1158,1.02
lr,Logistic Regression,0.35,0.5681,0.35,0.2893,0.2148,0.0172,0.0542,14.39
rf,Random Forest Classifier,0.35,0.5568,0.35,0.2855,0.3129,0.0955,0.0984,2.7067
qda,Quadratic Discriminant Analysis,0.35,0.5385,0.35,0.2725,0.2974,0.0665,0.0709,1.27
dummy,Dummy Classifier,0.3417,0.5,0.3417,0.1169,0.1741,0.0,0.0,1.1333
knn,K Neighbors Classifier,0.3333,0.5359,0.3333,0.2556,0.2858,0.0674,0.0711,5.4267
xgboost,Extreme Gradient Boosting,0.3333,0.5623,0.3333,0.3168,0.3228,0.0984,0.0995,1.77
et,Extra Trees Classifier,0.325,0.5266,0.325,0.2719,0.2943,0.0646,0.0661,2.38
lda,Linear Discriminant Analysis,0.3167,0.547,0.3167,0.2797,0.293,0.0551,0.0566,1.24
svm,SVM - Linear Kernel,0.3083,0.0,0.3083,0.227,0.2539,0.0524,0.0573,1.0533


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize='deprecated', positive=False,
                random_state=8085, solver='auto', tol=0.001)

In [27]:
get_config('X_test_transformed')[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.064295,-0.063298,0.055273,0.018179,-0.041933,-0.002546,0.040155,0.043644,-0.018218,0.002119
0,0.16388,0.27235,0.242446,-0.020444,-0.068234,-0.041986,-0.070683,-0.051176,-0.020481,0.017386
1,0.188749,0.02586,0.079407,0.096769,-0.071298,0.026928,-0.126673,-0.029908,-0.012344,-0.038522
1,0.114175,0.061004,0.103031,-0.11121,-0.051528,-0.030802,-0.00528,0.04151,0.034446,-0.027666
2,0.094186,0.04258,0.11499,-0.099052,0.010857,-0.003603,-0.041074,0.035312,0.033833,-0.015461


In [28]:
train[:100][:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,0.064295,-0.063298,0.055273,0.018179,-0.041933,-0.002546,0.040155,0.043644,-0.018218,0.002119,0
1,0.188749,0.02586,0.079407,0.096769,-0.071298,0.026928,-0.126673,-0.029908,-0.012344,-0.038522,0
2,0.094186,0.04258,0.11499,-0.099052,0.010857,-0.003603,-0.041074,0.035312,0.033833,-0.015461,3
3,0.229839,0.06461,-0.054677,0.07833,0.079268,-0.009854,0.165113,-0.179847,0.051899,0.010193,2
4,0.103607,0.250481,-0.025022,0.230799,0.158933,-0.158567,0.062794,-0.022759,-0.030968,-0.05659,3


In [29]:
get_config('X_train_transformed')[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.064295,-0.063298,0.055273,0.018179,-0.041933,-0.002546,0.040155,0.043644,-0.018218,0.002119
0,0.16388,0.27235,0.242446,-0.020444,-0.068234,-0.041986,-0.070683,-0.051176,-0.020481,0.017386
1,0.188749,0.02586,0.079407,0.096769,-0.071298,0.026928,-0.126673,-0.029908,-0.012344,-0.038522
1,0.114175,0.061004,0.103031,-0.11121,-0.051528,-0.030802,-0.00528,0.04151,0.034446,-0.027666
2,0.094186,0.04258,0.11499,-0.099052,0.010857,-0.003603,-0.041074,0.035312,0.033833,-0.015461


In [30]:
test[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,0.16388,0.27235,0.242446,-0.020444,-0.068234,-0.041986,-0.070683,-0.051176,-0.020481,0.017386,0
1,0.114175,0.061004,0.103031,-0.11121,-0.051528,-0.030802,-0.00528,0.04151,0.034446,-0.027666,0
2,0.189592,-0.114625,0.05865,-0.033518,0.066533,0.032073,-0.038949,0.10918,-0.058682,-0.031581,0
3,0.147417,0.072203,-0.079598,-0.011048,-0.04391,-0.075704,0.001773,0.103432,0.023868,-0.017379,1
4,0.169207,0.18644,-0.078553,0.041872,-0.020063,0.158771,0.076677,0.048596,-0.098654,0.10239,0


In [21]:
get_config()

{'USI',
 'X',
 'X_test',
 'X_test_transformed',
 'X_train',
 'X_train_transformed',
 'X_transformed',
 '_available_plots',
 '_ml_usecase',
 'data',
 'dataset',
 'dataset_transformed',
 'exp_id',
 'exp_name_log',
 'fix_imbalance',
 'fold_generator',
 'fold_groups_param',
 'fold_shuffle_param',
 'gpu_n_jobs_param',
 'gpu_param',
 'html_param',
 'idx',
 'is_multiclass',
 'log_plots_param',
 'logging_param',
 'memory',
 'n_jobs_param',
 'pipeline',
 'seed',
 'target_param',
 'test',
 'test_transformed',
 'train',
 'train_transformed',
 'variable_and_property_keys',
 'variables',
 'y',
 'y_test',
 'y_test_transformed',
 'y_train',
 'y_train_transformed',
 'y_transformed'}

In [None]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8781,0.0,0.8781,0.878,0.8761,0.8379,0.8388,4.9667
svm,SVM - Linear Kernel,0.8701,0.0,0.8701,0.8737,0.8684,0.827,0.8292,5.5333
lr,Logistic Regression,0.8378,0.9807,0.8378,0.8448,0.8306,0.7809,0.7853,27.05
qda,Quadratic Discriminant Analysis,0.7529,0.8524,0.7529,0.6734,0.7005,0.6568,0.6712,19.1433
rf,Random Forest Classifier,0.6012,0.8801,0.6012,0.7055,0.5488,0.4279,0.4605,51.2733
knn,K Neighbors Classifier,0.5432,0.7988,0.5432,0.6291,0.5556,0.4142,0.4265,40.5967
nb,Naive Bayes,0.449,0.7345,0.449,0.5175,0.4566,0.2951,0.3007,4.98
dt,Decision Tree Classifier,0.4132,0.6161,0.4132,0.4127,0.4129,0.2285,0.2285,37.4567


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Ridge Classifier is the best baseline model