## lesson-6

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve

import matplotlib.pyplot as plt

%matplotlib inline

### 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

Data Set: https://archive.ics.uci.edu/ml/datasets/Abalone

In [2]:
# Load the Abalone data set from a CSV file in the working directory
# and preview the first three rows.
df = pd.read_csv("abalone.csv")
df.head(3)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9


In [3]:
# Column dtypes: 'Sex' is the only non-numeric column (see output below).
df.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
Rings               int64
dtype: object

Пропуски

In [4]:
# Number of missing values per column.
df.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

Целевая переменная

In [5]:
# Frequency of every distinct ring count (the raw, multi-valued target).
df['Rings'].value_counts()

9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
24      2
27      2
1       1
25      1
2       1
26      1
29      1
Name: Rings, dtype: int64

Sex

In [6]:
# Frequency of each 'Sex' category.
df['Sex'].value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

Разбиение индексов на train и test

In [7]:
# Split the row labels (not the rows themselves) 70/30 with a fixed seed, so the
# same partition can be re-applied after new columns are added to df.
indices_train, indices_test = train_test_split(df.index, test_size=0.3, random_state=42)

### 2. сделать feature engineering

Sex

In [8]:
# Encode the categorical 'Sex' labels as integer codes: I -> 0, F -> 1, M -> 2.
# The column still has object dtype afterwards; the cast to int is done separately.
for sex_code, sex_label in enumerate(("I", "F", "M")):
    df.loc[df['Sex'] == sex_label, 'Sex'] = sex_code

In [9]:
# The labels have been replaced by integer codes, so the object column can be cast to int.
df['Sex'] = df['Sex'].astype(int)

Size

In [10]:
# Product of the three linear measurements; presumably a rough volume proxy.
df['Size'] = df['Length'] * df['Diameter'] * df['Height']

Target encoding 

mean_Rings_by_Sex

In [11]:
# Target encoding: mean 'Rings' per 'Sex' code, computed on the training rows only
# so that no information from the test rows leaks into the feature.
train_cp = df.loc[indices_train, :].copy()
mean_Rings_by_Sex = (
    train_cp.groupby(['Sex'], as_index=False)
            .agg({'Rings': 'mean'})
            .rename(columns={'Rings': 'mean_Rings_by_Sex'})
)

# Attach the encoding with map() instead of merge(): merge() replaces df's index
# with a fresh RangeIndex (later cells select rows by label via indices_train /
# indices_test) and, when the cell is re-run, produces suffixed duplicate columns.
# map() keeps the index untouched and simply overwrites the column on re-run.
# Codes absent from the training rows become NaN, as with the left merge.
df['mean_Rings_by_Sex'] = df['Sex'].map(
    mean_Rings_by_Sex.set_index('Sex')['mean_Rings_by_Sex']
)

Сведём задачу к бинарной классификации

In [12]:
# Collapse the ring count into a binary target: 1 for 9, 10 or 11 rings, 0 otherwise.
# The column is overwritten in place, so running this cell a second time maps
# every row to 0 (no value is 9, 10 or 11 any more).
is_positive = df['Rings'].isin([9, 10, 11])
df.loc[~is_positive, 'Rings'] = 0
df.loc[is_positive, 'Rings'] = 1

In [13]:
# Class balance of the binarised target.
df['Rings'].value_counts()

0    2367
1    1810
Name: Rings, dtype: int64

In [14]:
# Preview the frame after encoding, feature creation and target binarisation.
df.head(3)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Size,mean_Rings_by_Sex
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0.015777,10.713084
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0.008347,10.713084
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0.030051,11.125806


### 3. обучить любой классификатор (какой вам нравится)

In [15]:
# List all column names to choose the model features from.
df.columns.tolist()

['Sex',
 'Length',
 'Diameter',
 'Height',
 'Whole weight',
 'Shucked weight',
 'Viscera weight',
 'Shell weight',
 'Rings',
 'Size',
 'mean_Rings_by_Sex']

In [16]:
# Name of the label column.
target = 'Rings'

# Model inputs: the encoded sex, the raw measurements and the engineered 'Size'.
# NOTE(review): 'mean_Rings_by_Sex' is not included here - confirm that is intended.
feat = [
    'Sex',
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'Size',
]

In [17]:
# Materialise the train/test partition chosen earlier, then separate inputs from labels.
train_rows = df.loc[indices_train]
test_rows = df.loc[indices_test]

X_train, y_train = train_rows[feat], train_rows[target]
X_test, y_test = test_rows[feat], test_rows[target]

In [18]:
# Ratio of class-0 to class-1 rows in the fully labelled training split.
class_counts = y_train.value_counts()
disbalance_gb = class_counts[0] / class_counts[1]
disbalance_gb

1.3015748031496064

In [19]:
# Baseline classifier: gradient boosting with default hyper-parameters and a fixed seed.
model_gb = GradientBoostingClassifier(random_state = 42)

In [20]:
# Fit the baseline on the fully labelled training split.
model_gb.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [21]:
# Predicted probability of class 1 for every test row; show the first ten.
preds_gb = model_gb.predict_proba(X_test)[:, 1]
preds_gb[:10]

array([0.37068066, 0.57662525, 0.15375099, 0.7291095 , 0.34231772,
       0.62198708, 0.45601693, 0.44432699, 0.10114954, 0.66917378])

In [22]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_gb)

# F-score at every operating point. Where precision + recall == 0 the ratio is
# 0/0; define the score as 0 there instead of NaN, because np.argmax would
# otherwise return the position of the first NaN rather than the true maximum.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more element than thresholds (the trailing
# precision=1, recall=0 point has no threshold), so it is excluded from the
# search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
roc_auc_gb = roc_auc_score(y_test, preds_gb)
# NOTE(review): the threshold is tuned on the test set, so the reported
# F-score/precision/recall are optimistic; a validation split would be cleaner.
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, Roc_Auc=%.3f' % (
    thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_gb))

# Keep the baseline metrics for the comparison table.
thresholds_gb = thresholds[ix]
fscore_gb = fscore[ix]
precision_gb = precision[ix]
recall_gb = recall[ix]

Best Threshold=0.289307, F-Score=0.698, Precision=0.566, Recall=0.911, Roc_Auc=0.774


### 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [23]:
# Work on the training rows only; the boolean masks are built on this subset so
# that mask and frame share the same index.
train_part = df.loc[indices_train, :]

# Shuffle the training positives reproducibly and keep 80% of them as the labelled set P.
Rings_1_shuffle = train_part.loc[train_part['Rings'] == 1, :].sample(frac=1, random_state=42)
len_80_Rings_1_shuffle = int(0.8 * len(Rings_1_shuffle))

P = Rings_1_shuffle[:len_80_Rings_1_shuffle]
# U = all training negatives plus the remaining (hidden) positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
U = pd.concat([train_part.loc[train_part['Rings'] == 0, :],
               Rings_1_shuffle[len_80_Rings_1_shuffle:]])
# Everything in U is treated as unlabelled, i.e. class 0.
U['Rings'] = 0

### 5. применить random negative sampling для построения классификатора в новых условиях

In [24]:
# Random negative sampling: draw 60% of U (fixed seed) to serve as the negative class.
U_sample = U.sample(frac=0.6, random_state=42)

In [25]:
# Training set for the PU model: sampled unlabelled rows followed by the labelled positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([U_sample, P])

In [26]:
# Inputs and labels of the PU training set (these names overwrite the baseline's X_train / y_train).
X_train, y_train = train[feat], train[target]

In [27]:
# Ratio of sampled-unlabelled rows to labelled positives in the PU training set.
disbalance_rns = U_sample.shape[0] / P.shape[0]
disbalance_rns

1.125984251968504

In [28]:
# Same estimator and seed as the baseline, so only the training data differs.
model_rns = GradientBoostingClassifier(random_state = 42)

In [29]:
# Fit on the PU training set (sampled U as class 0, P as class 1).
model_rns.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [30]:
# Predicted probability of class 1 on the untouched test rows; show the first ten.
preds_rns = model_rns.predict_proba(X_test)[:, 1]
preds_rns[:10]

array([0.43819367, 0.62416189, 0.19897638, 0.67349061, 0.31867421,
       0.33880488, 0.39829657, 0.47392877, 0.09010274, 0.68871968])

In [31]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_rns)

# F-score at every operating point. Where precision + recall == 0 the ratio is
# 0/0; define the score as 0 there instead of NaN, because np.argmax would
# otherwise return the position of the first NaN rather than the true maximum.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more element than thresholds (the trailing
# precision=1, recall=0 point has no threshold), so it is excluded from the
# search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
roc_auc_rns = roc_auc_score(y_test, preds_rns)
# NOTE(review): the threshold is tuned on the test set, so the reported
# F-score/precision/recall are optimistic; a validation split would be cleaner.
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, Roc_Auc=%.3f' % (
    thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_rns))

# Keep the metrics of the 80%-P run for the comparison table.
thresholds_rns = thresholds[ix]
fscore_rns = fscore[ix]
precision_rns = precision[ix]
recall_rns = recall[ix]

Best Threshold=0.412243, F-Score=0.700, Precision=0.599, Recall=0.841, Roc_Auc=0.760


### 6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [32]:
# Side-by-side report: one row per model, one column per metric.
report_columns = ['models', 'precision', 'recall', 'roc_auc', 'f_score']
report_rows = [
    ('GradientBoostingClassifier', precision_gb, recall_gb, roc_auc_gb, fscore_gb),
    ('RandomNegativeSampling', precision_rns, recall_rns, roc_auc_rns, fscore_rns),
]
united_table = pd.DataFrame(report_rows, columns=report_columns)
united_table

Unnamed: 0,models,precision,recall,roc_auc,f_score
0,GradientBoostingClassifier,0.566168,0.911111,0.774275,0.698368
1,RandomNegativeSampling,0.598945,0.840741,0.760386,0.699538


#### Вывод:

Метрика f1_score у RandomNegativeSampling оказалась даже чуть выше, хотя нам было известно всего лишь 80% положительных примеров. Это связано с тем, что из-за сэмплирования подмножества U дисбаланс классов уменьшился.

In [33]:
# Class ratio (class 0 / class 1) of the training data behind each model.
disbalance = dict(
    GradientBoostingClassifier=disbalance_gb,
    RandomNegativeSampling=disbalance_rns,
)
disbalance

{'GradientBoostingClassifier': 1.3015748031496064,
 'RandomNegativeSampling': 1.125984251968504}

### 7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

70% P

In [34]:
# Work on the training rows only; the boolean masks are built on this subset so
# that mask and frame share the same index.
train_part = df.loc[indices_train, :]

# Shuffle the training positives reproducibly and keep 70% of them as the labelled set P.
Rings_1_shuffle = train_part.loc[train_part['Rings'] == 1, :].sample(frac=1, random_state=42)
len_70_Rings_1_shuffle = int(0.7 * len(Rings_1_shuffle))

P = Rings_1_shuffle[:len_70_Rings_1_shuffle]
# U = all training negatives plus the remaining (hidden) positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
U = pd.concat([train_part.loc[train_part['Rings'] == 0, :],
               Rings_1_shuffle[len_70_Rings_1_shuffle:]])
# Everything in U is treated as unlabelled, i.e. class 0.
U['Rings'] = 0

In [35]:
# Random negative sampling: draw 60% of U (fixed seed) to serve as the negative class.
U_sample = U.sample(frac=0.6, random_state=42)

In [36]:
# Training set for the PU model: sampled unlabelled rows followed by the labelled positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([U_sample, P])

In [37]:
# Inputs and labels of the 70%-P training set (overwrites the previous X_train / y_train).
X_train, y_train = train[feat], train[target]

In [38]:
# Ratio of sampled-unlabelled rows to labelled positives for the 70%-P run.
disbalance_rns_70 = U_sample.shape[0] / P.shape[0]
disbalance_rns_70

1.3723284589426321

In [39]:
# Same estimator and seed as the other runs, so only the training data differs.
model_rns_70 = GradientBoostingClassifier(random_state = 42)

In [40]:
# Fit on the 70%-P training set (sampled U as class 0, P as class 1).
model_rns_70.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [41]:
# Predicted probability of class 1 on the untouched test rows; show the first ten.
preds_rns_70 = model_rns_70.predict_proba(X_test)[:, 1]
preds_rns_70[:10]

array([0.40369051, 0.54895803, 0.2142196 , 0.647523  , 0.35385419,
       0.3025442 , 0.3435568 , 0.41725318, 0.06573527, 0.57748758])

In [42]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_rns_70)

# F-score at every operating point. Where precision + recall == 0 the ratio is
# 0/0; define the score as 0 there instead of NaN, because np.argmax would
# otherwise return the position of the first NaN rather than the true maximum.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more element than thresholds (the trailing
# precision=1, recall=0 point has no threshold), so it is excluded from the
# search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
roc_auc_rns_70 = roc_auc_score(y_test, preds_rns_70)
# NOTE(review): the threshold is tuned on the test set, so the reported
# F-score/precision/recall are optimistic; a validation split would be cleaner.
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, Roc_Auc=%.3f' % (
    thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_rns_70))

# Keep the metrics of the 70%-P run.
thresholds_rns_70 = thresholds[ix]
fscore_rns_70 = fscore[ix]
precision_rns_70 = precision[ix]
recall_rns_70 = recall[ix]

Best Threshold=0.298930, F-Score=0.692, Precision=0.564, Recall=0.893, Roc_Auc=0.761


90% P

In [43]:
# Work on the training rows only; the boolean masks are built on this subset so
# that mask and frame share the same index.
train_part = df.loc[indices_train, :]

# Shuffle the training positives reproducibly and keep 90% of them as the labelled set P.
Rings_1_shuffle = train_part.loc[train_part['Rings'] == 1, :].sample(frac=1, random_state=42)
len_90_Rings_1_shuffle = int(0.9 * len(Rings_1_shuffle))

P = Rings_1_shuffle[:len_90_Rings_1_shuffle]
# U = all training negatives plus the remaining (hidden) positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
U = pd.concat([train_part.loc[train_part['Rings'] == 0, :],
               Rings_1_shuffle[len_90_Rings_1_shuffle:]])
# Everything in U is treated as unlabelled, i.e. class 0.
U['Rings'] = 0

In [44]:
# Random negative sampling: draw 70% of U (fixed seed) to serve as the negative class.
# NOTE(review): the 80% and 70% runs sample 60% of U, so this run differs from them
# in the sampling fraction as well as in the size of P.
U_sample = U.sample(frac=0.7, random_state=42)

In [45]:
# Training set for the PU model: sampled unlabelled rows followed by the labelled positives.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([U_sample, P])

In [46]:
# Inputs and labels of the 90%-P training set (overwrites the previous X_train / y_train).
X_train, y_train = train[feat], train[target]

In [47]:
# Ratio of sampled-unlabelled rows to labelled positives for the 90%-P run.
disbalance_rns_90 = U_sample.shape[0] / P.shape[0]
disbalance_rns_90

1.090113735783027

In [48]:
# Same estimator and seed as the other runs.
model_rns_90 = GradientBoostingClassifier(random_state = 42)

In [49]:
# Fit on the 90%-P training set (sampled U as class 0, P as class 1).
model_rns_90.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [50]:
# Predicted probability of class 1 on the untouched test rows; show the first ten.
preds_rns_90 = model_rns_90.predict_proba(X_test)[:, 1]
preds_rns_90[:10]

array([0.4242089 , 0.62871756, 0.23066264, 0.75132214, 0.37802824,
       0.39433547, 0.44303951, 0.5174571 , 0.08680314, 0.73975862])

In [51]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_rns_90)

# F-score at every operating point. Where precision + recall == 0 the ratio is
# 0/0; define the score as 0 there instead of NaN, because np.argmax would
# otherwise return the position of the first NaN rather than the true maximum.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more element than thresholds (the trailing
# precision=1, recall=0 point has no threshold), so it is excluded from the
# search to keep thresholds[ix] in range.
ix = np.argmax(fscore[:-1])
roc_auc_rns_90 = roc_auc_score(y_test, preds_rns_90)
# NOTE(review): the threshold is tuned on the test set, so the reported
# F-score/precision/recall are optimistic; a validation split would be cleaner.
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, Roc_Auc=%.3f' % (
    thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_rns_90))

# Keep the metrics of the 90%-P run.
thresholds_rns_90 = thresholds[ix]
fscore_rns_90 = fscore[ix]
precision_rns_90 = precision[ix]
recall_rns_90 = recall[ix]

Best Threshold=0.383624, F-Score=0.703, Precision=0.587, Recall=0.876, Roc_Auc=0.767


#### Вывод:

В среднем, чем меньше P, то есть чем меньше положительных примеров нам известно, тем ниже метрика, и наоборот. При этом нужно также следить за дисбалансом классов: в идеале он должен быть в районе 1. Его можно регулировать, изменяя объём сэмплированной выборки из U.