## START

In [None]:
import numpy as np
import pandas as pd

import re
import random
import os
import math

from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, confusion_matrix
# from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import balanced_accuracy_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_boston

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Detect the runtime: Google Colab or a local machine.
# The presence of a 'sample_data' entry in the current directory is treated as the Colab marker.
running_in_colab = 'sample_data' in os.listdir()
project_dir = r'/content/drive/My Drive/Colab Notebooks/' if running_in_colab else ''

if not running_in_colab:
    print('НЕ обнаружена среда выполнения Google Colab. Выбран режим локальной работы.')
else:
    print('Обнаружена среда выполнения Google Colab.')
    print('project_dir =>', project_dir)

In [None]:
# os.listdir(project_dir)
from sklearn.metrics import balanced_accuracy_score 
from sklearn.metrics import cohen_kappa_score

# Регрессия


In [None]:
myData = pd.read_csv(project_dir + 'u6/' + 'mycar.csv')
myData.shape

In [None]:
myData.head(9)

In [None]:

# Features: every column except the last one; target: the last column.
# (The target used to be picked by the hard-coded position 1, which matches
# "the column excluded from X" only when the file has exactly two columns.)
X = myData.iloc[:, :-1].values
Y = myData.iloc[:, -1].values

In [None]:
# Hold out 30% of the rows; the fixed seed makes the split (and the metrics below) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [None]:
myModel = LinearRegression()

#обучаем модель на обучающих данных
myModel.fit(X_train, Y_train)

In [None]:
y_pred = myModel.predict(X_test)
y_pred

### MAE, MSE, RMSE, MAPE

In [None]:
y_happy = [4,20,110,15,23]
y_happy_pred = [5,15,100,9,21]

# Вычисляем MAE: Mean Absolute Error

MAE = metrics.mean_absolute_error(y_happy, y_happy_pred)
print('MAE: ', MAE)

In [None]:
#Вычисляем MSE: Mean Squared Error. RMSE = Root MSE

MSE = metrics.mean_squared_error(y_happy, y_happy_pred)
print('MSE:', MSE)
print('RMSE:', np.sqrt(MSE))

In [None]:
# MAPE - Mean Absolute Percentage Error

# Для ее вычисления модуль разницы между предсказанием алгоритма и истинным значением 
# мы делим на истинное значение. 
# Потом складываем все результаты (для каждого объекта), 
# делим на количество и умножаем на 100 %. 
# Итак, эта метрика показывает, на сколько процентов в среднем наше предсказание 
# отклоняется от реального значения.

In [None]:
# Вычисляем коэффициент детерминации (истина VS предсказания):

R_2 = metrics.r2_score(y_happy, y_happy_pred)
print('R_2:',R_2)

In [None]:
# Task 3.3.1

mean_squared_error([2, 3, -1, 4], [1, 3, 2, 5])


In [None]:
#  Task 3.4.5

R_2 = metrics.r2_score([2, 3, -1, 4], [1, 3, 2, 5])
round(R_2, 2)

## 3A.5 Линейная регрессия. Предобработка

In [None]:

data = pd.read_csv(project_dir + 'data_flats.csv', sep =';')
data.head()


In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
fig, ax = plt.subplots(figsize=(20,12))
sns_heatmap = sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
data.price_doc.hist()

In [None]:
# Log-transform the target: log(1 + price).
# np.log1p works on the whole column at once instead of calling a lambda per row.
# NOTE: not idempotent - re-running this cell applies the transform a second time.
data['price_doc'] = np.log1p(data['price_doc'])


In [None]:
data.price_doc.hist()

In [None]:
sns.set(font_scale=1)
plt.subplots(figsize=(16, 16))
sns.heatmap(data.corr(), square=True, annot=True, fmt=".1f", linewidths=0.1, cmap="RdBu")


## Model - Tasks 3.5.1


In [None]:
df = data.copy()

In [None]:
df.columns

In [None]:
# Build the modelling frame from `data` in one step, without in-place mutation,
# so the cell can be re-run safely: drop the unused columns, then the rows with gaps.
dropped_columns = ['id', 'preschool_education_centers_raion', 'kindergarten_km',
                   'park_km', 'life_sq', 'kremlin_km']
df = data.drop(columns=dropped_columns).dropna()

# Target is 'price_doc'; every remaining column is a feature.
Y = df['price_doc'].values
X = df.drop(columns=['price_doc'])

In [None]:
X.shape
# (23925, 21)

In [None]:
len(Y), len(X), len(X) - len(Y)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, random_state=77)

In [None]:
# Scale the features. The scaler is fitted on the training set ONLY and the
# already fitted transformation is then applied to the test set. Calling
# fit_transform on the test set would re-fit the scaler, so train and test
# would be scaled with different parameters.
scaler = RobustScaler()

X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)


In [None]:
%%time
# Fit the model on the scaled training set (X_train_norm, Y_train).

model = LinearRegression().fit(X_train_norm, Y_train)

In [None]:
# Use the fitted model to predict the target for the scaled test set.
# The predictions are stored in y_pred.
# y_pred = model.predict(X_test)

y_pred = model.predict(X_test_norm)

In [None]:
# Compare the predictions (y_pred) with the true values (Y_test).
# Both are mapped back with exp(.) - 1 before scoring, i.e. the inverse of the
# log(1 + price) transform applied to 'price_doc'.

# MAE = metrics.mean_absolute_error(np.exp(Y_test) - 1, np.exp(y_pred) - 1)

MSE = metrics.mean_squared_error(np.exp(Y_test) - 1, np.exp(y_pred) - 1)

# print('MAE:', round(MAE, 0))
print('MSE:', int(round(MSE, 0)))

# MSE: 16647931507747

### 3A.6. Линейная регрессия. Практика №1


In [None]:
# Task 3.6.2
# NOTE(review): `load_boston` is deprecated since scikit-learn 1.0 and removed in 1.2 -
# confirm the installed scikit-learn version.
# Note that `data` is re-bound here: it was the flats DataFrame above and is now
# the object returned by load_boston().

data = load_boston()

In [None]:
data['feature_names']

In [None]:
X, y = data['data'], data['target']

model = LinearRegression()
model.fit(X, y)


In [None]:
y_pred = model.predict(X)

In [None]:
# RMSE on the data the model was fitted on; arguments follow the (y_true, y_pred) convention
RMSE = round(np.sqrt(mean_squared_error(y, y_pred)), 2)
print('RMSE:', RMSE)

In [None]:
# print(data['DESCR'])
data['data'].shape

In [None]:
# Task 3.6.3
# Which feature has the largest standard deviation, and what is its value?

# Population standard deviation (ddof=0) of every feature column at once.
feature_std = np.std(data['data'], axis=0, ddof=0)
best_position = int(np.argmax(feature_std))

name_column = data['feature_names'][best_position]
max_std_val = round(feature_std[best_position], 2)

print(name_column, ': ', max_std_val, sep='')

In [None]:
a = np.arange(60).reshape(12,5)
a

In [None]:
a[ : , 2]

In [None]:
# Boolean mask: True for the rows whose column with index 2 is greater than 25.
# NOTE: the name shadows the built-in filter(); it is kept because a later cell reads it.
filter = a[:, 2] > 25

# Print the selected rows one by one.
for row, keep in zip(a, filter):
    if keep:
        print(row)

In [None]:
a[filter, : ]

In [None]:
a * (a > 9)

In [None]:
row_count = len(data['data'])
row_count

In [None]:
data['data'].shape[1]

In [None]:
# data['feature_names'][-2]
data['feature_names'][11]

In [None]:
 a1 = np.empty([1, 2])

# a1.ndim, a1.shape

np.vstack((a1, [2, 3]))

In [None]:
# Task 3.6.5
# Remove the rows that do not pass the threshold on one feature, then refit. What is the RMSE?

# Keep the rows where the feature in column 11 is greater than 50.
mask = data['data'][:, 11] > 50

# Boolean indexing selects the matching rows of the features and of the target
# in one step (instead of stacking rows one by one onto an np.empty seed row).
X_new = data['data'][mask]
y_short = data['target'][mask]


X_new.shape, y_short.shape


In [None]:
model = LinearRegression()
model.fit(X_new, y_short)
y_pred = model.predict(X_new)

MSE = metrics.mean_squared_error(y_short, y_pred)
print('MSE:', round(MSE, 2))
print('RMSE:', round(np.sqrt(MSE), 2))

# Классификация

## F-Мера

In [None]:
breast_cancer = load_breast_cancer()
type(breast_cancer)

In [None]:
Y = breast_cancer.target ## Наша целевая переменная, 0 — если рака нет, 1 — если есть 
X = breast_cancer.data # X - признаки, по которым мы будем предсказывать рак 

In [None]:
# Hold out 30% for validation; the fixed seed keeps the metrics below reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3, random_state=42)


In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)

In [None]:

Y_predicted = model.predict(X_val)

In [None]:
print(accuracy_score(Y_val, Y_predicted))
print(precision_score(Y_val, Y_predicted))
print(recall_score(Y_val, Y_predicted))
print(f1_score(Y_val, Y_predicted))

### Задание 3B2.2

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
Y = iris.target
X = iris.data

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.3, random_state=50)

In [None]:
model = LogisticRegression()
model.fit(X_train, Y_train)

Y_predicted = model.predict(X_val)

In [None]:
print(accuracy_score(Y_val, Y_predicted))

### 3B.3. Классификация. Практика

In [None]:
os.listdir(project_dir + 'u6')

In [None]:
tmob = pd.read_csv(project_dir + 'u6/' + 'train_mobile.csv', sep =';')
tmob.head()

In [None]:
tmob.columns

In [None]:
sns.set(font_scale=1)
plt.subplots(figsize=(16, 16))
sns.heatmap(tmob.corr(), square=True, annot=True, fmt=".2f", linewidths=0.1, cmap="RdBu")

In [None]:
df_corr = tmob.corr()
sns.heatmap(df_corr)

In [None]:
df_corr['price_range'].sort_values(ascending=False).head(5).tail(4)

In [None]:
df_corr['price_range'].sort_values().head(1)

In [None]:
X = tmob[['ram', 'battery_power', 'px_width', 'px_height', 'touch_screen']]
# 1-D target (a Series): estimators expect y of shape (n_samples,), and a
# one-column DataFrame makes fit() emit a DataConversionWarning.
Y = tmob['price_range']


In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state=31)
model = LogisticRegression()

model.fit(X_train, Y_train)

Y_predicted = model.predict(X_val)

In [None]:
round(precision_score(Y_val, Y_predicted), 4)

### 3B.6. Логистическая регрессия. Практика

In [None]:
adult = pd.read_csv(project_dir + 'u6/' + 'adult.data',
                    names=['age', 'workclass', 'fnlwgt', 'education',
                           'education-num', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'capital-gain',
                           'capital-loss', 'hours-per-week', 'native-country', 'salary'])

In [None]:
# Drop the feature that is not used in this part
adult = adult.drop(columns=['native-country'])

# Binarise the target: 0 for ' <=50K', 1 for any other value
adult['salary'] = adult['salary'].ne(' <=50K').astype('int32')


In [None]:
# Standardise the numeric features: subtract the column mean, divide by the column std.
numeric_columns = ['age', 'education-num', 'hours-per-week', 'fnlwgt', 'capital-gain', 'capital-loss']
a_features = adult[numeric_columns].values
norm_features = (a_features - a_features.mean(axis=0)) / a_features.std(axis=0)

# Replace the columns outright instead of writing through .loc[:, ...]: the new
# values are floats, while the source columns are presumably integer-typed as
# read from the file, and the result of an in-place .loc write into columns of
# another dtype differs between pandas versions.
adult[numeric_columns] = norm_features


In [None]:
# Сделать one-hot encoding для некоторых признаков
adult = pd.get_dummies(adult, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex'])



In [None]:
adult.head()

In [None]:
Y = adult['salary'].values

# Feature columns in their DataFrame order. (A set difference yields an
# arbitrary column order that can change from run to run.)
new_columns = [column for column in adult.columns if column != 'salary']
X = adult[new_columns].values

# Prepend a dummy column of ones (bias term of the linear model)
X = np.hstack([np.ones(X.shape[0])[:, np.newaxis], X])


In [None]:
model = LogisticRegression()

model.fit(X, Y)

Y_predicted = model.predict(X)

# round(f1_score(Y, Y_predicted), 2)

In [None]:
# print(accuracy_score(Y, Y_predicted))
# print(precision_score(Y, Y_predicted))
# print(recall_score(Y, Y_predicted))

round(f1_score(Y, Y_predicted), 2)


In [None]:
# Задание 3B.6.2

confusion_matrix(Y, Y_predicted)

In [None]:
# Plot the ROC curve
def calc_and_plot_roc(y_true, y_pred_proba):
    """Draw the ROC curve for the given scores and show the AUC in the legend.

    Args:
        y_true: true labels, passed unchanged to roc_curve and roc_auc_score.
        y_pred_proba: predicted scores for the same samples, passed unchanged.

    Returns:
        None. The curve is drawn on a new 8x8 matplotlib figure.
    """
    # Points of the curve; the thresholds are not needed for the plot
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    # Area under the curve, shown in the legend
    roc_auc = roc_auc_score(y_true, y_pred_proba)

    figure = plt.figure(figsize=(8, 8))
    axes = figure.add_subplot(1, 1, 1)
    axes.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    axes.set_title('Receiver Operating Characteristic', fontsize=14)
    axes.set_xlabel('False positive rate (FPR)', fontsize=14)
    axes.set_ylabel('True positive rate (TPR)', fontsize=14)
    axes.legend(fontsize=14)

In [None]:
# Task 3B.6.3
# roc_curve(Y, Y_predicted, pos_label=2)
# roc_curve(Y, Y_predicted)

# A ROC curve is built from scores, not from hard 0/1 predictions: pass the
# predicted probability of the positive class (column 1 of predict_proba for
# the 0/1 'salary' target).
calc_and_plot_roc(Y, model.predict_proba(X)[:, 1])


In [None]:
# Task 3B.6.4
# Build a logistic regression with sklearn without regularisation. What is the f1-score?


Y = adult['salary'].values

# Feature columns in their DataFrame order (a set difference makes the order arbitrary)
new_columns = [column for column in adult.columns if column != 'salary']
X = adult[new_columns].values

# Add a dummy column of ones (bias term of the linear model)
# X = np.hstack([np.ones(X.shape[0])[:, np.newaxis], X])


# NOTE(review): the string 'none' is accepted only by older scikit-learn releases;
# newer ones expect penalty=None - confirm against the installed version.
model = LogisticRegression(penalty='none')
model.fit(X, Y)

Y_predicted = model.predict(X)
# round(f1_score(Y, Y_predicted), 2)

In [None]:
round(f1_score(Y, Y_predicted), 2)

In [None]:
# Sweep C over 0.01, 0.02, ..., 1.00 and remember the value with the best F1.
# F1 is measured on the same data the model is fitted on.
result = dict()
f1_max = 0
l2_reg_answer = 0

for step in range(1, 101):
    l2_reg = step / 100
    model = LogisticRegression(penalty='l2', C=l2_reg).fit(X, Y)
    Y_predicted = model.predict(X)
    f1 = f1_score(Y, Y_predicted)
    result[l2_reg] = f1
    # Strict comparison: on a tie the earlier (smaller) C is kept
    if f1 > f1_max:
        f1_max, l2_reg_answer = f1, l2_reg


In [None]:
l2_reg_answer, round(f1_max, 2)
# 0.66

In [None]:
# Задание 3B.6.6
# Замените в столбце native-country страны, у которых меньше ста записей,

adult = pd.read_csv(project_dir + 'u6/' + 'adult.data',
                    names=['age', 'workclass', 'fnlwgt', 'education',
                           'education-num', 'marital-status', 'occupation',
                           'relationship', 'race', 'sex', 'capital-gain',
                           'capital-loss', 'hours-per-week', 'native-country', 'salary'])

# Сконвертировать целевой столбец в бинарные значения
adult['salary'] = (adult['salary'] != ' <=50K').astype('int32')

adult.shape

In [None]:
# Number of records per country
country = adult['native-country'].value_counts()
# country
# Countries that have fewer than 100 records
replace_country = country[country < 100].index

# Collapse the rare countries into a single 'other' category
is_rare = adult['native-country'].isin(replace_country)
adult['native-country'] = adult['native-country'].where(~is_rare, 'other')
adult['native-country'].value_counts()

In [None]:
# Standardise the numeric features: subtract the column mean, divide by the column std.
numeric_columns = ['age', 'education-num', 'hours-per-week', 'fnlwgt', 'capital-gain', 'capital-loss']
a_features = adult[numeric_columns].values
norm_features = (a_features - a_features.mean(axis=0)) / a_features.std(axis=0)

# Replace the columns outright instead of writing through .loc[:, ...]: the new
# values are floats, while the source columns are presumably integer-typed as
# read from the file, and the result of an in-place .loc write into columns of
# another dtype differs between pandas versions.
adult[numeric_columns] = norm_features


In [None]:
# Сделать one-hot encoding для некоторых признаков
adult = pd.get_dummies(adult, columns=['native-country', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex'])
adult.shape

In [None]:
Y = adult['salary'].values

# Feature columns in their DataFrame order (a set difference makes the order arbitrary)
new_columns = [column for column in adult.columns if column != 'salary']
X = adult[new_columns].values

# Добавить фиктивный столбец единиц (bias линейной модели)
# X = np.hstack([np.ones(X.shape[0])[:, np.newaxis], X])

In [None]:
model = LogisticRegression().fit(X, Y)

Y_predicted = model.predict(X)

round(f1_score(Y, Y_predicted), 2)

In [None]:
round(f1_score(Y, Y_predicted), 2)

## Валидация данных и оценка модели

In [None]:
from sklearn.model_selection import KFold 

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) 
y = np.array([1, 2, 3, 4]) 

kf = KFold(n_splits = 2 )  #реализация разбиения

In [None]:
kf.get_n_splits(X)  # number of splits (not displayed: it is not the last expression of the cell)
# split() returns a lazy generator; materialise it to see the (train_idx, test_idx) pairs
list(kf.split(X))

In [None]:
# Задание 4.3.1
# from sklearn.model_selection import train_test_split
file_patch = project_dir + 'u6/' + 'train.csv'
# os.listdir(project_dir + 'u6/')
vis_data = pd.read_csv(file_patch, encoding = 'ISO-8859-1', low_memory = False)

# сдать этот код
X_train, X_test = train_test_split(vis_data, shuffle=False, train_size=0.7)
result = X_test['payment_amount'].mean()

In [None]:
vis_data.shape

In [None]:
# Задание 4.4.5
y_true = [1.23, 2.35, 2.75]
y_pred = [1.01, 12.3, 2.74]

MSE = metrics.mean_squared_error(y_true, y_pred)
print('MSE:', round(MSE, 2))
print('RMSE:', round(np.sqrt(MSE), 2))

In [None]:
# Задание 4.5.1

vdata = pd.read_csv(project_dir + 'u6' + '/train.csv', encoding = 'ISO-8859-1', low_memory = False)
vdata.shape

In [None]:
vdata=vdata[['fine_amount', 'state_fee', 'late_fee', 'discount_amount', 'balance_due']]
vdata.shape

In [None]:
vdata.columns

In [None]:
# Re-bind instead of dropna(inplace=True): `vdata` is a column selection of the
# loaded frame, and in-place edits of such a selection raise SettingWithCopyWarning.
vdata = vdata.dropna()
vdata

In [None]:
vdata.shape

In [None]:
Y = vdata['balance_due'].values
# All remaining columns are features, kept in their DataFrame order
# (a set difference would make the column order arbitrary)
X = vdata.drop(columns=['balance_due']).values

# No shuffling: the first 70% of the rows go to training, the rest to the test part
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=False, train_size=0.7)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
model = LinearRegression()

model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
MSE = mean_squared_error(y_test, y_pred)
print('MSE:', MSE)
result = np.sqrt(MSE)

print('RMSE:', result)

`Задание 4.5.1`

In [None]:
vdata = pd.read_csv(project_dir + 'module_4/data/u6' + '/train.csv', encoding = 'ISO-8859-1', low_memory = False)
vdata.shape

In [None]:

# Keep the five amount columns and drop incomplete rows. Re-binding avoids the
# SettingWithCopyWarning that dropna(inplace=True) raises on a column selection.
vdata = vdata[['fine_amount', 'state_fee', 'late_fee', 'discount_amount', 'balance_due']].dropna()

Y = vdata['balance_due'].values
# Features in their DataFrame order (a set difference would make the order arbitrary)
X = vdata.drop(columns=['balance_due']).values


# No shuffling: the first 70% of the rows go to training
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=False, train_size=0.7)

model = LinearRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

MSE = mean_squared_error(y_test, y_pred)

# RMSE is the square root of MSE
result = MSE**0.5

## 4.6 Accuracy, Precision и Recall, F1-score


In [None]:
a = np.zeros(100) + 55
b = np.zeros(100) + 55

b[1] = 10

round(accuracy_score(b, a), 2)

In [None]:
# Task 4.6.7
# F1 computed from precision (P) and recall (R) as 2PR / (P + R)

precision = 0.75
recall = 0.6

numerator = 2 * precision * recall
f1 = numerator / (precision + recall)

round(f1, 2)

In [None]:
# Задание 4.6.8

y_true = [0, 0, 1, 1, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1, 1, 0, 1]

round(f1_score(y_true, y_pred), 2)

In [None]:
# Допустим, у вас есть выборки 
y_true = [0, 0, 1, 0, 0, 1, 0]
y_pred = [1, 1, 1, 0, 1, 1, 0]

round(precision_score(y_true, y_pred), 2)

In [None]:
# Допустим, у вас есть выборки 
y_true = [0, 0, 1, 0, 0, 1, 0] 
y_pred = [1, 1, 1, 0, 1, 1, 0]

round(recall_score(y_true, y_pred), 2)

## 4.7 практика

`Задание`

Попробуйте построить модель, предсказывающую пол обладателя записи голоса.

* meanfreq: средняя частота голоса (в кГц)
* sd: стандартное отклонение частоты голоса
* median: медианная частота (в кГц)
* Q25: значение в первом квартиле (в кГц)
* Q75: значение в третьем квартиле (в кГц)
* IQR: интерквартильный размах (в кГц)
* skew: асимметрия
* kurt: эксцесс
* sp.ent: спектральная энтропия
* sfm: энтропия Винера
* mode: мода частоты
* centroid: частотный центроид
* meanfun: средняя основная частота, измеренная по акустическому сигналу
* minfun:  минимальная основная частота, измеренная по акустическому сигналу
* maxfun: максимальная основная частота, измеренная в акустическом сигнале
* meandom: среднее значение доминирующей частоты, измеренной по акустическому сигналу
* mindom: минимум доминирующей частоты, измеренной в акустическом сигнале
* maxdom: максимум доминирующей частоты, измеренной в акустическом сигнале
* dfrange: диапазон доминирующих частот, измеренный по акустическому сигналу
* modindx: индекс модуляции голоса

In [None]:
# No leading slash: with an empty project_dir (local mode) '/module_4/...' would
# be an absolute path from the filesystem root instead of a path inside the project.
os.listdir(project_dir + 'module_4/data/u6')

In [None]:
# Задание 4.7.1

voice = pd.read_csv(project_dir + 'module_4/data/u6/' + 'voiceDataSet.csv')
voice.shape

In [None]:
voice.columns

In [None]:
voice.sample(9)

In [None]:

y = voice['label'].values

# Feature columns in their DataFrame order (a set difference makes the order arbitrary)
new_columns = [column for column in voice.columns if column != 'label']

x = voice[new_columns].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)


In [None]:
# Fit the scaler on the training part only, then apply it to both parts.
# Fitting on the full `x` would let test-set statistics leak into the features.
scaler = StandardScaler()
scaler.fit(x_train)

x_train_norm = scaler.transform(x_train)
x_test_norm = scaler.transform(x_test)

In [None]:
model = LogisticRegression()
model.fit(x_train_norm, y_train)

In [None]:
y_pred = model.predict(x_test_norm)

In [None]:
round(accuracy_score(y_test, y_pred), 3)
# round(precision_score(Y_val, Y_predicted), 4)

In [None]:
# target_column = ['_']

# y = df[target_column].values

# new_columns = list( set(df.columns) - set(target_column) )

# x = df[new_columns].values

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)


# # scaler = StandardScaler()
# # scaler.fit(x)
# # x_train_norm = scaler.transform(x_train)
# # x_test_norm = scaler.transform(x_test)

# model = LogisticRegression()
# model.fit(x_train_norm, y_train)

# y_pred = model.predict(x_test_norm)

`Задание 4.7.2`

*В этом кейсе мы попробуем определять типы стекла по его характеристикам.*




In [None]:
df = pd.read_csv(project_dir + 'module_4/data/u6/' + 'glass.csv')
df.shape, df.columns

In [None]:
df['Type'].value_counts()


Алгоритм `kNN` (метод ближайших соседей)

---
Важно! Количество соседей должно быть нечётным во избежание спорной ситуации (равенства голосов).


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold

`Задание 4.7.3`

In [None]:
target_column = ['Type']

# 1-D target: estimators expect y of shape (n_samples,); selecting with a list
# of columns gives a 2-D (n_samples, 1) array, hence ravel()
y = df[target_column].values.ravel()

# Feature columns in their DataFrame order (a set difference makes the order arbitrary)
new_columns = [column for column in df.columns if column not in target_column]

x = df[new_columns].values


# 3 nearest neighbours, scored by accuracy over 10 unshuffled folds
model=KNeighborsClassifier(n_neighbors=3)
kf = KFold(n_splits=10)

accuracy = cross_val_score(model, x, y, cv=kf, scoring="accuracy")

In [None]:
round(np.mean(accuracy), 2)

`Задание 4.7.4`

*В следующей задаче мы будем диагностировать болезни сердца по различным медицинским параметрам пациентов.* 

In [None]:
# os.listdir(project_dir + 'module_4/data/u6/')
heart_disease = pd.read_csv(project_dir + 'module_4/data/u6/' + 'heart_fin1.csv', sep=';')
heart_disease.shape, heart_disease.columns

In [None]:
heart_disease.sample(9)

In [None]:
df = heart_disease.copy()

# Define the target in this cell. Otherwise the cell relies on `target_column`
# left over from the glass task (['Type']) and treats 'target' as a feature.
target_column = ['target']
# DataFrame order: the filter below is sequential, so its result depends on the
# order of the columns and must not come from an unordered set.
new_columns = [column for column in df.columns if column not in target_column]

# Keep only the rows strictly inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for every feature.
# The quartiles are recomputed on the already filtered frame at each step.
for column_name in new_columns:
    perc25 = df[column_name].quantile(0.25)
    perc75 = df[column_name].quantile(0.75)
    iqr = perc75 - perc25
    range_left = perc25 - 1.5 * iqr
    range_right = perc75 + 1.5 * iqr
    # A boolean mask works for any column name and does not shadow the built-in filter()
    inside = (df[column_name] > range_left) & (df[column_name] < range_right)
    df = df[inside]


In [None]:
df = heart_disease.copy()

target_column = ['target']
# Feature columns in their DataFrame order (a set difference makes the order arbitrary)
new_columns = [column for column in df.columns if column not in target_column]

# Collect the index labels of the rows that fall outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for at least one feature.
# The bounds are computed on the full frame for every feature.
drop_index = set()

for column_name in new_columns:
    perc25 = df[column_name].quantile(0.25)
    perc75 = df[column_name].quantile(0.75)
    iqr = perc75 - perc25
    range_left = perc25 - 1.5 * iqr
    range_right = perc75 + 1.5 * iqr
    # between() includes both bounds, like the '<=' comparisons it replaces
    inside = df[column_name].between(range_left, range_right)
    drop_index |= set(df.index[~inside.to_numpy()])

# Drop by label. Indexing .loc with a set is rejected by recent pandas versions,
# and drop() also keeps the remaining rows in their original order.
df = df.drop(index=list(drop_index))

In [None]:
df.shape

In [None]:
# 1-D target of shape (n_samples,): selecting with a list of columns gives a
# 2-D (n_samples, 1) array, which makes fit() emit a DataConversionWarning
y = df[target_column].values.ravel()


x = df[new_columns].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)


# max_iter is raised above the library default
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [None]:
# ROC AUC is defined on scores: use the predicted probability of the second
# class instead of hard labels (assumes a binary target - TODO confirm)
round(roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]), 2)

In [None]:
model2 = KNeighborsClassifier(n_neighbors=3)

model2.fit(x_train, y_train)

y_pred = model2.predict(x_test)


In [None]:
# ROC AUC is defined on scores: use the predicted probability of the second
# class instead of hard labels (assumes a binary target - TODO confirm)
round(roc_auc_score(y_test, model2.predict_proba(x_test)[:, 1]), 2)

`Задание 4.9.1`

`DecisionTreeClassifier` из scikit-learn


In [None]:
vis_data = pd.read_csv(project_dir + 'module_4/data/u6' + '/train.csv', encoding = 'ISO-8859-1', low_memory = False)
vis_data.shape


In [None]:
# Select the five amount features plus the 'compliance' target; drop incomplete rows
selected_columns = ['fine_amount', 'state_fee', 'late_fee', 'discount_amount', 'balance_due', 'compliance']
df = vis_data[selected_columns].dropna(axis=0)

# The last column is the target, all the others are features
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# No shuffling: the first 70% of the rows go to training
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=False)

clf = DecisionTreeClassifier(random_state=23).fit(x_train, y_train)

# F1 on the training part
f1_train = f1_score(y_train, clf.predict(x_train))

# F1 on the held-out part; y_pred keeps the held-out predictions
y_pred = clf.predict(x_test)
f1_val = f1_score(y_test, y_pred)

print('[train] F1-score = {:.2f}'.format(f1_train))
print('[valid] F1-score = {:.2f}'.format(f1_val))

# Gap between the training and the validation F1
result = f1_train - f1_val

print(round(result, 2))

## 4.10 Дисбаланс выборки

`Задание 4.11.1`

In [None]:
# --- Baseline: decision tree on the data as is ---
df = vis_data[['fine_amount', 'state_fee', 'late_fee', 'discount_amount', 'balance_due', 'compliance']]
df = df.dropna(axis=0)

# The last column ('compliance') is the target, all the others are features
X, y = df.iloc[ : , 0:-1], df.iloc[ : , -1]

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=False)

clf = DecisionTreeClassifier(random_state=23)
clf.fit(x_train, y_train)


f1_train = f1_score(y_train, clf.predict(x_train))
f1_val = f1_score(y_test, clf.predict(x_test))

print('-'*10, 'NO BALANSE', '-'*20)
print('[train] F1-score = {:.2f}'.format(f1_train))
print('[valid] F1-score = {:.2f}'.format(f1_val))
f1_a = f1_train

# --- Same data balanced by undersampling ---
# Keep every row of the rarer class and the first n rows of the more frequent
# class, where n is the size of the rarer class.

compliance = df['compliance'].value_counts()
min_class = compliance.idxmin()
max_class = compliance.idxmax()
n = compliance.loc[min_class]

df_big = df[df['compliance'] == max_class].head(n)
df_small = df[df['compliance'] == min_class]

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
df_balance = pd.concat([df_big, df_small], sort=False)


X, y = df_balance.iloc[ : , 0:-1], df_balance.iloc[ : , -1]

# NOTE(review): df_balance is "all majority rows, then all minority rows" and the
# split is not shuffled, so the last 30% used for validation contain only
# minority-class rows - confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=False)

clf = DecisionTreeClassifier(random_state=23)
clf.fit(x_train, y_train)


f1_train = f1_score(y_train, clf.predict(x_train))
f1_val = f1_score(y_test, clf.predict(x_test))

print('-'*10, 'BALANSE', '-'*20)
print('[train] F1-score = {:.2f}'.format(f1_train))
print('[valid] F1-score = {:.2f}'.format(f1_val))
f1_b = f1_train

# Difference between the training F1 of the balanced and of the original data
result = f1_b - f1_a

print(round(result, 2))

In [None]:
plt.hist(df['compliance'], bins=20)


In [None]:
sns.countplot(x='compliance', data=df)

In [None]:
# df['compliance'].value_counts()
# df['compliance'].value_counts().idxmin()
# df['compliance'].value_counts().min()
# df['compliance'].value_counts().values

# END