In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Домашнее задание № 1

Вам необходимо выбрать стратегию валидации и очистить данные от ошибок заполнения.

## Фильтрация по диапазону значений 

In [17]:
# Location of the training CSV, relative to the notebook's working directory.
TRAIN_CSV_PATH = 'train_features_with_answers.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [18]:
# Collect the distinct values of every column, then list them so that
# out-of-range or mistyped entries are easy to spot.
unique_vals = {col: data[col].unique() for col in data}
for col, values in unique_vals.items():
    print(f"{col}:", values)

school: ['MS' 'GP']
sex: ['M' 'F' 'D' 'C' 'B' 'A']
age: [ nan  15.  17.  20.  18.  16.  19. 161. 181. 151. 116.  21.  22.  -1.
   1.   5.   8.]
address: ['U' 'R' nan]
famsize: ['LE3' 'GT3']
Pstatus: ['T' 'A']
Medu: [1 3 4 2 0]
Fedu: [3 4 1 2 0]
Mjob: ['at_home' 'teacher' 'other' 'services' 'health']
Fjob: ['services' 'other' 'at_home' 'teacher' 'health']
reason: ['course' 'home' 'reputation' 'other']
guardian: ['mother' 'father' 'other']
traveltime: [1 3 2 4]
studytime: [1 2 3 4]
failures: [0 1 2 3]
schoolsup: ['no' 'yes']
famsup: ['no' 'yes']
paid: ['no' 'yes']
activities: ['no' 'yes']
nursery: ['yes' 'no']
higher: ['no' 'yes']
internet: ['yes' 'no']
romantic: ['yes' 'no']
famrel: [4 3 5 1 2]
freetime: [3 5 2 1 4]
goout: [3 4 1 5 2]
Dalc: [2 3 1 5 4]
Walc: [3 5 1 2 4]
health: [3 5 2 1 4]
absences: [ 0 16  4  8  2  1  9  6  5 11  3 10 12 14 18 15 24 22 32 21 13  7]
G3: [ 9 12 13 11 14 16 10 17  8 15  0  6  7 19 18  1  5]


In [19]:
# Accept only ages within [15, 22] and the sex labels "M" / "F";
# every other entry is blanked out to NaN.
age_is_valid = data["age"].between(15, 22)
sex_is_valid = data["sex"].isin(["M", "F"])

data.loc[~age_is_valid, "age"] = np.nan
data.loc[~sex_is_valid, "sex"] = np.nan

## Адаптивное заполнение пропусков

### Разбиение данных

In [20]:
# Encode the two binary categorical columns as 0/1.
# Only cells equal to one of the listed labels are rewritten.
binary_codes = {
    "sex": {"F": 0, "M": 1},
    "address": {"U": 0, "R": 1},
}
for column, codes in binary_codes.items():
    for label, code in codes.items():
        data.loc[data[column] == label, column] = code
    # Convert the column to a numeric dtype once its labels are replaced.
    data[column] = pd.to_numeric(data[column])

In [21]:
# One-hot encode every column that is neither numeric nor boolean.
# The public `select_dtypes` replaces the private `_get_numeric_data()` call.
# Boolean columns are excluded too, so re-running this cell after the dummy
# columns exist does not try to encode them a second time.
cat_columns = data.select_dtypes(exclude=["number", "bool"]).columns.tolist()
data = pd.get_dummies(data, columns=cat_columns)

In [22]:
# Rows without any missing value are the pool used to fit the imputation models.
data_without_na = data.dropna()

# Features: every column except the three to be imputed and G3.
X = data_without_na.drop(columns=["sex", "age", "address", "G3"])

# One target series per column to be imputed.
y = {col: data_without_na[col] for col in ("sex", "age", "address")}

# Containers for the train/test splits, keyed by target column.
X_train_d, y_train_d = {}, {}
X_test_d, y_test_d = {}, {}

In [23]:
# Split the features against each target with the same seed and test share,
# storing the four pieces under the target's name.
for col in ["sex", "age", "address"]:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[col], test_size=0.33, random_state=42
    )
    X_train_d[col], X_test_d[col] = X_train, X_test
    y_train_d[col], y_test_d[col] = y_train, y_test

In [24]:
# Group the splits by stage; each entry is [features-by-target, labels-by-target].
data_prep = {
    "train": [X_train_d, y_train_d],
    "test": [X_test_d, y_test_d],
}

In [25]:
target_columns = ["sex", "age", "address"]

# Stage 1: fit one logistic-regression model per target column.
train_features, train_targets = data_prep["train"]
classifiers = {}
for col in target_columns:
    classifiers[col] = LogisticRegression(max_iter=1000).fit(
        train_features[col], train_targets[col]
    )

# Stage 2: report each model's accuracy on its held-out split.
test_features, test_targets = data_prep["test"]
for col in target_columns:
    predicted = classifiers[col].predict(test_features[col])
    accuracy = accuracy_score(test_targets[col], predicted)
    print(col.upper(), round(accuracy, 3))

SEX 0.679
AGE 0.269
ADDRESS 0.709


### Заполнение пропусков

In [26]:
# Rows that have at least one missing value, and their feature columns.
has_missing = data.isna().any(axis=1)
data_nans = data[has_missing]
X_nans = data_nans.drop(columns=["sex", "age", "address", "G3"])

# Feature rows to predict for each column: those where that column is missing.
X_nans_sex = X_nans.loc[data_nans["sex"].isna()]
X_nans_age = X_nans.loc[data_nans["age"].isna()]
X_nans_add = X_nans.loc[data_nans["address"].isna()]

# Predicted replacement values, keyed by the column they belong to.
predictions = {
    "sex": classifiers["sex"].predict(X_nans_sex),
    "age": classifiers["age"].predict(X_nans_age),
    "address": classifiers["address"].predict(X_nans_add),
}

In [27]:
# Write the predicted values into the cells that are still missing.
# predictions[col] holds one value per row where `col` is NaN, in row order,
# so a single boolean-mask assignment replaces the row-by-row iterrows() loop.
# A length mismatch now raises instead of filling only part of the column.
for col in ["sex", "age", "address"]:
    missing = data[col].isna()
    # Nothing left to fill (e.g. the cell is run a second time): leave as is.
    if missing.any():
        data.loc[missing, col] = predictions[col]
    

In [28]:
# Re-inspect the distinct values of the three columns of interest.
inspected_columns = ["sex", "age", "address"]
unique_vals = {col: data[col].unique() for col in inspected_columns}
for col in inspected_columns:
    print(f"{col}:", unique_vals[col])

sex: [1. 0.]
age: [18. 15. 17. 20. 16. 19. 21. 22.]
address: [0. 1.]


Готово: пропуски в `sex`, `age` и `address` удалось заполнить предсказаниями моделей, и в этих столбцах остались только допустимые значения.

## Стратегия валидации

Итак, у вас есть данные для обучения и данные, на которых необходимо сделать предсказание. Для оценки точности необходимо проводить валидацию (так как данных мало, можно проводить кросс-валидацию).
Валидацию проще всего реализовать методами sklearn, например:
- KFold
- ShuffleSplit
- StratifiedKFold
- StratifiedShuffleSplit
- GroupKFold
- GroupShuffleSplit

Так как классы не сбалансированы, буду использовать `Stratified k-fold`.