In [194]:
import joblib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
    LabelBinarizer, 
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)

### Загрузка датасета:

In [195]:
# Read the raw applicants table and discard the "Unnamed: 0" column
# (presumably a row index saved into the CSV by an earlier export; confirm).
raw_csv_path = "ТИУ Бакалавриат Абитуриенты.csv"
df = pd.read_csv(raw_csv_path).drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,Год приёмной компании,Пол,Ср. балл док-та об образовании,Вид приема,Приоритет,Направление подготовки,Сумма баллов,Сумма баллов за индивидуальные достижения,Приказ о зачислении
0,2019,М,4.563,По договору,2.0,38.03.06 Торговое дело,234.0,10.0,0
1,2019,М,4.563,По договору,3.0,38.03.07 Товароведение,234.0,10.0,0
2,2019,М,4.563,По договору,1.0,38.03.06 Торговое дело,234.0,10.0,0
3,2019,М,3.733,Общий конкурс,3.0,27.03.04 Управление в технических системах,173.0,0.0,0
4,2019,М,3.733,Общий конкурс,2.0,15.03.04 Автоматизация технологических процесс...,173.0,0.0,0


### Подготовка и очистка данных:

In [196]:
# Count the applications per admission type ("Вид приема") before filtering.
df["Вид приема"].value_counts()

Вид приема
Общий конкурс                               83770
По договору                                 65335
В рамках квоты лиц, имеющих особые права     1353
Целевой прием                                 495
Отдельная квота                               120
Без ВИ в рамках КЦП                             6
Name: count, dtype: int64

Удаление абитуриентов, которые поступали не в рамках общего конкурса:

In [197]:
# Keep only the applications submitted under the general competition
# ("Общий конкурс"), then show the counts to confirm the filter worked.
is_general_competition = df["Вид приема"].eq("Общий конкурс")
df = df.loc[is_general_competition]
df["Вид приема"].value_counts()

Вид приема
Общий конкурс    83770
Name: count, dtype: int64

In [198]:
# After the filter the admission-type column holds a single value, so it is
# removed from the frame.
df = df.drop(columns=["Вид приема"])
df.head(n=3)

Unnamed: 0,Год приёмной компании,Пол,Ср. балл док-та об образовании,Приоритет,Направление подготовки,Сумма баллов,Сумма баллов за индивидуальные достижения,Приказ о зачислении
3,2019,М,3.733,3.0,27.03.04 Управление в технических системах,173.0,0.0,0
4,2019,М,3.733,2.0,15.03.04 Автоматизация технологических процесс...,173.0,0.0,0
7,2019,М,3.733,1.0,09.03.02 Информационные системы и технологии,173.0,0.0,0


In [199]:
# Class balance of the "Приказ о зачислении" column (values 0 and 1).
df["Приказ о зачислении"].value_counts()

Приказ о зачислении
0    73298
1    10472
Name: count, dtype: int64

In [200]:
# Fold the individual-achievement points into the total score and drop the
# now-redundant source column.
# NOTE(review): a NaN in either column makes the sum NaN, and NaNs are later
# replaced with the column median; confirm that is intended for applicants
# whose achievement points are simply missing.
achievements_col = "Сумма баллов за индивидуальные достижения"
df["Сумма баллов"] = df["Сумма баллов"] + df[achievements_col]
df = df.drop(columns=[achievements_col])
df.head()

Unnamed: 0,Год приёмной компании,Пол,Ср. балл док-та об образовании,Приоритет,Направление подготовки,Сумма баллов,Приказ о зачислении
3,2019,М,3.733,3.0,27.03.04 Управление в технических системах,173.0,0
4,2019,М,3.733,2.0,15.03.04 Автоматизация технологических процесс...,173.0,0
7,2019,М,3.733,1.0,09.03.02 Информационные системы и технологии,173.0,0
12,2019,Ж,3.643,3.0,21.03.02 Землеустройство и кадастры,161.0,0
14,2019,Ж,3.643,4.0,20.03.01 Техносферная безопасность,161.0,0


In [201]:
# Mapping from the original Russian column headers to short English names
# used in the rest of the notebook.
columns = dict(
    [
        ("Год приёмной компании", "year"),
        ("Пол", "gender"),
        ("Ср. балл док-та об образовании", "gpa"),
        ("Приоритет", "priority"),
        ("Направление подготовки", "direction"),
        ("Сумма баллов", "points"),
        ("Приказ о зачислении", "result"),
    ]
)

df = df.rename(columns=columns)
df.head(n=3)

Unnamed: 0,year,gender,gpa,priority,direction,points,result
3,2019,М,3.733,3.0,27.03.04 Управление в технических системах,173.0,0
4,2019,М,3.733,2.0,15.03.04 Автоматизация технологических процесс...,173.0,0
7,2019,М,3.733,1.0,09.03.02 Информационные системы и технологии,173.0,0


Оставляем только заявления с первым приоритетом:

In [202]:
# Keep only rows whose priority equals 1; the column is constant afterwards,
# so it is dropped. The class balance is shown for the reduced frame.
is_first_priority = df["priority"].eq(1)
df = df.loc[is_first_priority].drop(columns=["priority"])
df["result"].value_counts()

result
0    13595
1     8162
Name: count, dtype: int64

### Кодирование категориальных признаков:

In [203]:
# Replace each admission year with its integer label (0..n_years-1).
# NOTE(review): LabelEncoder.transform raises on labels it has not seen, so a
# year outside the fitted set will fail when this encoder is reused.
le = LabelEncoder()
df["year"] = le.fit_transform(df["year"])
df["year"].value_counts()

year
2    5180
3    4750
1    4570
0    4476
4    2781
Name: count, dtype: int64

Сохраняем 'обученный' label encoder:

In [204]:
# Persist the fitted year encoder to "label_encoder.joblib".
joblib.dump(le, "label_encoder.joblib")

['label_encoder.joblib']

In [205]:
# Binarize the gender column into 0/1 with a LabelBinarizer and show the
# resulting class counts.
lb = LabelBinarizer()
df["gender"] = lb.fit_transform(df["gender"])
df["gender"].value_counts()

gender
1    14925
0     6832
Name: count, dtype: int64

Сохраняем 'обученный' label binarizer:

In [206]:
# Persist the fitted gender binarizer to "label_binarizer.joblib".
joblib.dump(lb, "label_binarizer.joblib")

['label_binarizer.joblib']

In [207]:
# Fit a dense one-hot encoder on the study-direction column. Categories not
# seen during fit are ignored at transform time (handle_unknown="ignore").
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
direction_values = df["direction"].to_numpy().reshape(-1, 1)  # 2-D input for the encoder
encoded_directions = ohe.fit_transform(direction_values)
category_names = ohe.get_feature_names_out()

In [208]:
# Swap the raw "direction" column for its one-hot encoding.
# The encoder is asked for "direction_<category>" names directly, so nothing
# depends on slicing off the auto-generated "x0" prefix, and all columns are
# attached in a single concat instead of one insertion per category.
direction_columns = ohe.get_feature_names_out(["direction"])
direction_onehot = pd.DataFrame(
    encoded_directions,
    columns=direction_columns,
    index=df.index,  # keep rows aligned with the filtered frame's index
)
df = pd.concat([df.drop(columns="direction"), direction_onehot], axis=1)

df.head(5)

Unnamed: 0,year,gender,gpa,points,result,direction_01.03.02 Прикладная математика и информатика,direction_02.03.01 Математика и компьютерные науки,direction_05.03.01 Геология,direction_08.03.01 Строительство,direction_08.05.00 Техника и технологии строительства,direction_08.05.01 Строительство уникальных зданий и сооружений,"direction_08.05.02 Строительство, эксплуатация, восстановление и техническое прикрытие автомобильных дорог, мостов и тоннелей",direction_09.03.00 Информатика и вычислительная техника,direction_09.03.01 Информатика и вычислительная техника,direction_09.03.02 Информационные системы и технологии,direction_12.03.01 Приборостроение,direction_12.03.04 Биотехнические системы и технологии,direction_13.03.01 Теплоэнергетика и теплотехника,direction_13.03.02 Электроэнергетика и электротехника,direction_15.03.01 Машиностроение,direction_15.03.04 Автоматизация технологических процессов и производств,direction_15.03.05 Конструкторско-технологическое обеспечение машиностроительных производств,direction_15.03.06 Мехатроника и робототехника,direction_18.03.00 Химические технологии,direction_18.03.01 Химическая технология,"direction_18.03.02 Энерго- и ресурсосберегающие процессы в химической технологии, нефтехимии и биотехнологии",direction_19.03.04 Технология продукции и организация общественного питания,direction_20.03.01 Техносферная безопасность,direction_21.03.01 Нефтегазовое дело,direction_21.03.02 Землеустройство и кадастры,"direction_21.05.00 Прикладная геология, горное дело, нефтегазовое дело и геодезия",direction_21.05.01 Прикладная геодезия,direction_21.05.02 Прикладная геология,direction_21.05.03 Технология геологической разведки,direction_22.03.01 Материаловедение и технологии материалов,direction_23.03.01 Технология транспортных процессов,direction_23.03.02 Наземные транспортно-технологические комплексы,direction_23.03.03 Эксплуатация транспортно-технологических машин и комплексов,direction_23.05.01 Наземные транспортно-технологические 
средства,direction_27.03.00 Управление в технических системах,direction_27.03.01 Стандартизация и метрология,direction_27.03.02 Управление качеством,direction_27.03.03 Системный анализ и управление,direction_27.03.04 Управление в технических системах,direction_27.03.05 Инноватика,direction_28.03.03 Наноматериалы,direction_41.03.06 Публичная политика и социальные науки,direction_43.03.00 Сервис и туризм,direction_43.03.01 Сервис,direction_43.03.03 Гостиничное дело,direction_45.03.04 Интеллектуальные системы в гуманитарной сфере
7,0,1,3.733,173.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0,1,3.314,148.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,0,1,4.333,196.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0,0,4.667,222.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0,0,4.667,220.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Сохраняем 'обученный' one hot encoder:

In [209]:
# Persist the fitted direction encoder to "one_hot_encoder.joblib".
joblib.dump(ohe, "one_hot_encoder.joblib")

['one_hot_encoder.joblib']

Вывод количества NaN-значений:

In [210]:
# Total number of missing (NaN) cells across the whole frame.
df.isna().sum().sum()

294

Заполнение NaN-значений медианными значениями:

In [211]:
# Replace every missing value with the median of its own column, then
# re-count the NaNs to confirm none remain.
column_medians = df.median()
df = df.fillna(column_medians)
df.isna().sum().sum()

0

In [212]:
# Preview the first rows of the frame after imputation.
df.head(5)

Unnamed: 0,year,gender,gpa,points,result,direction_01.03.02 Прикладная математика и информатика,direction_02.03.01 Математика и компьютерные науки,direction_05.03.01 Геология,direction_08.03.01 Строительство,direction_08.05.00 Техника и технологии строительства,direction_08.05.01 Строительство уникальных зданий и сооружений,"direction_08.05.02 Строительство, эксплуатация, восстановление и техническое прикрытие автомобильных дорог, мостов и тоннелей",direction_09.03.00 Информатика и вычислительная техника,direction_09.03.01 Информатика и вычислительная техника,direction_09.03.02 Информационные системы и технологии,direction_12.03.01 Приборостроение,direction_12.03.04 Биотехнические системы и технологии,direction_13.03.01 Теплоэнергетика и теплотехника,direction_13.03.02 Электроэнергетика и электротехника,direction_15.03.01 Машиностроение,direction_15.03.04 Автоматизация технологических процессов и производств,direction_15.03.05 Конструкторско-технологическое обеспечение машиностроительных производств,direction_15.03.06 Мехатроника и робототехника,direction_18.03.00 Химические технологии,direction_18.03.01 Химическая технология,"direction_18.03.02 Энерго- и ресурсосберегающие процессы в химической технологии, нефтехимии и биотехнологии",direction_19.03.04 Технология продукции и организация общественного питания,direction_20.03.01 Техносферная безопасность,direction_21.03.01 Нефтегазовое дело,direction_21.03.02 Землеустройство и кадастры,"direction_21.05.00 Прикладная геология, горное дело, нефтегазовое дело и геодезия",direction_21.05.01 Прикладная геодезия,direction_21.05.02 Прикладная геология,direction_21.05.03 Технология геологической разведки,direction_22.03.01 Материаловедение и технологии материалов,direction_23.03.01 Технология транспортных процессов,direction_23.03.02 Наземные транспортно-технологические комплексы,direction_23.03.03 Эксплуатация транспортно-технологических машин и комплексов,direction_23.05.01 Наземные транспортно-технологические 
средства,direction_27.03.00 Управление в технических системах,direction_27.03.01 Стандартизация и метрология,direction_27.03.02 Управление качеством,direction_27.03.03 Системный анализ и управление,direction_27.03.04 Управление в технических системах,direction_27.03.05 Инноватика,direction_28.03.03 Наноматериалы,direction_41.03.06 Публичная политика и социальные науки,direction_43.03.00 Сервис и туризм,direction_43.03.01 Сервис,direction_43.03.03 Гостиничное дело,direction_45.03.04 Интеллектуальные системы в гуманитарной сфере
7,0,1,3.733,173.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0,1,3.314,148.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,0,1,4.333,196.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0,0,4.667,222.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0,0,4.667,220.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [213]:
# Write the processed frame to disk. to_csv also writes the DataFrame index
# by default, so the file gets an extra unnamed first column.
# NOTE(review): pass index=False if downstream readers do not need it.
df.to_csv("ТИУ Бакалавриат Абитуриенты Processed.csv")