In [1]:
# load
import pandas as pd
import lightgbm

def _read_indexed(path):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Training features with the target attached; "mark" is matched to the
# feature rows through the shared index.
data = _read_indexed("X_train.csv").assign(mark=_read_indexed("y_train.csv")["mark"])

# Unlabelled rows to predict for the submission.
X_validation = _read_indexed("X_test.csv")

# Per-student metadata (only displayed further down in this notebook).
stud_info = _read_indexed("studs_info.csv")

In [2]:
# Map the original (Russian / upper-case) column names to short English ones.
field_map = {
    "STD_ID": "stud",
    "НАПРАВЛЕНИЕ": "profession",
    "ГОД": "year",
    "АТТЕСТАЦИЯ": "exam_type",
    "ДИСЦИПЛИНА": "discipline",
    "КУРС": "course",
    "СЕМЕСТР": "semester",
}

# Each column is popped and re-inserted under its new name, so the renamed
# columns end up at the right-hand side of the frame in field_map order.
for frame in (data, X_validation):
    for old_name, new_name in field_map.items():
        frame[new_name] = frame.pop(old_name)

In [3]:
%%capture
# clean up
# Per the notes from exploration, year tracks course plus a constant and
# course tracks ceil(semester / 2), so neither column is treated as adding
# information beyond semester; drop both from the train and validation frames.
fields = ["year", "course"]
for frame in (data, X_validation):
    frame.drop(columns=fields, inplace=True)

In [4]:
# Preview the per-student metadata table loaded from studs_info.csv.
stud_info

Unnamed: 0_level_0,number,Пол,Статус,Дата выпуска,Категория обучения,Форма обучения,Шифр,направление (специальность),Unnamed: 9_level_0,Образование,Дата выдачи,Что именно закончил
STD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
99136,13951.0,Ж,ВЫП,2019-08-31,БП,Д,31.05.01,Лечебное дело,2629.0,СР,2013-06-26 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
99112,13998.0,Ж,ВЫП,2019-08-31,БП,Д,31.05.01,Лечебное дело,8240.0,СР,2012-06-23 00:00:00,"МОУ СОШ N 2, ПЕСТОВО"
99109,13999.0,Ж,ВЫП,2019-08-31,БП,Д,31.05.01,Лечебное дело,2314.0,СР,2013-06-25 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
99125,13953.0,Ж,ВЫП,2019-08-31,БП,Д,31.05.01,Лечебное дело,2345.0,СР,2013-06-23 00:00:00,"Гимназия ""ГАРМОНИЯ"""
100676,13950.0,М,ВЫП,2019-08-31,БП,Д,31.05.01,Лечебное дело,2650.0,СР,2012-06-29 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
...,...,...,...,...,...,...,...,...,...,...,...,...
100678,978.0,Ж,СТ,,БП,Д,31.05.01,Лечебное дело,2388.0,СР,2013-06-21 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
100663,1021.0,Ж,СТ,,ДОГ,Д,31.05.01,Лечебное дело,2554.0,СР,2013-06-27 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
191630,6447.0,Ж,СТ,,ДОГ,Д,31.05.01,Лечебное дело,6692.0,СР,2014-05-31 00:00:00,ИНОГОРОДНЯЯ СРЕДНЯЯ ШКОЛА
191625,3765.0,Ж,СТ,,ДОГ,Д,31.05.01,Лечебное дело,,,,


In [5]:
# Preview the training frame after renaming columns and dropping year/course.
data

Unnamed: 0,mark,stud,profession,exam_type,discipline,semester
51081,5,175711,38.03.02 Менеджмент,Экзамен,Маркетинг,4.0
72921,5,100647,31.05.01 Лечебное дело,Экзамен,"Пропедевтика внутренних болезней, лучевая диаг...",5.0
96438,5,199311,31.05.03 Стоматология,Экзамен,Химия,1.0
273044,4,162002,38.03.02 Менеджмент,Экзамен,Статистика,3.0
371309,3,108384,31.05.01 Лечебное дело,Экзамен,Педиатрия,9.0
...,...,...,...,...,...,...
224856,3,176141,40.03.01 Юриспруденция,Курсовая работа,Конституционное право,2.0
260998,3,182794,31.05.01 Лечебное дело,Экзамен,Нормальная физиология,4.0
57245,4,162391,31.05.01 Лечебное дело,Дифференцированный зачет,Иностранный язык,2.0
218529,4,202479,38.03.01 Экономика,Дифференцированный зачет,Информатика,1.0


In [6]:
# Distinct assessment types present in the training data.
set(data["exam_type"].unique())

{'Дифференцированный зачет',
 'Зачет',
 'Курсовая работа',
 'Курсовой проект',
 'Экзамен'}

In [7]:
# data[data["exam_type"] == "Экзамен"].sort_values(by=["mark"]).groupby(["stud"]).count()

In [8]:
# encode labels
from sklearn import preprocessing

# Integer-encode the categorical text columns. Each encoder is fitted on the
# union of train and validation values so both frames share one code table
# and transform() never meets a label it has not seen.
fields = "discipline", "profession", "exam_type"
le_s = [preprocessing.LabelEncoder() for _ in fields]
for field_name, le in zip(fields, le_s):
    le.fit(pd.concat([data[field_name], X_validation[field_name]]))

    data[field_name] = le.transform(data[field_name])
    X_validation[field_name] = le.transform(X_validation[field_name])

# semester is read as float (e.g. 4.0): cast to int and shift to 0-based.
# The shift must be applied to BOTH frames; previously only the training
# frame was shifted, so the model saw validation semesters that were off by
# one relative to what it was trained on.
# NOTE: this cell is not idempotent - run it once per fresh kernel.
field_name = "semester"
for frame in (data, X_validation):
    frame[field_name] = frame[field_name].astype(int) - 1

In [9]:
# means
# Target-encode stud / profession / discipline: for each one, add a
# "<field>_mean" column holding the average mark of that group in the
# training data. Validation rows whose key never occurs in the training
# data get NaN from map().
# NOTE(review): the averages are computed over all labelled rows, including
# the rows later held out by train_test_split in In [14] (and each row's own
# mark), so the hold-out metrics are probably optimistic - confirm whether
# that is acceptable.
fields = "stud", "profession", "discipline"
for field_name in fields:
    # Select "mark" before aggregating: only that column is needed, and
    # averaging the whole frame would fail on any non-numeric column.
    mean_mark = data.groupby(field_name)["mark"].mean()

    mean_name = field_name + "_mean"
    data[mean_name] = data[field_name].map(mean_mark)
    X_validation[mean_name] = X_validation[field_name].map(mean_mark)

In [10]:
# Inspect the training frame after label encoding and adding the *_mean features.
data

Unnamed: 0,mark,stud,profession,exam_type,discipline,semester,stud_mean,profession_mean,discipline_mean
51081,5,175711,36,4,408,3,4.181818,4.157895,4.326389
72921,5,100647,24,4,946,4,4.116279,4.205835,4.266444
96438,5,199311,25,4,1326,0,4.750000,4.119935,3.569940
273044,4,162002,36,4,1100,2,4.172414,4.157895,3.721519
371309,3,108384,24,4,713,8,3.542857,4.205835,4.333919
...,...,...,...,...,...,...,...,...,...
224856,3,176141,40,2,364,1,3.615385,3.951955,3.833866
260998,3,182794,24,4,556,3,4.052632,4.205835,3.693413
57245,4,162391,24,0,228,1,4.458333,4.205835,4.153555
218529,4,202479,35,0,238,0,4.187500,4.114599,3.984946


In [11]:
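# Disabled ad-hoc inspection queries, kept for reference:
# data[(data["stud"] == 171886) ].sort_values(by=["semester"])
# data[(data["stud"] == 100647) ].sort_values(by=["semester"])
# data[(data["year"] != data["course"]) ]  # stale: "year" and "course" were dropped in In [3]
# data[data["semester"] == 0 ]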

In [12]:
# create dummy variables
# One-hot encoding is currently switched off (empty column list), so
# get_dummies passes every column through unchanged; earlier experiments
# used "exam_type", "discipline" and "profession" here.
columns = []
data, X_validation = (
    pd.get_dummies(frame, columns=columns) for frame in (data, X_validation)
)

# remove unneeded data
# The raw identifiers are represented by their *_mean features from here on.
fields = ["stud", "discipline", "profession"]
data = data.drop(columns=fields)
X_validation = X_validation.drop(columns=fields)

In [13]:
# Final modelling frame: the target "mark" plus the feature columns fed to the model.
data

Unnamed: 0,mark,exam_type,semester,stud_mean,profession_mean,discipline_mean
51081,5,4,3,4.181818,4.157895,4.326389
72921,5,4,4,4.116279,4.205835,4.266444
96438,5,4,0,4.750000,4.119935,3.569940
273044,4,4,2,4.172414,4.157895,3.721519
371309,3,4,8,3.542857,4.205835,4.333919
...,...,...,...,...,...,...
224856,3,2,1,3.615385,3.951955,3.833866
260998,3,4,3,4.052632,4.205835,3.693413
57245,4,0,1,4.458333,4.205835,4.153555
218529,4,0,0,4.187500,4.114599,3.984946


In [14]:
from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split, and therefore the reported metrics, are
# the same on every Restart & Run All.
SEED = 42

# pop() removes the target from `data` in place, leaving only feature
# columns; re-running this cell on its own therefore raises KeyError.
marks = data.pop("mark")
X_train, X_test, y_train, y_test = train_test_split(
    data, marks, shuffle=True, test_size=0.2, random_state=SEED
)

# create and train the gradient-boosted decision tree model
lgb = lightgbm.LGBMRegressor(n_estimators=1000, verbose=1, random_state=SEED)

lgb.fit(X_train, y_train)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 579
[LightGBM] [Info] Number of data points in the train set: 84046, number of used features: 5
[LightGBM] [Info] Start training from score 4.184827


LGBMRegressor(n_estimators=1000, verbose=1)

In [15]:
from sklearn.metrics import mean_absolute_error, r2_score

# Hold-out metrics: predict once and reuse the result for both scores
# instead of running the 1000-tree model over X_test twice.
test_pred = lgb.predict(X_test)
print(mean_absolute_error(y_test, test_pred))
print(r2_score(y_test, test_pred))

# Fill the submission template with predictions for the unlabelled rows.
# NOTE(review): the prediction array is assigned positionally, so this assumes
# sample_submission.csv lists rows in the same order as X_test.csv - confirm.
y_pred = pd.read_csv("sample_submission.csv", index_col=0)
y_pred["mark"] = lgb.predict(X_validation)

y_pred.to_csv("baseline_submission.csv")


0.38796153220213675
0.5552438757295284
