# Анализ факторов, влияющих на успеваемость студентов

## Подготовка к работе

In [1]:
# Импорт необходимых для работы библиотек

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.feature_selection import RFE

In [2]:
# Отключение предупреждений в итоговом отчёте

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Выгрузка предобработанной версии датасета `jamb` 

jamb = pd.read_csv('/Users/mimikhailova/ВШЭ/jamb.csv')

In [4]:
jamb.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,Unknown,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [5]:
jamb.columns

Index(['jamb_score', 'study_hours_per_week', 'attendance_rate',
       'teacher_quality', 'distance_to_school', 'school_type',
       'school_location', 'extra_tutorials', 'access_to_learning_materials',
       'parent_involvement', 'it_knowledge', 'age', 'gender',
       'socioeconomic_status', 'parent_education_level',
       'assignments_completed'],
      dtype='object')

## Построение модели. Оценка качества модели

#### Разделим данные на фичи и таргет:

In [6]:
X = jamb.drop(columns=['jamb_score'])
y = jamb['jamb_score']

#### Разобьем выборку на обучающую и тестовую в соотношении 80/20: 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Обучим модель только на вещественных признаках:

In [8]:
X_train_float = X_train.drop(columns=['school_type', 'school_location', 'extra_tutorials', \
                                      'access_to_learning_materials', 'parent_involvement',\
                                     'it_knowledge', 'gender', 'socioeconomic_status', 'parent_education_level'])

In [9]:
X_train_float.shape

(4000, 6)

In [10]:
X_test_float = X_test.drop(columns=['school_type', 'school_location', 'extra_tutorials', \
                                      'access_to_learning_materials', 'parent_involvement',\
                                     'it_knowledge', 'gender', 'socioeconomic_status', 'parent_education_level']) 

In [11]:
X_test_float.shape

(1000, 6)

Построим модель линейной регрессии:

In [12]:
# Стандартизация данных

scaler = StandardScaler()
scaler.fit(X_train_float)
X_train_fl_st = scaler.transform(X_train_float)
X_test_fl_st = scaler.transform(X_test_float)

# Алгоритм для модели и обучение

lr_model = LinearRegression()
lr_model.fit(X_train_fl_st, y_train)

# Подсчет предсказаний

y_pred_lr_st_train = lr_model.predict(X_train_fl_st)
y_pred_lr_st_test = lr_model.predict(X_test_fl_st)

# Подсчет метрик модели

print('MSE LR train: {:.4f}'.format(mean_squared_error(y_train, y_pred_lr_st_train)))
print('MSE LR test: {:.4f}'.format(mean_squared_error(y_test, y_pred_lr_st_test)))
print('R2 LR train: {:.4f}'.format(r2_score(y_train, y_pred_lr_st_train)))
print('R2 LR test: {:.4f}'.format(r2_score(y_test, y_pred_lr_st_test)))

MSE LR train: 1595.9340
MSE LR test: 1658.6615
R2 LR train: 0.2843
R2 LR test: 0.3127


* Качество модели совсем неважное и нужно попробовать его улучшить 

#### Добавим категориальные фичи, закодировав их с помощью ОНЕ:

In [13]:
# Применяем для обучающей выборки 

ohe = OneHotEncoder(drop='first', sparse=False)

X_train_ohe = ohe.fit_transform(X_train[['school_type', 'school_location', 'extra_tutorials',
                                           'access_to_learning_materials', 'parent_involvement',
                                           'it_knowledge', 'gender', 'socioeconomic_status', 'parent_education_level']])
X_train_ohe_df = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names_out())

# Применяем для тестовой выборки 

X_test_ohe = ohe.transform(X_test[['school_type', 'school_location', 'extra_tutorials',
                                     'access_to_learning_materials', 'parent_involvement',
                                     'it_knowledge', 'gender', 'socioeconomic_status', 'parent_education_level']])
X_test_ohe_df = pd.DataFrame(X_test_ohe, columns=ohe.get_feature_names_out())

In [14]:
# Конкатенация

X_train_new = pd.concat([X_train[['study_hours_per_week', 'attendance_rate', 'teacher_quality',
                                    'distance_to_school', 'age', 'assignments_completed']].reset_index(drop=True), 
                          X_train_ohe_df.reset_index(drop=True)], axis=1)

X_test_new = pd.concat([X_test[['study_hours_per_week', 'attendance_rate', 'teacher_quality',
                                    'distance_to_school', 'age', 'assignments_completed']].reset_index(drop=True), 
                          X_test_ohe_df.reset_index(drop=True)], axis=1)

Вновь построим модель линейной регрессии:

In [15]:
# Стандартизация данных

scaler = StandardScaler()
scaler.fit(X_train_new)
X_train_new_st = scaler.transform(X_train_new)
X_test_new_st = scaler.transform(X_test_new)

# Алгоритм для модели и обучение

lr_model = LinearRegression()
lr_model.fit(X_train_new_st, y_train)

# Подсчет предсказаний

y_pred_new_train = lr_model.predict(X_train_new_st)
y_pred_new_test = lr_model.predict(X_test_new_st)

# Подсчет метрик модели

print('MSE LR train with cat: {:.4f}'.format(mean_squared_error(y_train, y_pred_new_train)))
print('MSE LR test with cat: {:.4f}'.format(mean_squared_error(y_test, y_pred_new_test)))
print('R2 LR train with cat: {:.4f}'.format(r2_score(y_train, y_pred_new_train)))
print('R2 LR test with cat: {:.4f}'.format(r2_score(y_test, y_pred_new_test)))

MSE LR train with cat: 1474.7224
MSE LR test with cat: 1527.2309
R2 LR train with cat: 0.3387
R2 LR test with cat: 0.3672


* Стало лучше, но все еще плохо

#### Поработаем над улучшением  модели

* Применим метод регуляризации Lasso с подбором параметра alpha:

In [16]:
# Подбор гиперпараметров через GridSearchCV

param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

base_model = Lasso()

grid = GridSearchCV(base_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid.fit(X_train_new_st, y_train)

print(f'Лучшие параметры: {grid.best_params_}')

Лучшие параметры: {'alpha': 0.01}


In [17]:
# Алгоритм модели и обучение

lasso_model = Lasso(alpha=0.01)  
lasso_model.fit(X_train_new_st, y_train)

# Подсчет предсказаний

y_pred_train_lasso = lasso_model.predict(X_train_new_st)
y_pred_test_lasso = lasso_model.predict(X_test_new_st)

# Подсчет метрик модели

print('MSE lasso train: {:.4f}'.format(mean_squared_error(y_train, y_pred_train_lasso)))
print('MSE lasso test: {:.4f}'.format(mean_squared_error(y_test, y_pred_test_lasso)))
print('R2 lasso train: {:.4f}'.format(r2_score(y_train, y_pred_train_lasso)))
print('R2 lasso test: {:.4f}'.format(r2_score(y_test, y_pred_test_lasso)))

MSE lasso train: 1474.7247
MSE lasso test: 1527.2569
R2 lasso train: 0.3387
R2 lasso test: 0.3672


* Значимых изменений нет

#### Применим градиентный бустинг, предварительно подобрав парамеры с помощью GridSearchCV

In [18]:
# Подбор гиперпараметров через GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]}

base_model = GradientBoostingRegressor(random_state=42)

grid = GridSearchCV(base_model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid.fit(X_train_new, y_train)

print(f'Лучшие параметры: {grid.best_params_}')

Лучшие параметры: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}


In [19]:
# Алгоритм модели и обучение

gb_model = GradientBoostingRegressor(random_state=42, learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=100)
gb_model.fit(X_train_new, y_train)

# Подсчет предсказаний

y_pred_gb_train = gb_model.predict(X_train_new)
y_pred_gb_test = gb_model.predict(X_test_new)

# Подсчет метрик модели

print('MSE GB train: {:.2f}'.format(mean_squared_error(y_train, y_pred_gb_train)))
print('MSE GB test: {:.2f}'.format(mean_squared_error(y_test, y_pred_gb_test)))
print('R2 GB train: {:.2f}'.format(r2_score(y_train, y_pred_gb_train)))
print('R2 GB test: {:.2f}'.format(r2_score(y_test, y_pred_gb_test)))

MSE GB train: 1312.91
MSE GB test: 1598.74
R2 GB train: 0.41
R2 GB test: 0.34


* Качество снизилось

#### Проанализируем важность признаков c помощью модели Random Forest, которая имеет встроенный метод для оценки:

In [20]:
# Названия признаков

feature_names = X_train_new.columns

# Подсчет предсказаний

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_new_st, y_train)

# Получение важности признаков

importances = rf_model.feature_importances_
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
3,distance_to_school,0.139636
1,attendance_rate,0.125508
0,study_hours_per_week,0.122539
4,age,0.093863
2,teacher_quality,0.059802
5,assignments_completed,0.048328
14,gender_Male,0.037609
8,extra_tutorials_Yes,0.035439
7,school_location_Urban,0.035268
9,access_to_learning_materials_Yes,0.032842


Выведем топ-10 признаков:

In [21]:
top_features = importance_df.head(10)['feature'].tolist()
top_features

['study_hours_per_week',
 'attendance_rate',
 'teacher_quality',
 'distance_to_school',
 'age',
 'assignments_completed',
 'school_type_Public',
 'school_location_Urban',
 'extra_tutorials_Yes',
 'access_to_learning_materials_Yes']

Обучим модель линейной регрессии на топ-10 признаков:

In [22]:
# Отбираем только нужные признаки

X_train_top = X_train_new[top_features]
X_test_top = X_test_new[top_features]

# Стандартизация данных

scaler = StandardScaler()
scaler.fit(X_train_top)
X_train_top_st = scaler.transform(X_train_top)
X_test_top_st = scaler.transform(X_test_top)

# Алгоритм для модели и обучение

lr_model = LinearRegression()
lr_model.fit(X_train_top_st, y_train)

# Подсчет предсказаний

y_pred_top_train = lr_model.predict(X_train_top_st)
y_pred_top_test = lr_model.predict(X_test_top_st)

# Подсчет метрик модели

print('MSE LR train top-10: {:.4f}'.format(mean_squared_error(y_train, y_pred_top_train)))
print('MSE LR test top-10: {:.4f}'.format(mean_squared_error(y_test, y_pred_top_test)))
print('R2 LR train top-10: {:.4f}'.format(r2_score(y_train, y_pred_top_train)))
print('R2 LR test top-10: {:.4f}'.format(r2_score(y_test, y_pred_top_test)))

MSE LR train top-10: 1582.8187
MSE LR test top-10: 1642.1375
R2 LR train top-10: 0.2902
R2 LR test top-10: 0.3196


* Качество снизилось