In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

df = pd.read_csv('jamb_exam_results.csv')

df.columns = df.columns.str.lower().str.replace(' ', '_')

df = df.drop(columns=['student_id'])

df = df.fillna(0)

X = df.drop(columns=['jamb_score'])
y = df['jamb_score']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

dv = DictVectorizer(sparse=True)
X_train_dict = X_train.to_dict(orient='records')
X_valid_dict = X_valid.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')

X_train_vec = dv.fit_transform(X_train_dict)
X_valid_vec = dv.transform(X_valid_dict)
X_test_vec = dv.transform(X_test_dict)


Вопрос 1: Обучение дерева решений с max_depth=1

In [67]:
from sklearn.tree import DecisionTreeRegressor

tree_model = DecisionTreeRegressor(max_depth=1, random_state=1)
tree_model.fit(X_train_vec, y_train)

best_feature = dv.get_feature_names_out()[tree_model.feature_importances_.argmax()]
best_feature


'study_hours_per_week'

Вопрос 2

In [68]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train_vec, y_train)

y_pred_valid = rf_model.predict(X_valid_vec)

rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
rmse


43.157758977963624

Вопрос 3

In [69]:

n_estimators_range = range(10, 201, 10)
rmse_scores = []

for n_estimators in n_estimators_range:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf_model.fit(X_train_vec, y_train)
    y_pred_valid = rf_model.predict(X_valid_vec)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    rmse_scores.append(rmse)

best_n_estimators = n_estimators_range[np.argmin(rmse_scores)]
best_n_estimators


180

Вопрос 4

In [70]:

max_depth_range = [10, 15, 20, 25]
rmse_scores_depth = {}

for max_depth in max_depth_range:
    rmse_scores_n_estimators = []
    for n_estimators in n_estimators_range:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf_model.fit(X_train_vec, y_train)
        y_pred_valid = rf_model.predict(X_valid_vec)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        rmse_scores_n_estimators.append(rmse)
    rmse_scores_depth[max_depth] = np.mean(rmse_scores_n_estimators)


best_max_depth = min(rmse_scores_depth, key=rmse_scores_depth.get)
best_max_depth


10

Вопрос 5

In [None]:

rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train_vec, y_train)


importances = rf_model.feature_importances_


most_important_feature_index = np.argmax(importances)
most_important_feature = dv.get_feature_names_out()[most_important_feature_index]
most_important_feature


Вопрос 6

In [71]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error


train_dmatrix = xgb.DMatrix(X_train_vec, label=y_train)
valid_dmatrix = xgb.DMatrix(X_valid_vec, label=y_valid)

watchlist = [(train_dmatrix, 'train'), (valid_dmatrix, 'valid')]

params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

xgb_model = xgb.train(params, train_dmatrix, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

y_pred_valid_xgb = xgb_model.predict(valid_dmatrix)
rmse_xgb = np.sqrt(mean_squared_error(y_valid, y_pred_valid_xgb))
rmse_xgb

params['eta'] = 0.1
xgb_model_eta_01 = xgb.train(params, train_dmatrix, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

y_pred_valid_xgb_01 = xgb_model_eta_01.predict(valid_dmatrix)
rmse_xgb_01 = np.sqrt(mean_squared_error(y_valid, y_pred_valid_xgb_01))
rmse_xgb_01


[0]	train-rmse:42.84835	valid-rmse:44.52338
[1]	train-rmse:39.96423	valid-rmse:42.83406
[2]	train-rmse:37.91231	valid-rmse:41.62607
[3]	train-rmse:36.51126	valid-rmse:41.25491
[4]	train-rmse:35.52212	valid-rmse:40.84075
[5]	train-rmse:34.77126	valid-rmse:40.71677
[6]	train-rmse:34.03898	valid-rmse:40.72669
[7]	train-rmse:33.62820	valid-rmse:40.68822
[8]	train-rmse:32.94729	valid-rmse:40.81273
[9]	train-rmse:32.27703	valid-rmse:40.84939
[10]	train-rmse:31.73818	valid-rmse:40.83759
[11]	train-rmse:31.31360	valid-rmse:40.80575
[12]	train-rmse:30.72949	valid-rmse:40.84238
[13]	train-rmse:30.11486	valid-rmse:40.96020
[14]	train-rmse:29.43538	valid-rmse:40.98775
[15]	train-rmse:29.23018	valid-rmse:41.04798
[16]	train-rmse:28.64113	valid-rmse:41.08375
[0]	train-rmse:45.64414	valid-rmse:46.63724
[1]	train-rmse:44.26862	valid-rmse:45.58724
[2]	train-rmse:43.08569	valid-rmse:44.76209
[3]	train-rmse:42.05227	valid-rmse:44.02498
[4]	train-rmse:41.10533	valid-rmse:43.40640
[5]	train-rmse:40.28309	v

40.25735690723187