In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
coll_dist = pd.read_csv("data/CollegeDistanceCleaned.csv")

In [3]:
coll_dist.head()

Unnamed: 0,gender,ethnicity,score,fcollege,mcollege,home,urban,unemp,wage,distance,tuition,education,income,region
0,1,2,39.150002,1,0,1,1,6.2,8.09,0.2,0.88915,12,0,0
1,0,2,48.869999,0,0,1,1,6.2,8.09,0.2,0.88915,12,1,0
2,1,2,48.740002,0,0,1,1,6.2,8.09,0.2,0.88915,12,1,0
3,1,0,40.400002,0,0,1,1,6.2,8.09,0.2,0.88915,12,1,0
4,0,2,40.48,0,0,0,1,5.6,8.09,0.4,0.88915,13,1,0


In [3]:
X = coll_dist.drop("score", axis=1)
y = coll_dist["score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

 # DecisionTree

In [11]:
tree_model = DecisionTreeRegressor(criterion="squared_error", max_depth=None)
tree_model.fit(X_train, y_train)

In [19]:
y_pred = tree_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MAE: 7.77030587229883


In [20]:
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

MSE: 94.06449287719879


In [21]:
tree_model_2 = DecisionTreeRegressor(criterion="absolute_error", max_depth=10)
tree_model_2.fit(X_train, y_train)
y_pred = tree_model_2.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

MAE: 6.636909354085158
MSE: 69.73311455519641


In [36]:
tree_model_2 = DecisionTreeRegressor(criterion="absolute_error", max_depth=4)
tree_model_2.fit(X_train, y_train)
y_pred = tree_model_2.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

MAE: 6.1114398879843925
MSE: 55.872398998729544


In [40]:
print(f"r2 score: {r2_score(y_test, y_pred)}")

r2 score: 0.26978943190142146


# Random Forest

In [41]:
random_forest_model = RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=None)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")

MAE: 6.293460399445205
MSE: 59.68525949408347
r2 score: 0.21995819003088135


In [45]:
random_forest_model_2 = RandomForestRegressor(n_estimators=150, criterion="absolute_error", max_depth=10)
random_forest_model_2.fit(X_train, y_train)
y_pred = random_forest_model_2.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")

MAE: 6.00735368290028
MSE: 54.13116539506546
r2 score: 0.29254605595389505


In [46]:
random_forest_model_3 = RandomForestRegressor(n_estimators=150, criterion="absolute_error", max_depth=4)
random_forest_model_3.fit(X_train, y_train)
y_pred = random_forest_model_3.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")

MAE: 6.06551217414491
MSE: 54.7533372022301
r2 score: 0.2844147346412923


# Linear Regression

In [47]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")

MAE: 5.999723186927585
MSE: 53.029970954561485
r2 score: 0.306937845681873


# Gradient Boosting Machine

In [5]:
def within_10_percent_accuracy(y_true, y_pred):
    tolerance = 0.1
    within_tolerance = np.abs(y_pred - y_true) <= (tolerance * np.abs(y_true))
    accuracy = np.mean(within_tolerance) * 100
    return accuracy

In [9]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

MAE: 5.642095694628343
MSE: 48.19469718805719
r2 score: 0.35878482763328656
Accuracy within +-10%: 51.0548523206751%


In [14]:
gbr_2 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3)
gbr_2.fit(X_train, y_train)
y_pred = gbr_2.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

MAE: 5.636163185341653
MSE: 48.40384983659755
r2 score: 0.35600211792849357
Accuracy within +-10%: 51.371308016877634%


In [25]:
X_train.head()

Unnamed: 0,gender,ethnicity,fcollege,mcollege,home,urban,unemp,wage,distance,tuition,education,income,region
2142,0,2,0,0,1,0,14.0,12.15,3.5,1.16628,13,0,0
3042,1,1,0,0,1,1,5.9,9.92,0.1,0.45497,12,1,0
3033,0,1,0,0,1,0,12.8,9.92,0.1,0.45497,16,1,0
4274,0,2,0,0,1,1,7.2,8.89,0.3,0.25751,12,1,1
3469,1,2,0,0,1,1,14.0,12.15,0.8,1.16628,12,0,0


In [28]:
X_train_droped = X_train.drop(["ethnicity", "mcollege", "fcollege", "urban", "home"], axis=1)
X_test_droped = X_test.drop(["ethnicity", "mcollege", "fcollege", "urban", "home"], axis=1)

In [29]:
X_train_droped.head()

Unnamed: 0,gender,unemp,wage,distance,tuition,education,income,region
2142,0,14.0,12.15,3.5,1.16628,13,0,0
3042,1,5.9,9.92,0.1,0.45497,12,1,0
3033,0,12.8,9.92,0.1,0.45497,16,1,0
4274,0,7.2,8.89,0.3,0.25751,12,1,1
3469,1,14.0,12.15,0.8,1.16628,12,0,0


In [30]:
X_test_droped.head()

Unnamed: 0,gender,unemp,wage,distance,tuition,education,income,region
3496,0,6.4,9.76,0.3,0.48499,12,1,0
3211,1,8.2,7.54,0.7,0.81871,16,1,0
4470,0,6.8,8.89,1.5,0.25751,12,1,1
369,1,9.5,9.64,2.0,1.15242,14,0,0
242,1,9.8,9.64,4.5,1.15242,15,1,0


In [31]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr.fit(X_train_droped, y_train)
y_pred = gbr.predict(X_test_droped)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

MAE: 6.061017457912364
MSE: 54.88694024911792
r2 score: 0.26974665459168135
Accuracy within +-10%: 46.94092827004219%


In [32]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.7, 0.8, 0.9, 1.0]
}

gbr_3 = GradientBoostingRegressor()

random_search = RandomizedSearchCV(
    estimator=gbr_3,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_gbr = random_search.best_estimator_
y_pred = best_gbr.predict(X_test)

print(f"Best Parameters: {random_search.best_params_}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_depth': 2, 'learning_rate': 0.1}
MAE: 5.657729339562376
MSE: 48.17436695558207
r2 score: 0.35905531493337106
Accuracy within +-10%: 50.42194092827004%


In [33]:
gbr_4 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr_4.fit(X_train, y_train)
y_pred = gbr_4.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

MAE: 5.640711007660928
MSE: 48.19171082574647
r2 score: 0.3588245602374015
Accuracy within +-10%: 51.0548523206751%
[CV] END learning_rate=0.01, max_depth=2, min_samples_leaf=4, min_samples_split=2, n_estimators=200, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.9; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.01, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=300, subsample=0.9; total time=   1.9s
[CV] END learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=0.7; total time=   1.1s
[CV] END learning_rate=0.05, max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.7; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=5, min_samples_leaf=4, min_samples_spli

[CV] END learning_rate=0.01, max_depth=2, min_samples_leaf=4, min_samples_split=2, n_estimators=200, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.9; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.01, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=300, subsample=0.9; total time=   1.8s
[CV] END learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=0.7; total time=   1.1s
[CV] END learning_rate=0.05, max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.7; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=5, min_samples_leaf=4, min_samples_split=2, n_estimators=50, subsample=0.9; total time=   0.4s
[CV] END learning_rate=0.2, max_depth=4, min_samples_leaf=4,

[CV] END learning_rate=0.01, max_depth=2, min_samples_leaf=4, min_samples_split=2, n_estimators=200, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.05, max_depth=2, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.7; total time=   0.4s
[CV] END learning_rate=0.05, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=2, min_samples_leaf=2, min_samples_split=8, n_estimators=300, subsample=0.8; total time=   1.1s
[CV] END learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=0.7; total time=   1.1s
[CV] END learning_rate=0.05, max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.7; total time=   0.3s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=4

[CV] END learning_rate=0.01, max_depth=2, min_samples_leaf=4, min_samples_split=2, n_estimators=200, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estimators=50, subsample=0.9; total time=   0.3s
[CV] END learning_rate=0.05, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=1.0; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=2, min_samples_leaf=2, min_samples_split=8, n_estimators=300, subsample=0.8; total time=   1.1s
[CV] END learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=0.7; total time=   1.1s
[CV] END learning_rate=0.1, max_depth=3, min_samples_leaf=4, 

In [11]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
gbr.fit(X_train, y_train)

In [12]:
y_pred = gbr.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"r2 score: {r2_score(y_test, y_pred)}")
print(f"Accuracy within +-10%: {within_10_percent_accuracy(y_test, y_pred)}%")

MAE: 5.69752798459406
MSE: 50.213437297451335
r2 score: 0.3319630910488348
Accuracy within +-10%: 51.371308016877634%


In [13]:
import joblib
joblib.dump(gbr, "models/gradient_boosting_model.joblib")

['gradient_boosting_model.joblib']