In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()


df = pd.read_excel('final_dataset.xlsx', index_col=0)
ord = OrdinalEncoder()
df = df.drop('name', axis=1)
df['code'] = ord.fit_transform(df[['code']])

Y = df[['pop_growth']]
X = df.drop('pop_growth', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

In [69]:
from sklearn.linear_model import RANSACRegressor, LinearRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pickle
import math

# Set RANSAC hyperparameters
ransac = RANSACRegressor(LinearRegression(),
		max_trials=4, 		# Number of Iterations
		min_samples=2, 		# Minimum size of the sample
		loss='absolute_error', 	# Metrics for loss
		residual_threshold=10 	# Threshold
		)

param_grid = {
	'max_trials': [4, 5, 6, 7],
    'min_samples': [1, 2, 3, 4, 5]
}

cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
grid = GridSearchCV(ransac, param_grid, refit = True, verbose = 1,n_jobs=-1,cv=cv)
grid.fit(X_train, y_train)
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('ransac_reg', 'wb'))
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))


lin = LinearRegression()
lin.fit(X_train, y_train)
y_pred = lin.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))
pickle.dump(grid.best_estimator_, open('lin_reg', 'wb'))

Fitting 15 folds for each of 20 candidates, totalling 300 fits
{'max_trials': 7, 'min_samples': 1}
0.29710013131382546
0.29734631076158274


In [70]:
from sklearn.preprocessing import PolynomialFeatures

for i in range(1, 3):
    poly = PolynomialFeatures(degree= i + 1)
    poly_features = poly.fit_transform(X)
    X_train_PR, X_test_PR, y_train_PR, y_test_PR = train_test_split(poly_features, 
                                        Y, test_size = 0.2, random_state=42)
    lin.fit(X_train_PR, y_train_PR)
    y_pred_PR = lin.predict(X_test_PR)
    pickle.dump(grid.best_estimator_, open('poly_' + str(i) + '_reg', 'wb'))
    print(str(math.sqrt(mean_squared_error(y_test, y_pred))) + '  ' + str(i + 1))

0.29734631076158274  2
0.29734631076158274  3


In [71]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedStratifiedKFold


param_grid = {
    'max_depth': [i for i in range(10, 20)],
    'min_samples_split': [2, 3, 4]
}

tree=DecisionTreeRegressor()
grid = GridSearchCV(tree, param_grid, refit = True, verbose = 1,n_jobs=-1,cv=cv)
grid.fit(X_train, y_train)
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('dec_tree_reg', 'wb'))
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 15 folds for each of 30 candidates, totalling 450 fits
{'max_depth': 18, 'min_samples_split': 2}
0.41371067627798636


In [72]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedStratifiedKFold


param_grid = {
    'n_estimators': [95, 100, 105, 110],
    'min_samples_split': [2, 3, 4],
    'max_depth': range(8, 13)
}

forest = RandomForestRegressor()
grid = GridSearchCV(forest, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order = 'C'))
pickle.dump(grid.best_estimator_, open('rand_forest_reg', 'wb'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 15 folds for each of 60 candidates, totalling 900 fits
{'max_depth': 12, 'min_samples_split': 2, 'n_estimators': 110}
0.37959745917246424


In [73]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RepeatedStratifiedKFold

param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25]
}

model = KNeighborsRegressor()
grid = GridSearchCV(model, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
pickle.dump(grid.best_estimator_, open('knn_reg', 'wb'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 15 folds for each of 5 candidates, totalling 75 fits
{'n_neighbors': 5}
1.0228246770458636


In [74]:
##SVM
from sklearn.svm import SVR
from sklearn.model_selection import RepeatedStratifiedKFold


param_grid = {
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'C': [0.5, 1.0, 2.0]
}

svmregressor = SVR()
grid = GridSearchCV(svmregressor, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
print(grid.best_params_)
pickle.dump(grid.best_estimator_, open('svm_reg', 'wb'))
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))


Fitting 15 folds for each of 18 candidates, totalling 270 fits
{'C': 2.0, 'degree': 2, 'gamma': 'scale'}
1.1704511959029027


In [75]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np

model = XGBRegressor(n_estimators=1000, max_depth=10, eta=0.1, subsample=0.7,
                         colsample_bytree=0.8)

param_grid = {
    'base_score': [0.5, 1, 1.5, 2], 
    'max_depth': [3, 4, 5],
    'colsample_bylevel': [1, 2, 3]
}

grid = GridSearchCV(model, param_grid, refit = True, verbose = 1,n_jobs=-1, cv=cv)
grid.fit(X_train, np.ravel(y_train, order='C'))
pickle.dump(grid.best_estimator_, open('xgb_reg', 'wb'))
print(grid.best_params_)
y_pred = grid.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, y_pred)))

Fitting 15 folds for each of 36 candidates, totalling 540 fits


360 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/sklearn.py", line 1051, in fit
    self._Booster = train(
  File "/media/majed/my_partition/venv/lib/python3.8/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)


{'base_score': 0.5, 'colsample_bylevel': 1, 'max_depth': 5}
0.23416913139135476
