In [3]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot

In [4]:
#Import dataset
dataset = pd.read_csv('weight-height.csv')
dataset.apply(lambda x:sum(x.isnull()))
dataset.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [11]:
dataset.head()
dataset.shape

(10000, 3)

In [12]:
X = dataset.iloc[:,0:2].values
y = dataset.iloc[:,2].values
y

array([241.89356318, 162.31047252, 212.74085556, ..., 128.47531878,
       163.85246135, 113.64910268])

In [13]:
#Categorical split
from sklearn.preprocessing import LabelEncoder
label_x = LabelEncoder()
X[:,0] = label_x.fit_transform(X[:,0])

In [14]:
#Splitting into test and training set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [15]:
#Fitting Linear Regression
from sklearn.linear_model import LinearRegression
regressor_Linear = LinearRegression()
regressor_Linear.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
ypred_linear = regressor_Linear.predict(X_test)


In [18]:
#R-squared and rmse value for Linear Regression
from sklearn.metrics import r2_score, mean_squared_error

print('R-squared train score: {:.2f}'.format(regressor_Linear.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regressor_Linear.score(X_test,y_test)))
print('Root mean squared error(Linear): {:.2f}'.format((mean_squared_error(y_test,ypred_linear))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Linear): 102.40


In [19]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor_Linear, X = X_train, y = y_train, cv = 10)
accuracies.mean()

0.9026761281280986

In [20]:
#Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
X_poly_test = poly_reg.transform(X_test)
linear_reg = LinearRegression().fit(X_poly,y_train)


In [21]:
ypred_Poly = linear_reg.predict(X_poly_test)

In [22]:
#R-squared and rms value for Polynomial
print('R-squared train score: {:.2f}'.format(linear_reg.score(X_poly,y_train)))
print('R-squared test score: {:.2f}'.format(linear_reg.score(X_poly_test,y_test)))
print('Root mean squared error(Polynomial): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_Poly))))


R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Polynomial): 10.13


In [23]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = linear_reg, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.9026761281280986

In [24]:
#Fitting SVR
from sklearn.svm import SVR
regressor_svr = SVR(C = 800, kernel = 'rbf',gamma = 0.1).fit(X_train,y_train)
ypred_SVR = regressor_svr.predict(X_test)

In [25]:
#R-squared and rms value for SVR
print('R-squared train score: {:.2f}'.format(regressor_svr.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regressor_svr.score(X_test,y_test)))
print('Root mean squared error(SVR): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_SVR))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(SVR): 10.18


In [26]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = regressor_svr, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.9018942939528343

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = [{'C':[1,10,100,1000],'kernel':['linear']},
              {'C':[1,10,100,1000], 'kernel':['rbf'],'gamma': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
               'epsilon':[0.1,0.2,0.3,0.4,0.5]}]
grid_search = GridSearchCV(estimator = regressor_svr,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

In [37]:
#KNN Regression
from sklearn.neighbors import KNeighborsRegressor
regression_KNN = KNeighborsRegressor(n_neighbors = 10).fit(X_train,y_train)
ypred_KNN = regression_KNN.predict(X_test)


In [38]:
#R-squared and rms value for KNN Regression
print('R-squared train score: {:.2f}'.format(regression_KNN.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regression_KNN.score(X_test,y_test)))
print('Root mean squared error(KNN): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_KNN))))

R-squared train score: 0.91
R-squared test score: 0.89
Root mean squared error(KNN): 10.74


In [39]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = regression_KNN, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.8922922226706385

In [31]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'n_neighbors':range(1,31)}
grid_search = GridSearchCV(estimator = regression_KNN,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_r2 = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_r2)
print(best_parameters)

0.8987839761793451
{'n_neighbors': 29}


In [41]:
#Decision Trees
from sklearn.tree import DecisionTreeRegressor
regressor_DT = DecisionTreeRegressor(max_depth = 5).fit(X_train,y_train)
ypred_DT = regressor_DT.predict(X_test)


In [42]:
#R-squared and rms value for Decision Tree
print('R-squared train score: {:.2f}'.format(regressor_DT.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regressor_DT.score(X_test,y_test)))
print('Root mean squared error(Decision Tree): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_DT))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Decision Tree): 10.36


In [43]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = regressor_DT, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.8996097364176882

In [44]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':range(1,11)}
grid_search = GridSearchCV(estimator = regressor_DT,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.8996097364176882
{'max_depth': 5}


In [49]:
#Fitting Random Forest
from sklearn.ensemble import RandomForestRegressor
regressor_RF = RandomForestRegressor(n_estimators = 9,max_depth = 5).fit(X_train,y_train)
ypred_RF = regressor_RF.predict(X_test)

In [50]:
#R-squared and rms value for Random Forest
print('R-squared train score: {:.2f}'.format(regressor_RF.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regressor_RF.score(X_test,y_test)))
print('Root mean squared error(Random Forest): {:.2f}'.format(np.sqrt(mean_squared_error(y_test, ypred_RF))))

R-squared train score: 0.91
R-squared test score: 0.90
Root mean squared error(Random Forest): 10.29


In [51]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = regressor_RF, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.9014197135027266

In [48]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':range(1,11),'max_depth' : range(1,11)}
grid_search = GridSearchCV(estimator = regressor_RF,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.9014034228604991
{'max_depth': 5, 'n_estimators': 9}


In [58]:
#Fitting Extra Trees Regression
from sklearn.ensemble import ExtraTreesRegressor
regressor_ETR = ExtraTreesRegressor(n_estimators = 350,max_depth = 6).fit(X_train,y_train)
ypred_ETR = regressor_ETR.predict(X_test)

In [59]:
#R-squared and rms value for Extra Tree
print('R-squared train score: {:.2f}'.format(regressor_ETR.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(regressor_ETR.score(X_test,y_test)))
print('Root mean squared error(Extra Tree): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_ETR))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Extra Tree): 10.40


In [60]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = regressor_ETR, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.8986407203385008

In [55]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[150,200,250,300,350,400],'max_depth' : [4,5,6,7,8]}
grid_search = GridSearchCV(estimator = regressor_ETR,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.9016462400309418
{'max_depth': 8, 'n_estimators': 350}


In [78]:
#Ridge Regression
from sklearn.linear_model import Ridge
linear_Ridge = Ridge(alpha = 1).fit(X_train,y_train)
ypred_Ridge = linear_Ridge.predict(X_test)

In [79]:
#R-squared and rms value for Ridge Regression
print('R-squared train score: {:.2f}'.format(linear_Ridge.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(linear_Ridge.score(X_test,y_test)))
print('Root mean squared error(Ridge): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_Ridge))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Ridge): 10.12


In [80]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = linear_Ridge, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.9026762005024486

In [77]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'alpha':range(1,6)}
grid_search = GridSearchCV(estimator = linear_Ridge,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.9026762005024485
{'alpha': 1}


In [81]:
#Lasso Regression
from sklearn.linear_model import Lasso
linear_Lasso = Lasso(alpha = 1).fit(X_train,y_train)
ypred_Lasso = linear_Lasso.predict(X_test)

In [82]:
#R-squared and rms value for Lasso Regression
print('R-squared train score: {:.2f}'.format(linear_Lasso.score(X_train,y_train)))
print('R-squared test score: {:.2f}'.format(linear_Lasso.score(X_test,y_test)))
print('Root mean squared error(Lasso): {:.2f}'.format(np.sqrt(mean_squared_error(y_test,ypred_Lasso))))

R-squared train score: 0.90
R-squared test score: 0.90
Root mean squared error(Lasso): 10.43


In [83]:
#Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
r_square = cross_val_score(estimator = linear_Lasso, X = X_train, y = y_train, cv = 10)
r_square.mean()

0.8964184600301351

In [76]:
#Grid Search
from sklearn.model_selection import GridSearchCV
parameters = {'alpha':range(1,6)}
grid_search = GridSearchCV(estimator = linear_Lasso,
                           param_grid = parameters,
                           scoring = 'r2',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.896418460030135
{'alpha': 1}
