<a href="https://colab.research.google.com/github/Eyanye/Eyanye/blob/main/LR%2BRFR%2BSVR%2B3datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Importing dataset and examining it
dataset = pd.read_csv("/content/CarResale.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
dataset.info() # info() writes its report to stdout and returns None; print(dataset.info()) adds a stray "None"
print(dataset.describe())

# Converting Categorical features into Numerical features
# Each entry maps a column name to the {category label: numeric code} table applied to it.
category_codes = {
    'fuel': {'Diesel': 1, 'Petrol': 0},
    'seller_type': {'Dealer': 1, 'Individual': 0},
    'transmission': {'Automatic': 1, 'Manual': 0},
    'owner': {'Fourth & Above Owner': 4, 'Third Owner': 3, 'Second Owner': 2, 'First Owner': 1, 'Test Drive Car': 0},
}
for column, codes in category_codes.items():
    # Series.map() turns any label missing from the table into NaN without warning,
    # so fail loudly here instead of letting NaNs reach the model later.
    unmapped = set(dataset[column].dropna().unique()) - set(codes)
    if unmapped:
        raise ValueError("Column '%s' has labels with no numeric code: %s" % (column, sorted(map(str, unmapped))))
    dataset[column] = dataset[column].map(codes)
dataset.info()

# Plotting Correlation Heatmap
# Pairwise correlations of all dataset columns, drawn as an annotated heatmap
# whose cell labels are the coefficients rounded to two decimals.
corrs = dataset.corr()
heatmap_kwargs = dict(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True,
)
figure = ff.create_annotated_heatmap(**heatmap_kwargs)
figure.show()

# Dividing dataset into label and feature sets
# The label column 'selling_price' must not remain among the features: leaving it in lets the
# model predict the target from itself (r2 ~ 1.0, every other coefficient shrunk to 0).
# 'engine_size' stays excluded as before (reason not visible here; presumably collinearity - confirm).
X = dataset.drop(['engine_size', 'selling_price'], axis = 1) # Features
Y = dataset['selling_price'] # Labels
print(type(X))
print(type(Y))
print(X.shape)
print(Y.shape)

# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Linear Regression with Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the
# regularization parameter alpha and the elastic-net mixing ratio 'l1_ratio', using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000],'alpha': [.001, .01, .1, 1,10, 100], 'l1_ratio': [0,0.25,0.5,0.75,1]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in the training folds, p = number of features.
# Both are derived from the data actually fitted instead of being hard-coded, so the
# figure stays correct when the feature set or the dataset changes.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

# Coefficients are paired with their column names and listed from largest to smallest
print(pd.DataFrame(zip(X.columns, best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

   year  km_driven    fuel seller_type transmission         owner  seats  \
0  2014     145500  Diesel  Individual       Manual   First Owner      5   
1  2014     120000  Diesel  Individual       Manual  Second Owner      5   
2  2006     140000  Petrol  Individual       Manual   Third Owner      5   
3  2010     127000  Diesel  Individual       Manual   First Owner      5   
4  2007     120000  Petrol  Individual       Manual   First Owner      5   

   mileage  engine_size  brake_horsepower  selling_price  
0    23.40         1248             74.00         4500.0  
1    21.14         1498            103.52         3700.0  
2    17.70         1497             78.00         1580.0  
3    23.00         1396             90.00         2250.0  
4    16.10         1298             88.20         1300.0  
(7800, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7800 entries, 0 to 7799
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            -

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(7800, 10)
(7800,)
Best parameters:  {'alpha': 0.001, 'eta0': 0.1, 'l1_ratio': 1, 'max_iter': 10000}
r2:  0.9999999999999922
Adjusted r2:  0.9999999999999922
Intercept:  [6538.87985927]
           Features  Coefficients
9     selling_price   8170.038203
0              year      0.000000
1         km_driven      0.000000
2              fuel      0.000000
3       seller_type      0.000000
4      transmission      0.000000
5             owner      0.000000
6             seats      0.000000
7           mileage      0.000000
8  brake_horsepower      0.000000


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Importing dataset and examining it
dataset = pd.read_csv("/content/LimitPrediction.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
dataset.info() # info() writes its report to stdout and returns None; print(dataset.info()) adds a stray "None"
print(dataset.describe())

# Converting Categorical features into Numerical features
# Labels are stripped of surrounding whitespace before mapping, so both ' Male' (the padded
# form the original mapping keyed on) and 'Male' resolve to the same code instead of NaN.
dataset['Gender'] = dataset['Gender'].str.strip().map({'Male': 1, 'Female':0})
dataset['Student'] = dataset['Student'].str.strip().map({'Yes': 1, 'No':0})
dataset['Married'] = dataset['Married'].str.strip().map({'Yes': 1, 'No':0})
dataset.info()

# One-hot encoding the nominal feature; drop_first=True drops one dummy level per feature
categorical_features = ['Ethnicity']
final_data = pd.get_dummies(dataset, columns = categorical_features, drop_first= True)
final_data.info()
print(final_data.head(2))

# Plotting Correlation Heatmap
# 'dataset' still holds the string column 'Ethnicity'. DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions dropped them silently), so restrict the frame to
# numeric columns explicitly; this reproduces the old behaviour on every pandas version.
corrs = dataset.select_dtypes(include='number').corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.show()

# Dividing dataset into label and feature sets
# Features: every encoded column except the row identifier and the label itself.
excluded_columns = ['Customer Id', 'Limit']
X = final_data.drop(columns=excluded_columns) # Features
Y = dataset['Limit'] # Labels
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit(X).transform(X)

# Linear Regression with Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the
# regularization parameter alpha and the elastic-net mixing ratio 'l1_ratio', using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000],'alpha': [.001, .01, .1, 1,10, 100], 'l1_ratio': [0.25,0.5,0.75]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# Both are derived from the fitted matrix rather than hard-coded so they cannot drift
# out of sync with the data.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

# Coefficients are paired with their column names and listed from largest to smallest
print(pd.DataFrame(zip(X.columns, best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

##################################################################################
# Implementing Random Forest Regression
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [10,20,30,40,50,100]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# Both are read from the fitted matrix instead of being typed in by hand.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

featimp = pd.Series(gd_sr.best_estimator_.feature_importances_, index=list(X)).sort_values(ascending=False) # Getting feature importances list for the best model
print(featimp)

# Selecting features with higher significance and redefining feature set
X_ = dataset[['Rating', 'Balance', 'Income']]

feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [50,100,150,200,250]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# p must be the size of the reduced feature set fitted here (3 columns), not the 11
# of the full feature matrix, so it is taken from X_scaled_ itself.
n_train = X_scaled_.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled_.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

   Customer Id   Income  Limit  Rating  Cards  Age  Education  Gender Student  \
0            1   14.891   3606     283      2   34         11    Male      No   
1            2  106.025   6645     483      3   82         15  Female     Yes   
2            3  104.593   7075     514      4   71         11    Male      No   
3            4  148.924   9504     681      3   36         11  Female      No   
4            5   55.882   4897     357      2   68         16    Male      No   

  Married  Ethnicity  Balance  
0     Yes  Caucasian      333  
1     Yes      Asian      903  
2      No      Asian      580  
3      No      Asian      964  
4     Yes  Caucasian      331  
(400, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Customer Id  400 non-null    int64  
 1   Income       400 non-null    float64
 2   Limit        400 non-null    int6

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(400, 11)
(400,)



Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached 

Best parameters:  {'alpha': 0.001, 'eta0': 0.001, 'l1_ratio': 0.75, 'max_iter': 30000}
r2:  0.9955842505322294
Adjusted r2:  0.9954265451940948
Intercept:  [4735.59983687]
               Features  Coefficients
1                Rating   1979.353648
8               Balance    237.263745
0                Income    149.948179
4             Education     12.374549
9       Ethnicity_Asian      6.310812
3                   Age      3.008162
10  Ethnicity_Caucasian     -1.342903
5                Gender     -2.709109
7               Married    -12.963681
6               Student    -79.361698
2                 Cards   -101.222528
Best parameters:  {'n_estimators': 100}
r2:  0.976078072899379
Adjusted r2:  0.975223718360071
Rating                 0.489218
Balance                0.243054
Income                 0.200662
Age                    0.028237
Education              0.012596
Cards                  0.011347
Student                0.004760
Ethnicity_Caucasian    0.002867
Gender               

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Importing dataset and examining it
dataset = pd.read_csv("/content/CompletionRates.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
dataset.info() # info() writes its report to stdout and returns None; print(dataset.info()) adds a stray "None"
print(dataset.describe())

# Converting Categorical features into Numerical features
# 'INST_TYPE' is treated as a nominal category and one-hot encoded;
# drop_first=True drops one dummy level.
categorical_features = ['INST_TYPE']
final_data = pd.get_dummies(dataset, columns = categorical_features, drop_first= True)
final_data.info()
print(final_data.head(2))

# Plotting Correlation Heatmap
# Correlation matrix of the raw dataset columns, rendered with each cell
# annotated by its coefficient rounded to two decimals.
corrs = dataset.corr()
axis_x = list(corrs.columns)
axis_y = list(corrs.index)
cell_labels = corrs.round(2).values
figure = ff.create_annotated_heatmap(z=corrs.values, x=axis_x, y=axis_y, annotation_text=cell_labels, showscale=True)
figure.show()

# Dividing dataset into label and feature sets
# Features are taken from 'final_data' (the one-hot encoded frame) rather than 'dataset':
# building X from 'dataset' left the INST_TYPE dummies unused and fed the category code to
# the models as if it were an ordinal number.
# NOTE(review): the remaining drops are kept exactly as before; the reason for excluding
# each column is not visible here.
X = final_data.drop(['AVG_FAC_SAL_PM','TS_MEN','TS_MAR','ST_FI_LO','ST_FI_M1','ST_FI_M2','ST_FI_H1','ST_FI_H2','HECR', 'HL_ED_P_PS'], axis = 1) # Features
Y = dataset['HECR'] # Labels
print(type(X))
print(type(Y))
print(X.shape)
print(Y.shape)

# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Linear Regression with Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the
# regularization parameter alpha and the elastic-net mixing ratio 'l1_ratio', using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[10000, 20000, 30000, 40000],'alpha': [.001, .01, .1, 1,10, 100], 'l1_ratio': [0,0.25,0.5,0.75,1]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# p was hard-coded as 11 while the fitted matrix had a different column count, so both
# n and p are now read from X_scaled.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

# Coefficients are paired with their column names and listed from largest to smallest
print(pd.DataFrame(zip(X.columns, best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

##################################################################################
# Implementing Random Forest Regression
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [10,20,30,40,50,100]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# Both are read from the fitted matrix instead of being typed in by hand.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

featimp = pd.Series(gd_sr.best_estimator_.feature_importances_, index=list(X)).sort_values(ascending=False) # Getting feature importances list for the best model
print(featimp)

# Selecting features with higher significance and redefining feature set
# The dataset has no 'S_DPEN' column (selecting it raises KeyError); the matching column
# in the data is 'TS_DPEN'.
# NOTE(review): assumes the intended pair was 'TS' and 'TS_DPEN' - confirm against the
# feature-importance ranking printed above.
X_ = dataset[['HL_ED_P_HS', 'PER_PT', 'INST_EXP_PS', 'TS', 'TS_DPEN']]

feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
cv_folds = 5 # number of cross-validation folds used by the grid search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [10,20,30,50,100]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# p must be the size of the reduced feature set fitted here (5 columns), not the 11
# that was hard-coded, so it is taken from X_scaled_ itself.
n_train = X_scaled_.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled_.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)
####################################################################################
# Implementing Support Vector Regression
# Tuning the SVR parameters 'kernel' and 'C' and implementing cross-validation using Grid Search
# ('epsilon' is not in the grid, so SVR's default value is used for it)
cv_folds = 5 # number of cross-validation folds used by the grid search
svr = SVR()
grid_param = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [.001,.01, 0.1, 1, 10,100]}

gd_sr = GridSearchCV(estimator=svr, param_grid=grid_param, scoring='r2', cv=cv_folds)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("r2: ", best_result)

# Adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
# where n = number of observations in training data, p = number of features.
# Both are read from the fitted matrix instead of being typed in by hand.
n_train = X_scaled.shape[0] * (cv_folds - 1) // cv_folds # rows in the training part of each CV split
n_features = X_scaled.shape[1]
Adj_r2 = 1 - (1 - best_result) * (n_train - 1) / (n_train - n_features - 1)
print("Adjusted r2: ", Adj_r2)

   HIGH_DEG  INST_TYPE  INST_EXP_PS  AVG_FAC_SAL_PM     TS  TS_MEN  TS_WOM  \
0         4          1         3212            4587   3903  0.4766  0.5234   
1         4          1        10418            6398   9562  0.4195  0.5805   
2         4          1         5784            5859   4165  0.4898  0.5102   
3         4          1         5824            6109  14245  0.4795  0.5205   
4         4          1         4622            6102  18520  0.5212  0.4788   

     TS_MAR   TS_DPEN    TS_VET  PER_PT  ST_FI_LO  ST_FI_M1  ST_FI_M2  \
0  0.056698  0.821877  0.018063  0.0699  0.504265  0.172604  0.136979   
1  0.149349  0.590745  0.039623  0.3095  0.490163  0.155722  0.144638   
2  0.221490  0.551088  0.048780  0.3160  0.440343  0.156229  0.154252   
3  0.065476  0.815247  0.024038  0.1046  0.331731  0.133242  0.156364   
4  0.061376  0.825645  0.025997  0.0821  0.272478  0.126271  0.150899   

   ST_FI_H1  ST_FI_H2  HL_ED_P_MS  HL_ED_P_HS  HL_ED_P_PS    HECR  
0  0.110386  0.075765   

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(14300, 10)
(14300,)
