<a href="https://colab.research.google.com/github/Andrea227/dailyexercise/blob/master/Project_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. First regression model: Random forest regression

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV

In [0]:
# Load the admissions dataset from the project's GitHub repository into a DataFrame.
DATA_URL = 'https://raw.githubusercontent.com/Andrea227/dailyexercise/master/Admission_Predict_Ver1.1.csv'
data = pd.read_csv(DATA_URL)

In [4]:
# Preview the first rows to check the column layout before selecting columns by position.
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [0]:
# Build the feature matrix and the target vector by column position.
# Assuming the first column shown in the preview is the DataFrame index,
# positions 1-6 are GRE Score .. CGPA and position 8 is Chance of Admit.
# NOTE(review): the slice stops before position 7 (Research), so that column
# is not used as a feature -- confirm the omission is intentional.
feature_columns = slice(1, 7)
target_column = 8
X = data.iloc[:, feature_columns].values
y = data.iloc[:, target_column].values

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [0]:
# Preprocessing: standardise the features.
# The scaler is fitted on the training split only, so the test split does not
# influence the scaling statistics; the fitted transformation is then applied
# to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
# Search space for the random-forest randomized search.
# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=10)))
# Number of features to consider at every split.
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree: 10 evenly spaced values from 1 to 100, plus None.
max_depth = list(map(int, np.linspace(1, 100, num=10))) + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]
# Assemble the grid; keyword order fixes the key order of the dictionary.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [9]:
# Show the assembled search space as a sanity check.
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 12, 23, 34, 45, 56, 67, 78, 89, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [27]:
# Search for good random-forest hyper-parameters with a randomized search.
# Each sampled combination from random_grid is scored with 10-fold
# cross-validation on the training split.
# Both the forest and the search are seeded so that re-running the cell
# samples the same candidates and reports the same best parameters.
rf = RandomForestRegressor(random_state=50)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    cv=10,
    random_state=50,
    n_jobs=-1,  # evaluate candidates in parallel; does not change the result
)
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)

{'n_estimators': 1400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}


In [10]:
# Fit the final random forest with the hyper-parameters reported by the
# randomized search; criterion and random_state are chosen by hand.
# NOTE(review): recent scikit-learn releases spell this criterion
# 'absolute_error' -- confirm against the installed version.
rf_settings = dict(
    n_estimators=1400,
    criterion="mae",
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=50,
    bootstrap=True,
    min_samples_split=10,
)
ranforreg = RandomForestRegressor(**rf_settings)
ranforreg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=1400, n_jobs=None, oob_score=False,
                      random_state=50, verbose=0, warm_start=False)

In [11]:
# Score the fitted forest on the held-out split: R^2 of predictions vs. targets.
predictions = ranforreg.predict(X_test)
rf_r2 = r2_score(y_test, predictions)
print(rf_r2)

0.8354641026930251


In [0]:
def evaluate(model, X, y):
    """Print a short performance report for a fitted regressor and return its accuracy.

    The "accuracy" is ``100 - MAPE``, where MAPE is the mean absolute
    percentage error of ``model.predict(X)`` measured against ``y``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature rows, passed unchanged to ``model.predict``.
    y : array-like
        True target values, one per prediction. A target equal to zero makes
        the percentage error undefined (division by zero), so the result is
        only meaningful for non-zero targets.

    Returns
    -------
    float
        ``100 - 100 * mean(|prediction - y| / y)``.
    """
    # Coerce both sides to float arrays so plain lists are accepted as well.
    predictions = np.asarray(model.predict(X), dtype=float)
    targets = np.asarray(y, dtype=float)
    error = np.abs(predictions - targets)
    meanerror = 100 * np.mean(error / targets)
    accuracy = 100 - meanerror
    print('Model Performance')
    # The error is reported in the target's own units, without a unit label.
    print('Average Error: {:0.4f}.'.format(np.mean(error)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [13]:
# Percentage accuracy of the random forest on the held-out split.
rf_accuracy = evaluate(ranforreg, X_test, y_test)
rf_accuracy

Model Performance
Average Error: 0.0413 degrees.
Accuracy = 93.53%.


93.53062711937129

# 2. Second model: Linear regression

In [0]:
from sklearn.linear_model import LinearRegression

In [15]:
# Linear regression trained and scored on the same split as the forest.
linear = LinearRegression().fit(X_train, y_train)
prediction2 = linear.predict(X_test)
linear_r2 = r2_score(y_test, prediction2)
print(linear_r2)

0.8348234813669918


In [16]:
# Percentage accuracy of the linear model on the held-out split.
linear_accuracy = evaluate(linear, X_test, y_test)
linear_accuracy

Model Performance
Average Error: 0.0416 degrees.
Accuracy = 93.55%.


93.55157938480586

# 3. Third model: Decision tree regressor

In [0]:
from sklearn.tree import DecisionTreeRegressor

In [0]:
# Base estimator for the decision-tree hyper-parameter search.
# Seeded (same seed as the random forest) so repeated fits give the same tree.
dtree = DecisionTreeRegressor(random_state=50)

In [0]:
# Search space for the decision-tree randomized search.
# NOTE(review): recent scikit-learn releases renamed some of these option
# strings ('mse', 'mae', 'auto') -- confirm against the installed version.
# Split-quality criterion.
criterion = ['mse', 'friedman_mse', 'mae']
# Strategy used to choose the split at each node.
splitter = ["best", "random"]
# Number of features to consider at every split.
max_features = ['auto', 'sqrt']
# Maximum number of levels in the tree: 10 evenly spaced values from 1 to 100, plus None.
max_depth = list(map(int, np.linspace(1, 100, num=10))) + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 20, 30, 40]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4, 6, 8, 10]
# Minimum weighted fraction of samples required at a leaf.
min_weight_fraction_leaf = [0, 0.1, 0.2, 0.3, 0.4]
# Assemble the grid; keyword order fixes the key order of the dictionary.
random_grid = dict(
    criterion=criterion,
    splitter=splitter,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    min_weight_fraction_leaf=min_weight_fraction_leaf,
)

In [51]:
# Show the assembled decision-tree search space before running the search.
print(random_grid)

{'criterion': ['mse', 'friedman_mse', 'mae'], 'splitter': ['best', 'random'], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 12, 23, 34, 45, 56, 67, 78, 89, 100, None], 'min_samples_split': [2, 5, 10, 20, 30, 40], 'min_samples_leaf': [1, 2, 4, 6, 8, 10], 'min_weight_fraction_leaf': [0, 0.1, 0.2, 0.3, 0.4]}


In [60]:
# Randomized search over the decision-tree grid, scoring each sampled
# combination with 10-fold cross-validation on the training split.
# Seeding the search makes the sampled candidates repeatable across runs.
dtree_random = RandomizedSearchCV(
    estimator=dtree,
    param_distributions=random_grid,
    cv=10,
    random_state=50,
    n_jobs=-1,  # evaluate candidates in parallel; does not change the result
)
dtree_random.fit(X_train, y_train)
print(dtree_random.best_params_)

{'splitter': 'best', 'min_weight_fraction_leaf': 0, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': 'auto', 'max_depth': 23, 'criterion': 'mae'}


In [72]:
# Fit the final decision tree with the hyper-parameters reported by the
# randomized search, then score it on the held-out split.
# random_state is fixed (same seed as the random forest) so the fitted tree,
# and therefore the reported scores, are repeatable.
# NOTE(review): recent scikit-learn releases renamed 'mae' and 'auto' --
# confirm against the installed version.
dt = DecisionTreeRegressor(
    criterion="mae",
    splitter="best",
    min_samples_split=10,
    min_samples_leaf=8,
    max_depth=23,
    min_weight_fraction_leaf=0,
    max_features="auto",
    random_state=50,
)
dt.fit(X_train, y_train)
prediction3 = dt.predict(X_test)
print(r2_score(y_test, prediction3))

0.7848506888497766


In [62]:
# Percentage accuracy of the decision tree on the held-out split.
dt_accuracy = evaluate(dt, X_test, y_test)
dt_accuracy

Model Performance
Average Error: 0.0486 degrees.
Accuracy = 92.37%.


92.37217241615241

# 4. KNN Regressor

In [0]:
from sklearn.neighbors import KNeighborsRegressor

In [34]:
# k-nearest-neighbours regressor using 8 neighbours, trained and scored on
# the same split as the other models.
KNN = KNeighborsRegressor(n_neighbors=8).fit(X_train, y_train)
prediction4 = KNN.predict(X_test)
knn_r2 = r2_score(y_test, prediction4)
print(knn_r2)

0.8342796309656427


In [35]:
# Percentage accuracy of the KNN regressor on the held-out split.
knn_accuracy = evaluate(KNN, X_test, y_test)
knn_accuracy

Model Performance
Average Error: 0.0404 degrees.
Accuracy = 93.68%.


93.67689932611805

# 5. 5-fold cross-validation

In [40]:
# 5-fold cross-validation of the random forest on the training split.
results = cross_validate(ranforreg, X_train, y_train, cv=5)
print(results['test_score'])
# Average over the folds actually returned rather than a hard-coded count,
# so the figure stays correct if cv is changed.
print("The performance of random forest is ", results['test_score'].mean())

[0.7290652  0.77147667 0.80632913 0.78278231 0.8465958 ]
The performance of random forest is  0.7872498229559046


In [42]:
# 5-fold cross-validation of the linear model on the training split.
results = cross_validate(linear, X_train, y_train, cv=5)
print(results['test_score'])
# Average over the folds actually returned rather than a hard-coded count,
# so the figure stays correct if cv is changed.
print("The performance of linear is ", results['test_score'].mean())

[0.73235985 0.77791217 0.82394596 0.80360678 0.84581212]
The performance of linear is  0.7967273744699732


In [63]:
# 5-fold cross-validation of the decision tree on the training split.
results = cross_validate(dt, X_train, y_train, cv=5)
print(results['test_score'])
# Average over the folds actually returned rather than a hard-coded count,
# so the figure stays correct if cv is changed.
print("The performance of decision tree is ", results['test_score'].mean())

[0.66277333 0.72169454 0.72540551 0.69102248 0.77700856]
The performance of decision tree is  0.7155808851606589


In [46]:
# 5-fold cross-validation of the KNN regressor on the training split.
results = cross_validate(KNN, X_train, y_train, cv=5)
print(results['test_score'])
# Average over the folds actually returned rather than a hard-coded count,
# so the figure stays correct if cv is changed.
print("The performance of KNN is ", results['test_score'].mean())

[0.71481011 0.75665702 0.7903297  0.77894053 0.76744171]
The performance of KNN is  0.7616358154048867


In the final evaluation, we use 5-fold cross-validation to check the performance of each model. From the results, we can see that the linear regression model has the best cross-validated fit score of the four models; on the held-out test set, all models except the decision tree reach a similar accuracy of about 93.5%.

For both the random forest and the decision tree models, I used a randomized search to find good hyperparameters and set them accordingly to improve the performance of each model.

In addition to using the R² score to assess how well each model's predictions fit the targets, I also use a custom evaluation function that reports accuracy as 100% minus the mean absolute percentage error between the predictions and the targets.