<a href="https://colab.research.google.com/github/Amarmurun0212/Diver/blob/main/EnsembleLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import random
import math

Preparing the dataset

In [14]:
df = pd.read_csv("train.csv")
X = df[["GrLivArea", "YearBuilt"]].values
y = df["SalePrice"].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
print("linear regression MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))

linear regression MSE : 2495554898.6683216


In [19]:
svm = SVR()
svm.fit(X_train, y_train)
svm_y_pred = svm.predict(X_test)
print("SVM MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))

SVM MSE : 7844111028.863974


In [21]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_y_pred = tree.predict(X_test)
print("decision tree MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))

decision tree MSE : 2077147133.3565447


**[Problem 1] Scratch implementation of blending**

In [22]:
def blend(X_train, X_test, y_train, model1, model2):
    model1.fit(X_train, y_train)
    model1_pred = model1.predict(X_test)
    
    model2.fit(X_train, y_train)
    model2_pred = model2.predict(X_test)
    y_pred = (model1_pred + model2_pred) / 2
    return y_pred

In [23]:
y_pred = blend(X_train, X_test, y_train, lr, tree)
print("   linear regression     MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("      SVM       MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("    decision tree      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("linear regression + decision tree MSE : {}".format(mean_squared_error(y_test, y_pred)))

   linear regression     MSE : 2495554898.6683216
      SVM       MSE : 7844111028.863974
    decision tree      MSE : 2077147133.3565447
linear regression + decision tree MSE : 1845137211.190017


In [24]:
y_pred = blend(X_train, X_test, y_train, svm, tree)
print("linear regression   MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("SVM        MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("decision tree    MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("SVM+decision tree MSE : {}".format(mean_squared_error(y_test, y_pred)))

linear regression   MSE : 2495554898.6683216
SVM        MSE : 7844111028.863974
decision tree    MSE : 2077147133.3565447
SVM+decision tree MSE : 3384718539.983163


In [25]:
model1 = SVR(gamma='scale', kernel='linear')
model2 = SVR(gamma='scale') 
y_pred = blend(X_train, X_test, y_train, model1, model2)
print("   linear regression     MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("      SVM       MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("    decision tree      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("SVM linear+polynomial MSE : {}".format(mean_squared_error(y_test, y_pred)))

   linear regression     MSE : 2495554898.6683216
      SVM       MSE : 7844111028.863974
    decision tree      MSE : 2077147133.3565447
SVM linear+polynomial MSE : 4450951532.463163


**[Problem 2] Scratch implementation of bagging**

In [26]:
def bagging(X_train, X_test, y_train, model, n=2):
    y_pred = np.zeros(X_test.shape[0])
    for i in range(n):
        X_divided = train_test_split(X_train, random_state=random.randint(0, i))[0]
        y_divided = train_test_split(X_train, random_state=random.randint(0, i))[0]
        model.fit(X_train, y_train)
        y_pred += model.predict(X_test)
    y_pred = y_pred / n
    return y_pred

y_pred = bagging(X_train, X_test, y_train, tree, n=100)

print("    決定木      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("bagging 決定木  MSE : {}".format(mean_squared_error(y_test, y_pred)))

    決定木      MSE : 2077147133.3565447
bagging 決定木  MSE : 2153320529.8075447


**[Problem 3] Scratch implementation of stacking**

In [27]:
class Stacking():
    def __init__(self, split_n=3, model_n=2):
        self.split_n = split_n
        self.model_n = model_n

    def fit(self, X_train, y_train, X_test, y_test, models):
        # K個に分割するdividerを作る
        divider = np.zeros(self.split_n)
        vol = X_train.shape[0]
        num = self.split_n
        for i in range(self.split_n):
            divider[i] = math.ceil(vol/num)
            num -= 1
            vol = vol-divider[i]
        
        self.divider = divider.astype(int)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.models = models
        print(self.divider)
    
    def predict(self, X_test):
        for m in range(self.model_n):
            divide_point = 0
            for n in range(self.split_n):
                idx = np.zeros(X_train.shape[0], dtype=bool)
                idx[divide_point:divide_point+self.divider[n]]= True
                self.X_test_divided = X_train[idx, :]
                self.X_train_divided = X_train[~idx, :]
                self.y_test_divided = y_train[idx]
                self.y_train_divided = y_train[~idx]                    
                
                models[m].fit(self.X_train_divided, self.y_train_divided)
                if n == 0:
                    blend = models[m].predict(self.X_test_divided)
                    pred_data = models[m].predict(X_test)
                else:
                    blend = np.r_[blend, models[m].predict(self.X_test_divided)]
                    pred_data = np.c_[pred_data, models[m].predict(X_test)]
            
                divide_point += self.divider[n]
            if m ==0:
                blend_data =blend.reshape(-1, 1)
                blend_pred_data = np.mean(pred_data, axis=1)
            else:
                blend_data = np.c_[blend_data, blend.reshape(-1, 1) ]
                blend_pred_data = np.c_[blend_pred_data, np.mean(pred_data, axis=1)]
        
        models[0].fit(blend_data, y_train)
        y_pred = models[0].predict(blend_pred_data)                               
        return y_pred

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
models = [LinearRegression(), DecisionTreeRegressor()]

stacking = Stacking()
stacking.fit(X_train, y_train, X_test, y_test, models)
y_pred = stacking.predict(X_test)

print("linear regression MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("SVM      MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("decision tree   MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("Stacking MSE : {}".format(mean_squared_error(y_test, y_pred)))

[390 389 389]
linear regression MSE : 2495554898.6683216
SVM      MSE : 7844111028.863974
decision tree   MSE : 2077147133.3565447
Stacking MSE : 1955609479.2295792
