### We will use already processed data to predict interest rate by stacking and blending.
* In this example we divide the training data into 20 chunks; the number of chunks ( N ) can be any value.
* 1st Layer Models : Linear Regression, Decision Tree, KNN, CatBoost ( any regressor can be used; you only need to import it )
* 2nd Layer Model : CatBoost ( again, this can be any regressor )
* To set hyperparameters, pass them directly to each model's constructor when building the model list
* Finally we will wrap the entire process in a single function

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor, Pool

# Load the pre-processed feature table and the matching target from CSV.
# NOTE(review): these are absolute paths on one specific machine; adjust them
# before running the notebook elsewhere.
x_train = pd.read_csv("C:\\Users\\chakr\\Desktop\\Clean_data\\X_train_reg.csv")
y_train = pd.read_csv("C:\\Users\\chakr\\Desktop\\Clean_data\\y_train_reg.csv")

# Preview the first rows of the feature table.
x_train.head()

Unnamed: 0,ID,Amount.Requested,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,ll_36,lp_10,lp_11,...,fico,Employment.Length_7years,Employment.Length_6years,Employment.Length_1year,Employment.Length_4years,Employment.Length_5years,Employment.Length_3years,Employment.Length_2years,Employment.Length_LT_1year,Employment.Length_10years
0,79542,25000.0,27.56,8606.56,11.0,15210.0,3.0,0.0,0.0,0,...,722,0,0,0,0,1,0,0,0,0
1,75473,19750.0,13.39,6737.5,14.0,19070.0,3.0,0.0,0.0,0,...,712,0,0,0,1,0,0,0,0,0
2,67265,2100.0,3.5,1000.0,13.0,893.0,1.0,1.0,0.0,1,...,692,0,0,0,0,0,0,0,1,0
3,80167,28000.0,19.62,7083.33,12.0,38194.0,1.0,1.0,0.0,0,...,712,0,0,0,0,0,0,0,0,1
4,17240,24250.0,23.79,5833.33,6.0,31061.0,2.0,0.0,0.0,0,...,732,0,0,0,0,0,0,0,0,1


In [2]:
from sklearn.model_selection import train_test_split
# Keep 75% of the rows (x_train1 / y_train1) for building the stack and hold out
# 25% (x_train2 / y_train2) for evaluating it; the fixed seed makes the split repeatable.
x_train1, x_train2, y_train1, y_train2 = train_test_split( x_train, y_train, test_size=0.25, random_state=42)

## Step 1 : Divide the dataset into N parts ( here we use 20 parts )
* Leaving one chunk out at a time gives N training datasets
* The chunk that was left out is the matching test dataset

In [3]:
def get_dataset(x_train, y_train, N=5):
    """Shuffle the training data and build N leave-one-chunk-out splits.

    Features and target are joined column-wise, shuffled with a fixed seed
    (``random_state=1``) and re-indexed from 0.  The shuffled rows are cut
    into ``N`` contiguous chunks of ``len(x_train) // N`` rows each; the last
    chunk additionally receives the remainder.  For chunk ``i`` the held-out
    rows become ``test_data[i]`` and all other rows become ``train_data[i]``
    and ``y_data[i]``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature table.
    y_train : pandas.DataFrame or pandas.Series
        Target column(s).  It is joined to ``x_train`` with
        ``pd.concat(axis=1)``, which aligns rows on the index, so both
        inputs should carry the same index.
    N : int, default 5
        Number of chunks.  If ``N`` exceeds the number of rows, the first
        ``N - 1`` chunks are empty and the last chunk holds every row.

    Returns
    -------
    tuple
        ``(train_data, y_data, test_data, y_shuffled)`` where the first three
        are lists of length ``N`` and ``y_shuffled`` is the complete target
        in shuffled order (the order in which the held-out chunks follow
        one another).

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got {!r}".format(N))

    # A Series contributes one column to the merged frame, a DataFrame all of
    # its columns; counting them keeps every target column out of the features.
    n_targets = y_train.shape[1] if y_train.ndim == 2 else 1

    merged = pd.concat([x_train, y_train], axis=1)
    merged = merged.sample(frac=1, random_state=1).reset_index(drop=True)

    n_features = merged.shape[1] - n_targets
    x_shuffled = merged.iloc[:, :n_features]
    y_shuffled = merged.iloc[:, n_features:]

    n_rows = len(merged)
    chunk_size = n_rows // N
    # Chunk i spans rows bounds[i]:bounds[i + 1]; the last chunk runs to the end.
    bounds = [chunk_size * i for i in range(N)] + [n_rows]

    positions = np.arange(n_rows)
    train_data, y_data, test_data = [], [], []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        # Every row position outside the held-out chunk, in original order.
        keep = np.concatenate([positions[:start], positions[stop:]])
        train_data.append(x_shuffled.iloc[keep, :])
        y_data.append(y_shuffled.iloc[keep, :])
        test_data.append(x_shuffled.iloc[start:stop, :])

    return train_data, y_data, test_data, y_shuffled


In [4]:
# Cut the 75% training partition into 20 leave-one-chunk-out folds.
datasets = get_dataset(x_train1, y_train1, 20)

# get_dataset returns, in this order: the training folds, their targets,
# the held-out chunks and the complete shuffled target.
train_data, y_data, test_data, final_y = datasets

## Step 2 : Define the first layer models and assign a code for each model
### We can assign hyperparameters too inside the model

In [5]:
# First-layer regressors keyed by the column name their predictions will get.
# Hyperparameters go straight into the constructors.
first_layer = {
    'lin_reg': LinearRegression(),
    'dtree_reg': DecisionTreeRegressor(),
    'Knn_reg': KNeighborsRegressor(),
    'cat_reg': CatBoostRegressor(logging_level='Silent'),
}
models = list(first_layer.values())
code = list(first_layer.keys())

## Step 3 : Prediction function for all the models together

In [6]:
def stack(x_train, y_train, x_test, models, code):
    """Fit every first-layer model and collect its predictions for ``x_test``.

    Parameters
    ----------
    x_train, y_train
        Training features and target handed unchanged to each model's ``fit``.
    x_test
        Features handed to each model's ``predict``.
    models : sequence
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``.  They are
        fitted in place, so the caller's objects are left fitted on
        ``x_train`` / ``y_train``.
    code : sequence of str
        Column name for each model, in the same order as ``models``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``code`` holding that model's flattened
        predictions, indexed 0..len(x_test) - 1.

    Raises
    ------
    ValueError
        If ``models`` and ``code`` differ in length.
    """
    if len(models) != len(code):
        raise ValueError(
            "models and code must have the same length, got {} and {}".format(
                len(models), len(code)
            )
        )

    predictions = {}
    for name, reg in zip(code, models):
        reg.fit(x_train, y_train)
        # Some estimators return shape (n, 1) when y is a one-column frame;
        # ravel turns both (n,) and (n, 1) into a flat vector.
        predictions[name] = np.ravel(reg.predict(x_test))

    return pd.DataFrame(predictions)

## Step 4 : Predict for each of the chunk to get the final Data Frame

In [7]:
# Out-of-fold first-layer predictions: for every fold the models are fitted on
# the remaining chunks and predict the held-out chunk.  The per-fold frames are
# gathered first and concatenated once, which avoids re-copying all earlier
# rows on every iteration and avoids starting from an empty object-dtype frame.
fold_predictions = [
    stack(fold_x, fold_y, fold_test, models, code)
    for fold_x, fold_y, fold_test in zip(train_data, y_data, test_data)
]
# ignore_index gives a clean 0..n-1 index instead of repeating each fold's own
# 0..k index; the held-out chunks are consecutive slices of the shuffled data,
# so the rows line up with final_y.
final_df = pd.concat(fold_predictions, ignore_index=True)

In [8]:
# First-layer features for the hold-out split: every model is refitted on the
# whole 75% partition and then predicts x_train2.
final_test = stack(x_train1,y_train1,x_train2,models,code)

In [9]:
final_df.head()

Unnamed: 0,lin_reg,dtree_reg,Knn_reg,cat_reg
0,14.1258,14.27,14.546,14.54909
1,7.050994,8.63,13.024,7.629805
2,8.618487,9.76,15.996,10.058048
3,8.682994,6.92,12.214,6.836347
4,16.81051,15.27,14.796,16.67652


In [10]:
final_test.head()

Unnamed: 0,lin_reg,dtree_reg,Knn_reg,cat_reg
0,17.776893,21.49,16.552,18.716395
1,4.362837,6.62,13.986,5.959875
2,7.189592,7.9,16.352,7.95427
3,12.189007,12.87,9.504,12.234266
4,19.137452,19.72,13.826,20.042419


## Step 5 : Build the second Layer Model

In [11]:
# Second layer: learn the target from the out-of-fold first-layer predictions
# (final_df) against the shuffled target (final_y), then predict the hold-out
# split from its first-layer predictions (final_test).
reg2 = CatBoostRegressor(logging_level ='Silent')
reg2.fit(final_df,final_y)
test_pred = reg2.predict(final_test)

In [12]:
# RMSE of the stacked prediction on the hold-out split.  scikit-learn metrics
# take (y_true, y_pred); the value is the same for MSE, but the conventional
# order stays correct if the metric is later swapped for an asymmetric one.
mean_squared_error(y_train2, test_pred) ** 0.5

1.4491139503764403

## Step 6 : Wrap everything in a function

In [13]:
def stackblend_reg(x_train, y_train, x_test, models, code, N=20, final_layer=None):
    """Fit a two-layer stacked regressor and return its predictions for ``x_test``.

    Steps:

    1. Join ``x_train`` and ``y_train``, shuffle with a fixed seed
       (``random_state=1``) and cut the rows into ``N`` contiguous chunks of
       ``len(x_train) // N`` rows (the last chunk also takes the remainder).
    2. For each chunk, fit every first-layer model on the other chunks and
       predict the held-out chunk.  Stacking these out-of-fold predictions
       gives the training table of the second layer.
    3. Refit every first-layer model on the complete training data and
       predict ``x_test`` to get the second layer's input for the test rows.
    4. Fit ``final_layer`` on the out-of-fold table against the shuffled
       target and predict the test rows.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame or pandas.Series
        Training target.  It is joined to ``x_train`` with
        ``pd.concat(axis=1)``, which aligns on the index, so both should carry
        the same index.  A single target column is expected because each
        model's predictions are flattened into one column.
    x_test : pandas.DataFrame
        Rows to predict.
    models : sequence
        First-layer estimators exposing ``fit`` / ``predict``.  They are fitted
        in place and end up fitted on the complete training data.
    code : sequence of str
        Column name for each first-layer model, same order as ``models``.
    N : int, default 20
        Number of chunks; must satisfy ``2 <= N <= len(x_train)`` so that every
        fold has rows to train on.
    final_layer : estimator, optional
        Second-layer estimator, fitted in place.  When omitted a fresh
        ``LinearRegression()`` is created for this call.

    Returns
    -------
    Whatever ``final_layer.predict`` returns for the test rows.

    Raises
    ------
    ValueError
        If ``models`` and ``code`` differ in length, or ``N`` is outside
        ``2 .. len(x_train)``.
    """
    if len(models) != len(code):
        raise ValueError(
            "models and code must have the same length, got {} and {}".format(
                len(models), len(code)
            )
        )
    if final_layer is None:
        # Created per call so repeated calls never share one fitted estimator.
        final_layer = LinearRegression()

    def first_layer_predictions(fit_x, fit_y, predict_x):
        """Fit each first-layer model and return its predictions column-wise."""
        columns = {}
        for name, reg in zip(code, models):
            reg.fit(fit_x, fit_y)
            # ravel copes with estimators that return shape (n, 1).
            columns[name] = np.ravel(reg.predict(predict_x))
        return pd.DataFrame(columns)

    # A Series contributes one column to the merged frame, a DataFrame all of
    # its columns; counting them keeps target columns out of the features.
    n_targets = y_train.shape[1] if y_train.ndim == 2 else 1
    merged = pd.concat([x_train, y_train], axis=1)
    merged = merged.sample(frac=1, random_state=1).reset_index(drop=True)
    n_features = merged.shape[1] - n_targets
    x_shuffled = merged.iloc[:, :n_features]
    final_y = merged.iloc[:, n_features:]

    n_rows = len(merged)
    if not 2 <= N <= n_rows:
        raise ValueError(
            "N must be between 2 and the number of training rows ({}), got {!r}".format(
                n_rows, N
            )
        )

    chunk_size = n_rows // N
    # Chunk i spans rows bounds[i]:bounds[i + 1]; the last chunk runs to the end.
    bounds = [chunk_size * i for i in range(N)] + [n_rows]
    positions = np.arange(n_rows)

    fold_predictions = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        keep = np.concatenate([positions[:start], positions[stop:]])
        fold_predictions.append(
            first_layer_predictions(
                x_shuffled.iloc[keep, :],
                final_y.iloc[keep, :],
                x_shuffled.iloc[start:stop, :],
            )
        )
    # Chunks are consecutive slices of the shuffled rows, so concatenating them
    # in order lines the out-of-fold predictions up with final_y.
    final_df = pd.concat(fold_predictions, ignore_index=True)

    final_test = first_layer_predictions(x_train, y_train, x_test)

    final_layer.fit(final_df, final_y)
    return final_layer.predict(final_test)

In [14]:
# Run the whole pipeline in one call: four first-layer regressors, 20 chunks,
# and a CatBoost second layer, predicting the hold-out split x_train2.
stack_pred = stackblend_reg(x_train1,y_train1,x_train2,
                            models = [LinearRegression(),
                                      DecisionTreeRegressor(),
                                      KNeighborsRegressor(),
                                      CatBoostRegressor(logging_level ='Silent')],
                            code = ['lin_reg','dtree_reg','Knn_reg','cat_reg'],N=20,
                            final_layer=CatBoostRegressor(logging_level ='Silent'))

In [15]:
# RMSE of the wrapped pipeline on the hold-out split.  scikit-learn metrics
# take (y_true, y_pred); the value is the same for MSE, but the conventional
# order stays correct if the metric is later swapped for an asymmetric one.
mean_squared_error(y_train2, stack_pred) ** 0.5

1.462081225360224