# ⚡️Summary ⚡️

In this notebook we tune our selected models, using the features selected in notebook 3.

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 

import gc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss, mean_squared_error , mean_absolute_error, r2_score
from sklearn.model_selection import KFold,  RepeatedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer

from sklearn.linear_model import Ridge, LinearRegression, HuberRegressor, Lasso,LassoCV,SGDRegressor

import optuna
import shap
import lightgbm as lgb
import catboost as cat
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [None]:
# Suppress every warning so that cell output stays readable.
# NOTE(review): this also hides deprecation / convergence warnings — consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

## Parameters 

In [None]:
target = "price" # Target column to predict; defined once here for quick reference

# Optional feature scaling, applied inside cross-validation to the non-linear models when SCALING is True
SCALING = False
SCALER = QuantileTransformer(output_distribution = "normal") #RobustScaler()

TEST_ON_GENERATED_ONLY = False  # validate only on rows where is_generated == 1
ADD_DATA= True  # when False, only rows with is_generated == 1 are kept
EPOCHS= 2000  # passed to the models as n_estimators / iterations

SYM1_ONLY = True  # drop every "sym*" column except sym_1

MADE_REPLACE = True  # overwrite made == 10000 on two specific training rows (original note: test with different years, submission only)
CITYCODE_ENC = False  # label-encode cityCode (original note: try categorical columns with CatBoost)
OUTLIERS_DROP= False  # IQR-based row removal on the `made` column

NUM_FOLDS = 5  # n_splits of the RepeatedKFold
NUM_SPLITS = 3  # n_repeats of the RepeatedKFold

SHAP_VALS = False  # compute SHAP values for every fold
CALIBRATION = "linear"  # meta-model used for calibration: "linear" or "ridge"

name = 'LinearRegression' # one of: xgboost, lightgbm, catboost, rf, ET, LinearRegression  ---- for LinearRegression turn off TEST_ON_GENERATED_ONLY

In [None]:
# Notebook settings
# Plot style, and silence pandas' chained-assignment (SettingWithCopy) warning.
sns.set_style("darkgrid")
pd.set_option('mode.chained_assignment',None)

## 💾 Load Data 💾

In [None]:
# Train / test feature files and the sample submission; the first CSV column becomes the index of each frame.
# NOTE(review): paths are hard-coded to the Kaggle input directory.
df_train = pd.read_csv("/kaggle/input/symbolic-features/all_features_trn_138323.89 (2).csv", index_col = 0)
df_test = pd.read_csv("/kaggle/input/symbolic-features/all_features_tst_138323.89 (2).csv", index_col = 0)
sub = pd.read_csv("/kaggle/input/playground-series-s3e6/sample_submission.csv",index_col = 0)

In [None]:
if SYM1_ONLY:
    # Keep only sym_1: every other column whose name contains "sym" is removed from both frames.
    drop_sym = [col for col in df_train.columns if col != "sym_1" and "sym" in col]
    df_train = df_train.drop(columns = drop_sym)
    df_test = df_test.drop(columns = drop_sym)

In [None]:
# Without the additional data, keep only rows flagged is_generated == 1 and drop the flag column.
if not ADD_DATA:
    df_train = df_train[df_train['is_generated']==1].drop('is_generated',axis=1)
    df_test =  df_test[df_test['is_generated']==1].drop('is_generated',axis=1)
df_train

## Helper functions 

In [None]:
def base_model(X_trn,y_trn,X_tst,y_tst, name,model_in, X_in, test_in, print_scores = True, y_in = None):
    """Fit ``model_in`` on one train/validation split and score it with RMSE.

    Parameters
    ----------
    X_trn, y_trn
        Training features and target for this split.
    X_tst, y_tst
        Validation features and target. For "lightgbm", "xgboost" and "catboost"
        they are also passed as ``eval_set`` to ``fit``.
    name : str
        Selects the ``fit`` call: "lightgbm", "xgboost" or "catboost"; any other
        value falls back to a plain ``fit(X_trn, y_trn)``.
    model_in
        Estimator exposing ``fit`` / ``predict``. It is fitted in place.
    X_in
        Feature matrix for which the "train" predictions are produced.
    test_in
        Test feature matrix to predict.
    print_scores : bool
        Print the train and validation RMSE when True.
    y_in : optional
        Target aligned with ``X_in``. When omitted, the notebook-level ``y`` is
        used, which is what the function always did before this parameter existed.

    Returns
    -------
    tuple
        ``(test_preds, trn_preds, val_preds, score_trn, score_val, model_in)``.
    """
    if name == "lightgbm":
        model_in.fit(X_trn,y_trn,
              eval_set=[(X_tst,y_tst)],
              callbacks= [lgb.log_evaluation(-1), lgb.early_stopping(30)])

    elif name == "xgboost":
        model_in.fit(X_trn,y_trn,
                     eval_set=[(X_tst,y_tst)],
                     verbose= 0
                    )
    elif name == "catboost":
        model_in.fit(X_trn,y_trn,
                     eval_set=[(X_tst,y_tst)],
                     early_stopping_rounds=30,
                     verbose= 0
                    )
    else:
        model_in.fit(X_trn,y_trn)

    val_preds = model_in.predict(X_tst)
    trn_preds = model_in.predict(X_in)
    test_preds = model_in.predict(test_in)

    # The train score must be computed against the target that belongs to X_in.
    # Callers can now pass it explicitly instead of relying on the global `y`.
    y_full = y if y_in is None else y_in

    score_trn = mean_squared_error(y_full, trn_preds)**0.5
    score_val = mean_squared_error(y_tst, val_preds)**0.5

    if print_scores:
        print("\nTrn Score:",score_trn)
        print("Val Score:",score_val)

    return test_preds, trn_preds, val_preds, score_trn, score_val, model_in

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (range 0-200).

    Each term is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``; a term whose
    denominator is zero (both values are zero) contributes 0.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Actual and predicted values; any inputs that broadcast against each other.

    Returns
    -------
    float
        Mean of the per-element SMAPE terms.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use |y_true| as well as |y_pred|: with a signed y_true the denominator could
    # cancel to zero (or go negative) and produce meaningless terms.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the other entries keep the 0.0 from `out`.
    diff = np.divide(abs_error, denominator, out=np.zeros_like(abs_error), where=denominator != 0)
    return np.mean(diff)

In [None]:
def detect_outliers(data):
    """Percentage of rows per column that fall outside the 1.5 * IQR fences.

    Returns a one-column DataFrame ('Outlier_percentage') indexed by column name
    and sorted in ascending order of that percentage.
    """
    n_rows = data.shape[0]
    percent_by_column = {}

    for col_name in data.columns:
        values = data[col_name]
        first_q = np.quantile(values, 0.25)
        third_q = np.quantile(values, 0.75)
        spread = third_q - first_q
        fence_low, fence_high = first_q - 1.5 * spread, third_q + 1.5 * spread

        flagged = (values < fence_low) | (values > fence_high)
        percent_by_column[col_name] = (int(flagged.sum()) / n_rows) * 100

    report = pd.DataFrame(data = list(percent_by_column.values()),
                          index = list(percent_by_column.keys()),
                          columns = ['Outlier_percentage'])
    return report.sort_values(by = 'Outlier_percentage')

# 📃 Quick Analysis 📃

In [None]:
# Correlation matrix over train and test rows together; only the lower triangle is drawn.
plt.figure(figsize = (25,12))

corr = pd.concat([df_train, df_test], axis = 0).corr()
upper_triangle = np.triu(np.ones(corr.shape, dtype = bool))

sns.heatmap(corr, mask = upper_triangle, annot = True, cmap = "Spectral", vmin = -1, vmax = 1)
plt.title("Correlation of all features and target", fontsize= 18)
plt.show()

In [None]:
# One panel per (x column, y column, colour, title) combination.
fig,ax = plt.subplots(1,3 ,figsize =(25,7))
panels = [
    ("squareMeters", "price", "b", "SquareMeters vs Price"),
    ("sym_1", "price", "r", "sym_1 vs price"),
    ("squareMeters", "sym_1", "g", "SquareMeters vs sym_1"),
]
for axis, (x_col, y_col, colour, _) in zip(ax, panels):
    sns.scatterplot(x = df_train[x_col], y = df_train[y_col], ax = axis, color = colour)
for axis, (_, _, _, panel_title) in zip(ax, panels):
    axis.set_title(panel_title)
fig.suptitle("Relationship plots of correlated features and target", fontsize = 15)
plt.show()

# 🎯 Feature Engineering 🎯
* Here we create features and process our data, by steps that were identified in our EDA

In [None]:
# Work on deep copies so the frames loaded from disk stay untouched.
df_trn = df_train.copy(deep = True)
df_tst = df_test.copy(deep = True)
df_trn

In [None]:
# Report and drop exact duplicate rows from the TRAINING data only.
# Test rows are never dropped or re-indexed: every id in the sample submission needs a
# prediction, and the prediction array is later assigned to the submission positionally.
print("Duplicated rows:",df_trn[df_trn.duplicated()][target].value_counts())
df_trn = df_trn.drop_duplicates(ignore_index = True)
print(df_trn.duplicated().sum())

In [None]:
def Additional_Features(df_in):
    """Feature-engineering hook: currently adds nothing and returns a deep copy of ``df_in``."""
    return df_in.copy(deep = True)

# Run the feature-engineering hook on both frames, then preview the training frame.
df_trn = Additional_Features(df_trn)
df_tst = Additional_Features(df_tst)
df_trn

### Made column 

In [None]:
# Look at the training rows with made == 10000, to compare their square meters / price
# with matching train and test rows.
df_trn[df_trn.made ==10000]

In [None]:
# Rows sharing squareMeters == 68038 in the raw train and test frames.
# print("Train data where price ==6807415.1")
# display(df_train[df_train.price ==6807415.1]) 
print("Train data where sqrMeters ==68038")
display(df_train[df_train.squareMeters ==68038]) 

print("\nTest data where sqrMeters ==68038")
display(df_test[df_test.squareMeters ==68038]) 

In [None]:
# Rows with price == 8007951.1 or squareMeters == 80062.
# NOTE(review): the test lookup here uses df_tst while the previous cell uses df_test.
print("\nTrain data where price ==8007951.1")
display(df_train[df_train.price ==8007951.1]) 
print("Train data where sqrMeters ==80062")
display(df_train[df_train.squareMeters ==80062]) 

print("\nTest data where sqrMeters ==80062")
display(df_tst[df_tst.squareMeters ==80062])

In [None]:
if MADE_REPLACE:
    print("replacing")
    # Overwrite made == 10000 on two training rows, matched by their squareMeters value.
    # NOTE(review): only df_trn is modified; df_tst is left as is — confirm this is intended.
    df_trn.loc[(df_trn.made ==10000)&(df_trn.squareMeters ==68038), "made"] =2000
    df_trn.loc[(df_trn.made ==10000)&(df_trn.squareMeters ==80062), "made"] =2015

# Any rows still carrying made == 10000 after the replacement.
df_trn[df_trn.made ==10000]

#### CITY CODE encoding

In [None]:
if CITYCODE_ENC:
    from sklearn.preprocessing import LabelEncoder
    # Re-read the original competition files to recover the raw cityCode values.
    df_original_trn = pd.read_csv("/kaggle/input/playground-series-s3e6/train.csv", index_col = 0)
    df_original_tst = pd.read_csv("/kaggle/input/playground-series-s3e6/test.csv", index_col = 0)


    if ADD_DATA:
        # Append the additional dataset, flagged is_generated = 0, to the original train rows.
        add_data = pd.read_csv('/kaggle/input/paris-housing-price-prediction/ParisHousing.csv')

        df_original_trn['is_generated'] = 1
        df_original_tst['is_generated'] = 1
        add_data['is_generated'] = 0

        df_original_trn = pd.concat([df_original_trn, add_data],axis=0, ignore_index=True)
        
    # label encode (ohe is worse for performance both in score and model time)
    # The encoder is fitted on train + test values so every cityCode seen in either set has a label.
    enc = LabelEncoder()
    enc.fit(pd.concat([df_original_trn.drop(target,axis =1), df_original_tst],axis=0, ignore_index=True)["cityCode"])
    # NOTE(review): these assignments need df_trn / df_tst to have exactly the rows, in the same order,
    # of the original files; duplicate rows were dropped from the working frames earlier — verify alignment.
    df_trn["cityCode"] = enc.transform(df_original_trn["cityCode"])
    df_tst["cityCode"] = enc.transform(df_original_tst["cityCode"])

### Drop Cols 
Below we will create sets of columns to drop for each model. These columns will be identified by trial and error as well as Recursive Feature Elimination.

In [None]:
# Columns to remove for every tree model (none at the moment).
all_drop_cols = []

# Columns each tree model keeps: currently every non-target column.
cat_cols = [col for col in df_trn.columns if col not in [target]]
lgb_cols =[col for col in df_trn.columns if col not in [target]]
xgb_cols = [col for col in df_trn.columns if col not in [target]] #['squareMeters', 'numberOfRooms', 'hasYard', 'made', 'sym_1'] + ["is_generated"]
rf_cols = [col for col in df_trn.columns if col not in [target]]

# Columns used by the linear models. Each name appears exactly once: a repeated name would
# select the same column twice and feed the regression a perfectly collinear feature.
#lin_cols = [col for col in df_trn.columns if col not in ["squareMeters","price"]]
lin_cols = ["sym_1","made","numberOfRooms","hasStorageRoom","hasGuestRoom", "attic", "cityPartRange", "hasStormProtector", "numPrevOwners", "garage"]

In [None]:
# Restrict both frames to the columns chosen for the selected tree model.
# All tree models go through the same path: take the model's keep-list, remove anything
# listed in all_drop_cols, then select. This keeps the branches consistent and avoids
# asking for a column that has already been dropped.
# "ET" and the linear models keep every column here; the linear models pick lin_cols
# themselves at fit time.
model_feature_cols = {
    "xgboost": xgb_cols,
    "lightgbm": lgb_cols,
    "rf": rf_cols,
    "catboost": cat_cols,
}

if name in model_feature_cols:
    keep_cols = [col for col in model_feature_cols[name] if col not in all_drop_cols]
    df_trn = df_trn[keep_cols + [target]]
    df_tst = df_tst[keep_cols]


In [None]:
# Columns that remain in the training frame after selection.
print(list(df_trn.columns))

### Inf and nan values
Check to see if we have any NaN or infinite values.

In [None]:
# Total number of infinite entries, then total number of NaN entries, in the training frame.
print(np.isinf(df_trn).sum().sum())#.sort_values(ascending = False))
print(np.isnan(df_trn).sum().sum()) #.sort_values(ascending = False))

### Outliers
NB for linear regression 

In [None]:
# from IPython.display import display_html 


# df1_styler = detect_outliers(df_trn).style.set_table_attributes("style='display:inline'").set_caption('Train')
# df2_styler = detect_outliers(df_tst).style.set_table_attributes("style='display:inline'").set_caption('Test')

# #display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)
# display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

In [None]:
def outlier_removal(data , i):
    """Drop from ``data``, in place, the rows whose column ``i`` lies outside the 1.5 * IQR fences.

    The fences are computed once from the column's 25% and 75% quantiles; nothing is returned.
    """
    first_q = np.quantile(data[i], 0.25)
    third_q = np.quantile(data[i], 0.75)
    spread = third_q - first_q
    fence_low, fence_high = first_q - 1.5*spread, third_q + 1.5*spread

    outside = (data[i] > fence_high) | (data[i] < fence_low)
    data.drop(data.index[outside.to_numpy()], inplace = True)
    
if OUTLIERS_DROP:
    outlier_list = ['made']
    for i in outlier_list:
        # Operate on the working frame that the models are trained on (df_trn);
        # dropping rows from the raw df_train would leave the model data unchanged.
        outlier_removal(df_trn, i)
    # Restore a contiguous 0..n-1 index so positional fold indices and index labels agree.
    df_trn.reset_index(drop = True, inplace = True)

# 🧫 PCA Implementation 🧫
* We can also create features using PCA and Kmeans Clustering. However this can cause overfitting 
* Another purpose of PCA is to remove correlation in our data. This will help stop our model from overfitting however there can be data loss during the process 

#### NOTE: PCA is best to use cautiously as it can worsen your models performance 

In [None]:
PCA_ON= False
if PCA_ON:
    # Absolute pairwise correlations over train + test features, lower triangle only.
    c = pd.concat([df_trn.drop(target,axis =1), df_tst]).corr().abs()
    #np.fill_diagonal(c.values, 0)
    trimask = np.triu(np.ones_like(c, dtype=bool))
    c= c.mask(trimask)
    s = c.unstack()
    so = s.sort_values(ascending=False)

    # An expression inside an `if` block is not rendered by the notebook, so display it explicitly.
    # The values are absolute correlations, hence only the positive threshold is needed.
    display(so[so > 0.5].head(10))

In [None]:
if PCA_ON:
    from sklearn.decomposition import PCA  

    # Replace the two listed columns by a single whitened principal component,
    # fitted on the training rows and then applied to the test rows.
    pca_cols = ["sym_1", "squareMeters"]
    pca = PCA(n_components=1 ,whiten=True)
    df_trn_pca = pca.fit_transform(df_trn[pca_cols])
    df_tst_pca = pca.transform(df_tst[pca_cols])


    df_trn["PCA1"] = df_trn_pca
    df_tst["PCA1"] = df_tst_pca
    df_trn.drop(pca_cols,axis =1,inplace = True)
    df_tst.drop(pca_cols,axis =1,inplace = True)
df_trn

# 🔬 OPTUNA 🔬


In [None]:
# Feature matrix and target vector used by the tuning and cross-validation cells.
X = df_trn.drop([target],axis =1)
y= df_trn[target]

In [None]:
def objective(trial,X=X,y=y):
    """Optuna objective: validation RMSE of an XGBoost regressor on a fixed 85/15 hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X, y
        Features and target; they default to the notebook-level ``X`` / ``y`` as bound
        when this function is defined.

    Returns
    -------
    float
        RMSE on the hold-out part of the split (to be minimised).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,random_state=42)
    param = {
        'booster' : 'gbtree', #, gblinear dart gbtree
        'tree_method':'hist',  #gpu_hist
        # suggest_float(..., log=True) is the supported replacement for the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        # NOTE(review): the random seed is searched like a hyper-parameter — confirm this is intended.
        'random_state': trial.suggest_categorical('random_state', [1,2,3,4,5]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'objective' : "reg:squarederror",
        'n_estimators' : EPOCHS, 
        'early_stopping_rounds' :30,
    }
    model = xgb.XGBRegressor(**param)  
    
    # The model built here is always XGBoost, so request the XGBoost fit path explicitly
    # (the one that supplies the eval_set needed by early stopping) instead of relying on
    # the notebook-level `name`, which may select another model.
    _, _, _, _, score_val_fold, _ = base_model(X_train,y_train,X_test,y_test, "xgboost",model, X, df_tst, False)
    
    return score_val_fold

In [None]:
# Optional hyper-parameter search: 30 trials minimising the value returned by `objective`.
OPTUNA = False
if OPTUNA:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    print('Number of finished trials:', len(study.trials))
    # Parameters of the best trial; merged into xgb_params further down when name == "xgboost".
    Best_params = study.best_trial.params
    
    print('Best trial:', Best_params)

In [None]:
# Hyper-parameters per model family; EPOCHS sets the number of trees / iterations everywhere.
lgb_params ={'objective': 'regression',# alternatives: regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie
             "metric":"rmse", 
             "boosting": "gbdt",# alternatives: "dart", "gbdt"
#              'lambda_l1': 1.0050418664783436e-08, 
#              'lambda_l2': 9.938606206413121,
#              'num_leaves': 44,
#               'feature_fraction': 0.8247273276668773,
#               'bagging_fraction': 0.5842711778104962,
#             'bagging_freq': 6,
#              'min_data_in_leaf': 134,
#              'min_child_samples': 70,
#              'max_depth': 8,   
             'n_estimators':EPOCHS,
             'learning_rate':0.08,
            'device':'cpu'}

xgb_params = { 
    'objective' : "reg:squarederror",
    'n_estimators' : EPOCHS, 
    'early_stopping_rounds' :30,
    'eval_metric' : "rmse",
    'learning_rate': 0.55,
    #'max_depth': 5,
    'booster' : 'gbtree'# alternatives: 'gbtree', 'dart'
             }
cat_params = {'iterations':EPOCHS,
              'eval_metric' : "RMSE", 
              'learning_rate': 0.08,
              'loss_function':'RMSE'
             }
ET_params = {'max_depth':25, 'n_estimators':EPOCHS}
RF_params = { 'max_depth':10,'n_estimators':EPOCHS,
            }


In [None]:
# Overlay the tuned values on the XGBoost defaults when a search was run for XGBoost.
if OPTUNA and name =="xgboost":
    xgb_params.update(Best_params)

# 🦠 Cross Validation 🦠

In [None]:
# NUM_FOLDS folds repeated NUM_SPLITS times, with a fixed seed for reproducible splits.
kfold = RepeatedKFold(n_splits=NUM_FOLDS, n_repeats=NUM_SPLITS, random_state=42)

# Instantiate the estimator selected by `name`. A single if/elif chain with a final
# `else` guarantees that `model` is always defined, or that a clear error is raised.
if name =="xgboost":
    model= xgb.XGBRegressor(**xgb_params)
elif name =="lightgbm":
    model= lgb.LGBMRegressor(**lgb_params)
elif name == "rf":
    model = RandomForestRegressor(**RF_params)
elif name == "catboost":
    model = cat.CatBoostRegressor(**cat_params)
elif name == "LinearRegression":
    model = LinearRegression()
elif name == "Ridge":
    model = Ridge()
elif name == "ET":
    model =ExtraTreesRegressor(**ET_params)
else:
    raise ValueError(
        f"Unknown model name {name!r}; expected one of "
        "'xgboost', 'lightgbm', 'rf', 'catboost', 'LinearRegression', 'Ridge', 'ET'"
    )

We will validate our models only on Non-Added data i.e. validate only on the data provided by the competition

In [None]:
# Boolean mask of the rows flagged is_generated == 1.
# NOTE(review): X_validation is not read anywhere else in the visible notebook.
if TEST_ON_GENERATED_ONLY:
    X_validation = X["is_generated"]==1

In [None]:
def cross_val(X,y, df_tst_in):
    """Cross-validate the notebook-level ``model`` with the repeated K-fold splitter ``kfold``.

    Besides its arguments the function reads these notebook-level names: ``kfold``, ``model``,
    ``name``, ``lin_cols``, ``SCALING``, ``SCALER``, ``TEST_ON_GENERATED_ONLY``, ``SHAP_VALS``,
    ``NUM_FOLDS`` and ``NUM_SPLITS``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target, sharing its index with ``X``.
    df_tst_in : pandas.DataFrame
        Test features, predicted once per fold.

    Returns
    -------
    tuple
        ``(OOF_preds, OOF_trn_preds, OOF_val_preds, score_train, score_val, feature_importance)``,
        with ``list_shap_values`` inserted before ``feature_importance`` when ``SHAP_VALS`` is True.

        * ``OOF_preds`` / ``OOF_trn_preds``: per-fold predictions for the test set / the full ``X``.
        * ``OOF_val_preds``: one column per fold holding that fold's validation predictions divided
          by ``NUM_SPLITS``, so the row-wise sum is the mean validation prediction over the repeats.
        * ``score_train`` / ``score_val``: per-fold RMSE lists.
        * ``feature_importance``: sum over folds of ``feature_importances_`` (left at zero for
          the linear models).
    """
    linear_names = ["LinearRegression",'Ridge', "SVR"]

    score_train = []
    score_val= []
    score2_val = []
    OOF_preds = []
    list_shap_values= []
    OOF_trn_preds = []
    feature_importance = pd.DataFrame(0.0, index = X.columns, columns = ["Importance"])

    # Float-initialised so that writing float predictions does not change the column dtype.
    fold_columns = [col for col in range(NUM_SPLITS*NUM_FOLDS)]
    if TEST_ON_GENERATED_ONLY:
        index_ = X[X["is_generated"]==1].index
        OOF_val_preds=  pd.DataFrame(0.0, index = y[index_].index , columns = fold_columns)
    else:
        OOF_val_preds= pd.DataFrame(0.0, index = y.index , columns = fold_columns)


    for fold, (train_idx,val_idx) in enumerate(kfold.split(X,y)):
        X_train,y_train = X.iloc[train_idx,:], y.iloc[train_idx]
        X_test,y_test = X.iloc[val_idx,:], y.iloc[val_idx]
        print("\n###### FOLD",fold,"######")

        if TEST_ON_GENERATED_ONLY:
            # Validate only on the rows flagged as generated.
            index_val = X_test[X_test["is_generated"]==1].index
            y_test = y_test[index_val]
            X_test = X_test.loc[index_val, :]

        #Scaling
        if name in linear_names:
            # Linear models: standardise, and use only the columns in lin_cols.
            scaler = StandardScaler()
            X_train_s = scaler.fit_transform(X_train[lin_cols])
            X_test_s  = scaler.transform(X_test[lin_cols])
            X_temp = X.copy(deep = True)[lin_cols]
            X_temp = scaler.transform(X_temp)
            test_temp = scaler.transform(df_tst_in[lin_cols])
            y_train_s = y_train.copy(deep = True)

        elif SCALING:
            X_train_s = SCALER.fit_transform(X_train)
            X_test_s  = SCALER.transform(X_test)
            X_temp = X.copy(deep = True)
            X_temp = SCALER.transform(X_temp)
            test_temp = SCALER.transform(df_tst_in)
            y_train_s = y_train.copy(deep = True)

        else:
            X_test_s = X_test.copy(deep = True)
            X_train_s = X_train.copy(deep = True)
            X_temp = X.copy(deep = True)
            test_temp = df_tst_in.copy(deep = True)
            y_train_s = y_train.copy(deep = True)

        #Fit
        test_preds_fold, trn_preds_fold, val_preds_fold, score_train_fold, score_val_fold , model_out= base_model(X_train_s,y_train_s,X_test_s,y_test, name,model, X_temp, test_temp, False)
        if name not in linear_names:
            feature_importance += pd.DataFrame(model_out.feature_importances_,index = X.columns, columns = ["Importance"])

        # Store this fold's validation predictions by index LABEL (y_test.index). The splitter
        # yields positions, which only coincide with labels while the index is 0..n-1.
        OOF_val_preds.loc[y_test.index, fold] = np.asarray(val_preds_fold)/NUM_SPLITS

        r2_fold = r2_score(y_test, val_preds_fold)
        score_val.append(score_val_fold)
        score2_val.append(r2_fold)
        score_train.append(score_train_fold)
        OOF_trn_preds.append(trn_preds_fold )
        OOF_preds.append(test_preds_fold)

        #Shap values of the model fitted in this fold
        if SHAP_VALS:
            explainer = shap.Explainer(model_out)
            shap_values_cv = explainer.shap_values(test_temp)
            list_shap_values.append(shap_values_cv)

        print(f"Trn score: {score_train_fold:.0f}")
        print(f"Val score: {score_val_fold:.0f}")
        print(f"Val Score2: {r2_fold:.4f}")


    # Row-wise sum over the fold columns = mean validation prediction over the repeats.
    oof_mean = OOF_val_preds.sum(axis=1)
    if TEST_ON_GENERATED_ONLY:
        print("\nMEAN Trn score:",np.mean(score_train))
        print("MEAN Val score:",mean_squared_error(y[index_], oof_mean)**0.5)
        print(f"MEAN Val Score2: { r2_score(y[index_], oof_mean) }")

    else:
        print("\nMEAN Trn score:",np.mean(score_train))
        print("MEAN Val score:",mean_squared_error(y, oof_mean)**0.5)
        print(f"MEAN Val Score2:{ r2_score(y, oof_mean)}")

    if SHAP_VALS:
        return OOF_preds, OOF_trn_preds, OOF_val_preds, score_train, score_val, list_shap_values, feature_importance
    else:
        return OOF_preds, OOF_trn_preds, OOF_val_preds, score_train, score_val, feature_importance

In [None]:
# Run the cross-validation; the returned tuple carries one extra element (SHAP values) when SHAP_VALS is on.
if SHAP_VALS:
    test_preds, train_preds, val_preds, OOF_trn_score, OOF_val_score, shap_values, feature_importance = cross_val(X,y, df_tst)
else:
    test_preds, train_preds, val_preds, OOF_trn_score, OOF_val_score ,feature_importance= cross_val(X,y, df_tst)

In [None]:
# Overall out-of-fold RMSE: the per-fold columns of val_preds are summed row-wise before scoring.
if TEST_ON_GENERATED_ONLY:
    val_score = mean_squared_error(y[X[X["is_generated"]==1].index],val_preds.sum(axis=1) )**0.5
else:
    val_score = mean_squared_error(y,val_preds.sum(axis=1) )**0.5

In [None]:
# Submission built from the plain mean of the per-fold test predictions;
# the file name records the model name and the out-of-fold RMSE.
sub[target]  = 0
sub_cv=  sub.copy(deep= True)
sub_cv[target] = np.mean(test_preds,axis =0)
sub_cv.to_csv(f"sub_cv_{name}_{val_score:.0f}.csv")
sub_cv.head()

### Weighted

In [None]:
# Blend the per-fold predictions with weights derived from each fold's validation RMSE.
# RMSE is an error (lower is better), so each fold is weighted by the INVERSE of its score;
# weighting by the score itself would favour the worst folds.
fold_weights = 1.0 / np.asarray(OOF_val_score, dtype = float)
fold_weights = fold_weights / fold_weights.sum()

sub_ensemble = sub.copy(deep = True)
sub_ensemble[target] = np.average(np.asarray(test_preds), axis = 0, weights = fold_weights)

# The train-side blend is built from predictions only. Seeding the accumulator with `y`
# would mix the true target into the values that are later scored against `y`.
sub_ensemble_trn = np.average(np.asarray(train_preds), axis = 0, weights = fold_weights)
sub_ensemble.head()

In [None]:
# RMSE of the weighted blend on the training rows, also used in the submission file name.
weighted_score = mean_squared_error(y,sub_ensemble_trn )**0.5
print("Weighted Score",weighted_score)

sub_ensemble.to_csv(f"sub_weighted_ensemble_{weighted_score:.0f}.csv")
sub_ensemble.head()

### Calibration

In [None]:
# Get a dataframe of train and test predictions 
all_trn_preds = pd.DataFrame(index = df_trn.index)
all_tst_preds = pd.DataFrame(index = df_tst.index)

for i, preds in enumerate(train_preds):
    all_trn_preds[f"{OOF_val_score[i]}"] = train_preds[i]
    all_tst_preds[f"{OOF_val_score[i]}"] = test_preds[i]
all_tst_preds.head()

In [None]:
# Correlation between the train predictions of the individual folds.
plt.figure(figsize = (20,7))
sns.heatmap(all_trn_preds.corr(), vmin=-1,vmax=1,cmap='Spectral',annot=True)
plt.show()

In [None]:
# Meta-model that calibrates (stacks) the per-fold predictions, selected by CALIBRATION.
# (A stray `CALIBRATION=='ridge'` comparison used to sit here; it had no effect and was removed.)
if CALIBRATION == "linear":
    lin_model = LinearRegression()
elif CALIBRATION=='ridge': 
    lin_model = Ridge(alpha=2)
else:
    raise ValueError(f"Unknown CALIBRATION {CALIBRATION!r}; expected 'linear' or 'ridge'")

# Use a dedicated name for the meta-features so the notebook-level feature matrix `X`
# is not overwritten by this cell.
X_cal = all_trn_preds#.iloc[:,:1]
OOF_val_scores_lin = []
OOF_trn_lin= []

# Float-initialised so that writing float predictions does not change the column dtype.
OOF_val_lin=  pd.DataFrame(0.0, index = y.index , columns =[col for col in range(NUM_SPLITS*NUM_FOLDS)])

for fold, (train_idx,val_idx) in enumerate(kfold.split(X_cal,y)):
    X_train,y_train = X_cal.iloc[train_idx,:], y.iloc[train_idx]
    X_test,y_test = X_cal.iloc[val_idx,:], y.iloc[val_idx]
    
    test_preds_lin, trn_preds_lin, val_preds_lin, score_trn_lin, score_val_lin , model_out= base_model(X_train,y_train,X_test,y_test, "ridge",lin_model, X_cal, all_tst_preds, False)
    
    # Store by index LABEL: the splitter yields positions, which only match labels on a 0..n-1 index.
    OOF_val_lin.loc[y_test.index, fold] = np.asarray(val_preds_lin)/NUM_SPLITS
    
    OOF_val_scores_lin.append(score_val_lin)
    OOF_trn_lin.append(trn_preds_lin)
    
#cal_score = np.mean(OOF_val_scores_lin)
# Row-wise sum over the fold columns = mean validation prediction over the repeats.
cal_score = mean_squared_error(y,OOF_val_lin.sum(axis =1))**0.5
cal_trn_preds  = np.mean(OOF_trn_lin,axis=0)
print(f"Mean RSME: {cal_score:.0f}")

In [None]:
# Refit the calibration model on all rows, then predict the test set.
lin_model.fit(all_trn_preds, np.ravel(y)) # all_trn_preds
y_cal_test  = lin_model.predict(all_tst_preds)

# In-sample RMSE of the full fit.
full_fit_RSME = mean_squared_error(y, lin_model.predict(all_trn_preds))**0.5

print(f"Full fit RSME:{full_fit_RSME:.0f}")

# The submission file name records the cross-validated calibration score, not the in-sample one.
sub_cal = sub.copy(deep = True)
sub_cal[target] = y_cal_test
sub_cal.to_csv(f"sub_cal_{cal_score:.0f}.csv")
sub_cal.head()

# 🔎 Model Analysis (Partial Dependency Plots) 🔎
* Partial dependency plots help us understand how each feature behaves. The process varies the value of the selected feature and monitors the model's predicted output.

In [None]:
# from pdpbox import pdp, get_dataset, info_plots
# pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test
#                            , model_features=X.columns
#                            , feature="squareMeters")
# pdp.pdp_plot(pdp_dist, feature_name="squareMeters",figsize =(25,7))
# plt.tight_layout()
# plt.show()

In [None]:
# pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test_s
#                            , model_features=X.columns
#                            , feature="sym_1")
# pdp.pdp_plot(pdp_dist, feature_name="sym_1",figsize =(25,7))
# plt.tight_layout()
# plt.show()

## Feature importance 

In [None]:
# Importance bar chart, skipped for the linear models (no importances are accumulated for them).
if name not in ["LinearRegression",'Ridge', "SVR"]:
    feature_importance = feature_importance.sort_values(by= "Importance", ascending = False)
    plt.figure(figsize = (20,12))
    sns.barplot(x = feature_importance["Importance"], y= feature_importance.index)
    plt.show()

## Analyse Residuals 
Lets look at what incorrect predictions our model made

In [None]:
# Three stacked residual plots (sharing the y axis): calibration, weighted train blend, validation.
fig, ax = plt.subplots(3,1, figsize = (30,20),sharey = True)
ax = np.ravel(ax)

# Residuals of the calibration model's train predictions, averaged over folds.
res_cal = y -cal_trn_preds
ax[0].scatter(x = [i for i in range(len(res_cal))], y = res_cal, label = f"Calibration, Score: {cal_score}, (Var:{np.var(res_cal):.0f})", color = "r")
ax[0].set_title("Calibration Residuals", fontsize = 12)

# Residuals of the weighted ensemble on the training rows.
res_trn = y.values- sub_ensemble_trn
ax[1].scatter(x = [i for i in range(len(res_trn))], y = res_trn ,label = f"TRN score:{weighted_score:.0f} , (Var:{np.var(res_trn):.0f})")
ax[1].set_title("Train Residuals", fontsize = 12)

# Residuals of the out-of-fold validation predictions (fold columns summed row-wise).
if TEST_ON_GENERATED_ONLY:
    res_val = y[df_trn[df_trn["is_generated"]==1].index].values - val_preds.sum(axis =1)
else:
    res_val = y.values - val_preds.sum(axis =1)
ax[2].scatter(x = [i for i in range(len(res_val))], y = res_val, label = f"Val Score: {val_score:.0f} , (Var:{np.var(res_val):.0f})" ,color ="g")
ax[2].set_title("Validation Residuals", fontsize = 12)

fig.suptitle("Residual analysis", fontsize = 20)

ax[0].legend(fontsize= 15)
ax[1].legend(fontsize= 15)
ax[2].legend(fontsize= 15)
plt.tight_layout(pad =4)
plt.show()

## Analyse Outliers

In [None]:
# Box plot of all validation residuals next to a histogram of the extreme tails only.
# The lower tail is cut at its own quantile (1 - tail_q): negating the upper quantile is
# only right when the residuals are symmetric around zero.
tail_q = 0.9999
upper_cut = res_val.quantile(tail_q)
lower_cut = res_val.quantile(1 - tail_q)

# Separate axes: a box plot and a histogram drawn on the same axes are unreadable.
fig, ax = plt.subplots(1, 2, figsize = (25,7))
sns.boxplot(res_val, ax = ax[0])
res_val[(res_val > upper_cut) | (res_val < lower_cut)].hist(bins =20, ax = ax[1])
fig.suptitle("Residual Values: Outliers (boxplot and histogram of 0.9999 quantile)", fontsize = 15)
plt.show()

In [None]:
# Summary statistics of the final training frame.
df_trn.describe()

# 🧨 Feature Importance 🧨

In [None]:
# Importances summed over the folds, largest first.
# NOTE(review): for the linear models these values are never updated, so every bar is zero.
plt.figure(figsize=(20, 10))
sns.barplot(x="Importance", y=feature_importance.sort_values("Importance",ascending = False).index, data=feature_importance.sort_values("Importance",ascending = False))
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Features whose summed importance is exactly zero.
print("Features with zero importance")
print([col for col in feature_importance[feature_importance["Importance"] ==0].index])

In [None]:
# Columns of df_trn not listed among the zero-importance features (the target column is included, as it is part of df_trn).
print([col for col in df_trn.columns if col not in feature_importance[feature_importance["Importance"] ==0].index])

# 🟧 Shap Values 🟧
* Shap values are a great way to visualise how our model processed these new features

In [None]:
shap.initjs() # load SHAP's JavaScript so its interactive plots can render

NB: we are working with the SHAP values computed on the test data, so the SHAP values might be wrong (as they would be for any validation).

In [None]:
# Mean of the ABSOLUTE SHAP values over all folds (i.e. the magnitude of each feature's effect).
if SHAP_VALS:
    folds = NUM_SPLITS* NUM_FOLDS

    # Start from the first fold's share, then add every other fold's share.
    shap_values_cv = np.abs(shap_values[0])/folds

    for i  in range(1,len(shap_values)):
        shap_values_cv = shap_values_cv +(np.abs(shap_values[i]))/folds

    # NOTE(review): for the linear models the SHAP values are computed on the lin_cols subset,
    # while df_tst holds all columns — verify that the shapes match.
    shap.summary_plot(shap_values_cv, df_tst,max_display = 4000, plot_size = [30,20], show= False)
    plt.title("Total Shap values across all folds", fontsize = 20)
    plt.show()

### Shap interaction values
* The interaction plots help us understand how each feature "interacts" with the others with respect to our model's prediction
* I have excluded interaction value as it uses too much memory to concatenate all the values from the CV (however we can do this for one model if needed)

In [None]:
# #Get Shap values
# explainer_i = shap.Explainer(model)
# shap_values_ = explainer.shap_values(test_temp)
# # Get interaction values
# shap_interaction_values = explainer_i.shap_interaction_values(test_temp)

# Score

In [None]:
# Final summary: mean per-fold train RMSE, out-of-fold RMSE, and the calibration RMSEs
# (cross-validated, then in-sample on the full fit).
print(f"OOF TRN Score: {np.mean(OOF_trn_score,axis =0):.0f}")
print(f"OOF val Score: {val_score:.0f}")
#print(f"Weighted Score: {weighted_score:.0f}")
print(f"Calibrated Score val: {cal_score:.0f}, trn :{full_fit_RSME:.0f}")

#### catboost

lr 0.08
* cal trn 64726

#### lightgbm 

learning rate 0.08
* cal trn : 106709

#### ET 

depth 25 (PL =182990)
* OOF TRN Score: 66345
* OOF val Score: 178190
* Calibrated Score val: 150248, trn :24401

depth 20
* oof TRN score: 66395
* oof val score:178286
* cal trn: 24615

depth 15
* OOF TRN Score: 69217
* OOF val Score: 179278
* Calibrated Score val: 210068, trn :37458

#### XGBOOST 

'learning_rate': 0.55
* OOF TRN Score: 71658
* OOF val Score: 148860
* Calibrated Score val: 73104, trn :17616

'learning_rate': 0.80 + 5 max_depth
* OOF TRN Score: 82488
* OOF val Score: 147408
* Calibrated Score val: 67573, trn :26871

#### Lin Regression 

Dropped SquareMeters
* OOF TRN Score: 190256
* OOF val Score: 228184
* Calibrated Score val: 190347, trn :190243

baseline
* OOF TRN Score: 813998
* OOF val Score: 3987262
* Calibrated Score val: 3322787, trn :186983