# Cross Validation in `Python`

In [24]:
import pandas as pd
import numpy as np
import math

In [46]:
import sklearn

from sklearn.neighbors import NearestNeighbors

In [25]:
Data = pd.read_csv('House_Price_Regression.csv')

Data = Data.loc[:, ['latitude', 'longitude', 'price', 'size_in_m_2', 'balcony_recode', 'private_garden_recode', 'quality_recode']]

Data.head()

Unnamed: 0,latitude,longitude,price,size_in_m_2,balcony_recode,private_garden_recode,quality_recode
0,25.113208,55.138932,2700000,100.242337,1.0,0.0,2.0
1,25.106809,55.151201,2850000,146.972546,1.0,0.0,2.0
2,25.063302,55.137728,1150000,181.253753,1.0,0.0,2.0
3,25.227295,55.341761,2850000,187.66406,1.0,0.0,1.0
4,25.114275,55.139764,1729200,47.101821,0.0,0.0,2.0


## Not random simple validation

In [68]:
def simple_validation_not_random(D, k, response):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # response --> have to be a string with the name of the response variable.

    N = len(D)

    D_train = D.iloc[0:(math.floor(k*N)+1) , :]

    D_test = D.iloc[(math.floor(k*N)+1):N , :]

    X_train = D_train.loc[: , D_train.columns != response]
    Y_train = D_train.loc[: , response]

    X_test = D_test.loc[: , D_test.columns != response]
    Y_test = D_test.loc[: , response]

############################################################################

    knn_regression.fit(X_train, Y_train)

    Y_predict_test = knn_regression.predict( X_test ) 

############################################################################

    ECM_test = np.mean( (Y_predict_test - Y_test)**2 )

############################################################################

    return ECM_test

In [69]:
simple_validation_not_random(D=Data, k=0.75, response='price')

2198758842331.1843

## Random simple validation

In [75]:
def simple_validation_random(D, k, response, random_seed):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process

    N = len(D)

    D_train = D.sample(frac=k, replace=False, random_state=random_seed)

    D_test = D.drop( D_train.index , )

    X_train = D_train.loc[: , D_train.columns != response]
    Y_train = D_train.loc[: , response]

    X_test = D_test.loc[: , D_test.columns != response]
    Y_test = D_test.loc[: , response]

############################################################################

    knn_regression.fit(X_train, Y_train)

    Y_predict_test = knn_regression.predict( X_test ) 

############################################################################

    ECM_test = np.mean( (Y_predict_test - Y_test)**2 )

############################################################################

    return ECM_test

In [76]:
simple_validation_random(D=Data, k=0.75, response='price', random_seed=123)

2505043526308.227

## Repeted random simple validation

In [100]:
def repeted_random_simple_validation(D, k, B, response, random_seed):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # B --> number of replications of the random simple validation algorithm

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process


    np.random.seed(random_seed)

    ECM_test_list = [ ]

    seed_array = np.random.randint(999999999, size=(1000))

    for b in range(0,B) :

        ECM_test_list.append( simple_validation_random(D, k, response, random_seed=seed_array[b]) )


    ECM_test = np.mean(ECM_test_list)    

    return ECM_test , ECM_test_list

In [101]:
ECM_test , ECM_test_list = repeted_random_simple_validation(D=Data, k=0.75, B=1000, response='price', random_seed=123)

In [103]:
ECM_test

2317146868016.2695

## K-folds

# Repeted K FOLDS CV


In [None]:
import pandas as pd
import numpy as np


In [None]:

url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Boston.csv'

Boston = pd.read_csv(url)

In [None]:
url = 'https://raw.githubusercontent.com/FabioScielzoOrtiz/Estadistica4all-blog/main/Linear%20Regression%20in%20Python%20and%20R/properties_data.csv'

House_Price = pd.read_csv(url)

In [None]:
def varcharProcessing(X, varchar_process = "dummy_dropfirst"):
    
    dtypes = X.dtypes

    if varchar_process == "drop":   
        X = X.drop(columns = dtypes[dtypes == np.object].index.tolist())

    elif varchar_process == "dummy":
        X = pd.get_dummies(X,drop_first=False)

    elif varchar_process == "dummy_dropfirst":
        X = pd.get_dummies(X,drop_first=True)

    else: 
        X = pd.get_dummies(X,drop_first=True)
    
    X["intercept"] = 1
    cols = X.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    X = X[cols]
    
    return X

In [None]:
House_Price = House_Price.loc[: , (House_Price.columns != 'neighborhood') &  (House_Price.columns != 'id') ]

In [None]:
House_Price = varcharProcessing(House_Price, varchar_process = "dummy_dropfirst")

In [None]:
House_Price.dtypes

intercept                int64
latitude               float64
longitude              float64
price                    int64
size_in_sqft             int64
price_per_sqft         float64
no_of_bedrooms           int64
no_of_bathrooms          int64
maid_room                 bool
unfurnished               bool
balcony                   bool
barbecue_area             bool
built_in_wardrobes        bool
central_ac                bool
childrens_play_area       bool
childrens_pool            bool
concierge                 bool
covered_parking           bool
kitchen_appliances        bool
lobby_in_building         bool
maid_service              bool
networked                 bool
pets_allowed              bool
private_garden            bool
private_gym               bool
private_jacuzzi           bool
private_pool              bool
security                  bool
shared_gym                bool
shared_pool               bool
shared_spa                bool
study                     bool
vastu_co

In [None]:
import sklearn

from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression().fit(House_Price.loc[: , House_Price.columns != 'price' ], House_Price.loc[: , House_Price.columns == 'price' ])

In [None]:
from sklearn.linear_model import LinearRegression

import math

In [None]:
def Repeted_K_Fold_CV( Data, response_name , K , n_iter ):


    from sklearn.utils import resample

    ECM_Repeted_K_Folds_vector = []

    size_particiones_test = []


    for iter in range(0, n_iter):

        sample = resample(range(0, len(Data)), n_samples=len(Data), replace=False, random_state=iter)

        df_sample = pd.DataFrame({'index': range(0,len(Data)) , 'sample':sample})

        
        Q = []

        
        for q in np.arange(0 , 1 + 1/K , 1/K):

            Q.append( np.quantile( range(0, len(Data)) , q ).round(0) )



        ECM_K_FOLDS_vector = []

        for j in range(0, len(Q)-1):

            X_test = Data.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , Data.columns != response_name ] 
            Y_test = Data.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , Data.columns == response_name ]

            X_train = Data.loc[ : , Data.columns != response_name ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] )
            Y_train = Data.loc[ : ,  Data.columns == response_name ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'])

            size_particiones_test.append( len(X_test) )
            

            Y_test = Y_test.to_numpy()

            model = LinearRegression().fit(X_train, Y_train)

            ECM_K_FOLDS_vector.append( (( model.predict(X_test) - Y_test )**2 ).sum() / len(Y_test) )

    
        ECM_Repeted_K_Folds_vector.append( np.array(ECM_K_FOLDS_vector).mean() )
    
    ECM_Repeted_K_Folds = np.array( ECM_Repeted_K_Folds_vector ).mean()
    
    return(ECM_Repeted_K_Folds_vector , ECM_Repeted_K_Folds, size_particiones_test)

In [None]:
ECM_Repeted_K_Folds_vector , ECM_Repeted_K_Folds, size_particiones_test = Repeted_K_Fold_CV( Data=House_Price , response_name='price' , K=10 , n_iter=50 )

In [None]:
ECM_Repeted_K_Folds

872856402907.0691

In [None]:
len(size_particiones_test)

500

In [None]:
tamaños_particiones

[191, 190, 191, 190, 190, 191, 190, 191, 190]