In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import math
import sklearn
import itertools

from sklearn.utils import resample
from sklearn.neighbors import NearestNeighbors

In [3]:
Data = pd.read_csv('House_Price_Regression.csv')

Data = Data.loc[:, ['latitude', 'longitude', 'price', 'size_in_m_2', 'balcony_recode', 'private_garden_recode', 'quality_recode']]

Data.head()

Unnamed: 0,latitude,longitude,price,size_in_m_2,balcony_recode,private_garden_recode,quality_recode
0,25.113208,55.138932,2700000,100.242337,1.0,0.0,2.0
1,25.106809,55.151201,2850000,146.972546,1.0,0.0,2.0
2,25.063302,55.137728,1150000,181.253753,1.0,0.0,2.0
3,25.227295,55.341761,2850000,187.66406,1.0,0.0,1.0
4,25.114275,55.139764,1729200,47.101821,0.0,0.0,2.0


## Algoritmos de validación

In [4]:
def simple_validation_random(D, k, response, random_seed, metric, model):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process

    N = len(D)

    D_train = D.sample(frac=k, replace=False, random_state=random_seed)

    D_test = D.drop( D_train.index , )

    X_train = D_train.loc[: , D_train.columns != response]
    Y_train = D_train.loc[: , response]

    X_test = D_test.loc[: , D_test.columns != response]
    Y_test = D_test.loc[: , response]

############################################################################

    if model == 'knn_regression' :

     # Training the model wit train sample

        knn_regression.fit(X_train, Y_train)


     # Making predictions with test sample

        Y_predict_test = knn_regression.predict( X_test ) 

    
    elif model == 'knn_classification' :

     # Training the model wit train sample

        knn_classification.fit(X_train, Y_train)


     # Making predictions with test sample

        Y_predict_test = knn_classification.predict( X_test ) 

############################################################################

 # Computing the test metric

    if metric == 'ECM' :  
        
        ECM_test = np.mean( (Y_predict_test - Y_test)**2 )

        return ECM_test

    elif metric == 'TA' :  
        
        TA_test = np.mean( (Y_predict_test == Y_test) )

        return TA_test

In [5]:
def repeated_random_simple_validation(D, k, B, response, random_seed, metric, model):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # B --> number of replications of the random simple validation algorithm

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process


    np.random.seed(random_seed)

    ECM_test_list , TA_test_list = [ ] , [ ]

    seed_array = np.random.randint(9999999, size=(B))


    if metric == 'ECM':

        for b in range(0,B) :

            ECM_test_list.append( simple_validation_random(D, k, response, random_seed=seed_array[b], metric=metric, model=model) )


        ECM_test = np.mean(ECM_test_list)    

        return ECM_test 


    elif metric == 'TA':

        for b in range(0,B) :

            TA_test_list.append( simple_validation_random(D, k, response, random_seed=seed_array[b], metric=metric, model=model) )


        TA_test = np.mean(TA_test_list)    

        return TA_test 

In [6]:
def repeated_K_Fold_CV(D, B, K, response, random_seed, metric, model):


    ECM_Repeted_K_Folds_vector , TA_Repeted_K_Folds_vector = [] , []

    np.random.seed(random_seed)


    for b in range(0, B):

        sample = resample(range(0, len(D)), n_samples=len(D), replace=False)

        df_sample = pd.DataFrame({'index': range(0,len(D)) , 'sample':sample})

        
        Q = []

        
        for q in np.arange(0 , 1 + 1/K , 1/K):

            Q.append( np.quantile( range(0, len(D)) , q ).round(0) )



        ECM_K_FOLDS_vector , TA_K_FOLDS_vector = [] , []

        for j in range(0, len(Q)-1):

            X_test = D.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , D.columns != response ] 
            Y_test = D.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , D.columns == response ]

            X_train = D.loc[ : , D.columns != response ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] )
            Y_train = D.loc[ : ,  D.columns == response ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'])      

            Y_test = Y_test.to_numpy()

        #######################################################################
            
            if model == 'knn_regression' :

              # Training the model wit train sample

                knn_regression.fit(X_train, Y_train)

               # Making predictions with test sample

                Y_predict_test = knn_regression.predict( X_test ) 

    
            elif model == 'knn_classification' :

               # Training the model wit train sample

                knn_classification.fit(X_train, Y_train)

               # Making predictions with test sample

                Y_predict_test = knn_classification.predict( X_test )  

        #######################################################################

            if metric == 'ECM' :  ECM_K_FOLDS_vector.append( np.mean( ( Y_predict_test - Y_test )**2 ) )

            elif metric == 'TA' :  TA_K_FOLDS_vector.append( np.mean( ( Y_predict_test == Y_test ) ) )

        #######################################################################
    
        if metric == 'ECM' : ECM_Repeted_K_Folds_vector.append( np.mean(ECM_K_FOLDS_vector) )

        elif metric == 'TA' : TA_Repeted_K_Folds_vector.append( np.mean(TA_K_FOLDS_vector) )

##########################################################################################################################

    if metric == 'ECM' :
        
        ECM_Repeted_K_Folds = np.mean(ECM_Repeted_K_Folds_vector)

        return  ECM_Repeted_K_Folds
        

    elif metric == 'TA' :
        
        TA_Repeted_K_Folds = np.mean(TA_Repeted_K_Folds_vector)

        return  TA_Repeted_K_Folds
    
    

## Grid search con 1 hiper parametros

In [7]:
def Grid_search(search_space, )

SyntaxError: expected ':' (4270892657.py, line 1)

In [None]:
Grid_Search_Metric_list = [] 

Search_Space = range(1,100)

for k in Search_Space:

    knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k ,  p=2, metric='minkowski')

    Grid_Search_Metric_list.append( repeated_random_simple_validation(D=Data, k=0.75, B=50, response='price', random_seed=123, metric='ECM', model='knn_regression') )

In [None]:
df = pd.DataFrame({'k': Search_Space  , 'ECM': Grid_Search_Metric_list})

In [None]:
df.sort_values(by='ECM')

Unnamed: 0,k,ECM
15,16,2.275714e+12
16,17,2.281876e+12
14,15,2.286231e+12
17,18,2.290355e+12
13,14,2.291663e+12
...,...,...
94,95,4.145928e+12
95,96,4.157107e+12
96,97,4.168150e+12
97,98,4.180820e+12


In [None]:
sns.lineplot(y=df.ECM , x=df.k, linewidth=2.5)

NameError: name 'sns' is not defined

## Grid search con 2 hiper parametros

In [9]:
def Grid_search(Data, response, model, validation, metric, Search_Space, random_seed=123, B=50, k=0.75, K=10):

   Grid_Search_Metric_list = []


   if validation == 'repeated_random_simple_validation':

      if model == 'knn_regression':

         hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0] ,  metric=h[1])

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_random_simple_validation(D=Data, k=k, B=B, response=response, random_seed=random_seed, metric=metric, model='knn_regression') )


      if model == 'knn_classification':

         hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

         for h in hyperparameter_combinations :

            # Setting the hyperparameters of the model

            knn_classification = sklearn.neighbors.KNeighborsClassifier(n_neighbors=h[0] ,  metric=h[1])

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_random_simple_validation(D=Data, k=k, B=B, response=response, random_seed=random_seed, metric=metric, model=model) )


##########################################################################################################################################################################

   if validation == 'repeated_K_Fold_CV':

      if model == 'knn_regression':

         hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

         for h in hyperparameter_combinations :

            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0] ,  metric=h[1])

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_K_Fold_CV(D=Data, K=K, B=B, response=response, random_seed=random_seed, metric=metric, model=model) )


      if model == 'knn_classification':

         hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

         for h in hyperparameter_combinations :

            # Setting the hyperparameters of the model

            knn_classification = sklearn.neighbors.KNeighborsClassifier(n_neighbors=h[0] ,  metric=h[1])

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_K_Fold_CV(D=Data, B=B, K=K, response=response, random_seed=random_seed, metric=metric, model=model) )


   return Grid_Search_Metric_list , hyperparameter_combinations

In [14]:
Search_Space=[ range(1,5) , ['euclidean','cosine','cityblock','manhattan'] ]

In [15]:
hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

In [32]:
hyperparameter_combinations[0][1]

'euclidean'

In [22]:
for h in hyperparameter_combinations:

    print([h[0],h[1]])

[1, 'euclidean']
[1, 'cosine']
[1, 'cityblock']
[1, 'manhattan']
[2, 'euclidean']
[2, 'cosine']
[2, 'cityblock']
[2, 'manhattan']
[3, 'euclidean']
[3, 'cosine']
[3, 'cityblock']
[3, 'manhattan']
[4, 'euclidean']
[4, 'cosine']
[4, 'cityblock']
[4, 'manhattan']


In [20]:
knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=hyperparameter_combinations[5][0] ,  metric=hyperparameter_combinations[0][1])

In [33]:
repeated_random_simple_validation(D=Data, k=0.75, B=5, response='price', random_seed=123, metric='ECM', model='knn_regression')

2818148343454.6445

In [16]:
hyperparameter_combinations

[(1, 'euclidean'),
 (1, 'cosine'),
 (1, 'cityblock'),
 (1, 'manhattan'),
 (2, 'euclidean'),
 (2, 'cosine'),
 (2, 'cityblock'),
 (2, 'manhattan'),
 (3, 'euclidean'),
 (3, 'cosine'),
 (3, 'cityblock'),
 (3, 'manhattan'),
 (4, 'euclidean'),
 (4, 'cosine'),
 (4, 'cityblock'),
 (4, 'manhattan')]

In [10]:
Grid_Search_Metric_list , hyperparameter_combinations = Grid_search(Data=Data, response='price', model='knn_regression', validation='repeated_random_simple_validation', metric='ECM', Search_Space=[ range(1,5) , ['euclidean','cosine','cityblock','manhattan'] ], random_seed=123, B=4, k=0.75, K=10)

NameError: name 'knn_regression' is not defined

In [30]:
Grid_Search_Metric_list

[2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767,
 2682062618931.5767]

In [None]:
Data=Data; response='price'; model='knn_regression'; validation='repeated_random_simple_validation'; metric='ECM'; search_space=[ range(1,5) , ['euclidean','cosine','cityblock','manhattan'] ]; random_seed=123; B=5; k=0.75; K=10


In [None]:

         Grid_Search_Metric_list2 = [] 

         hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0] ,  metric=h[1])

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list2.append( repeated_random_simple_validation(D=Data, k=k, B=B, response=response, random_seed=random_seed, metric=metric, model=model) )
 

 

In [None]:
Grid_Search_Metric_list2

[3410315906546.118,
 3098939607695.841,
 3457732670804.5205,
 3457732670804.5205,
 2818148343454.6445,
 2494758524116.1943,
 2865713508870.8525,
 2865713508870.8525,
 2494580500999.9214,
 2387839113506.316,
 2463128521481.6885,
 2463128521481.6885,
 2437517134420.8403,
 2267589769142.758,
 2377671864672.011,
 2377671864672.011]

In [None]:
Grid_Search_Metric_list = [] 
hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

for h  in hyperparameter_combinations :

         # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0] ,  metric=h[1])

         # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_random_simple_validation(D=Data, k=0.75, B=5, response='price', random_seed=123, metric='ECM', model='knn_regression') )


In [None]:
Grid_Search_Metric_list

[3410315906546.118,
 3098939607695.841,
 3457732670804.5205,
 3457732670804.5205,
 2818148343454.6445,
 2494758524116.1943,
 2865713508870.8525,
 2865713508870.8525,
 2494580500999.9214,
 2387839113506.316,
 2463128521481.6885,
 2463128521481.6885,
 2437517134420.8403,
 2267589769142.758,
 2377671864672.011,
 2377671864672.011]

In [None]:
grid_search_metric_list , hyperparameter_combi = Grid_search(Data=Data, response='price', model='knn_regression', validation='repeated_K_Fold_CV', metric='ECM', search_space=[ range(1,5) , ['euclidean','cosine','cityblock','manhattan'] ], random_seed=123, B=10, k=0.75, K=10)

In [None]:
df = pd.DataFrame({'hyperparameter_combi': hyperparameter_combi, 'ECM': grid_search_metric_list})

In [None]:
df.sort_values(by='ECM')

Unnamed: 0,hyperparameter_combi,ECM
0,"(1, euclidean)",2447087000000.0
1,"(1, cosine)",2447087000000.0
2,"(1, cityblock)",2447087000000.0
3,"(1, manhattan)",2447087000000.0
4,"(2, euclidean)",2447087000000.0
5,"(2, cosine)",2447087000000.0
6,"(2, cityblock)",2447087000000.0
7,"(2, manhattan)",2447087000000.0
8,"(3, euclidean)",2447087000000.0
9,"(3, cosine)",2447087000000.0


In [None]:
grid_search_metric_list , hyperparameter_combi = Grid_search(Data=Data, response='price', model='knn_regression', validation='repeated_random_simple_validation', metric='ECM', search_space=[ range(1,5) , ['euclidean','cosine','cityblock','manhattan'] ], random_seed=123, B=10, k=0.75, K=5)

In [None]:
Grid_Search_Metric_list , Search_Space_1_list, Search_Space_2_list = [] , [] , []

Search_Space_1 = range(1,5)

Search_Space_2 = ['euclidean','cosine','cityblock','manhattan']

Search_Space = [Search_Space_1 , Search_Space_2]

hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

for k in hyperparameter_combinations :

    Search_Space_1_list.append(k[0])  # n_neighbors
    Search_Space_2_list.append(k[1])  # metric

    knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k[0] ,  metric=k[1])

    Grid_Search_Metric_list.append( repeated_K_Fold_CV(D=Data, K=5, B=10, response='price', random_seed=123, metric='ECM', model='knn_regression') )

In [None]:
df = pd.DataFrame({'k': Search_Space_1_list, 'Distance': Search_Space_2_list, 'ECM': Grid_Search_Metric_list})

In [None]:
df

Unnamed: 0,k,Distance,ECM
0,1,euclidean,3501571000000.0
1,1,cosine,3348080000000.0
2,1,cityblock,3806527000000.0
3,1,manhattan,3806527000000.0
4,2,euclidean,2776179000000.0
5,2,cosine,2670369000000.0
6,2,cityblock,2837147000000.0
7,2,manhattan,2837147000000.0
8,3,euclidean,2552322000000.0
9,3,cosine,2559354000000.0


In [None]:
df = pd.DataFrame({'hyperparameter_combinations': hyperparameter_combinations, 'ECM': Grid_Search_Metric_list})

In [None]:
df.sort_values(by='ECM')

Unnamed: 0,hyperparameter_combinations,ECM
57,"(15, cosine)",2.245434e+12
61,"(16, cosine)",2.249944e+12
53,"(14, cosine)",2.255230e+12
65,"(17, cosine)",2.259429e+12
49,"(13, cosine)",2.268378e+12
...,...,...
192,"(49, euclidean)",3.199477e+12
1,"(1, cosine)",3.469934e+12
0,"(1, euclidean)",3.601305e+12
3,"(1, manhattan)",3.743214e+12


## Random Grid search con 2 hiper parametros

In [None]:
from sklearn.utils import resample

In [None]:
Grid_Search_Metric_list , Search_Space_1_list, Search_Space_2_list = [] , [] , []

Search_Space_1 = range(1,1000)

Search_Space_2 = ['euclidean','cosine','cityblock','manhattan']

hyperparameter_combinations = list( itertools.product(Search_Space_1, Search_Space_2) )

hyperparameter_combinations = resample(hyperparameter_combinations, n_samples=150, replace=False, random_state=123)

for k in hyperparameter_combinations :

    Search_Space_1_list.append(k[0])  # n_neighbors
    Search_Space_2_list.append(k[1])  # metric

    knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k[0] ,  metric=k[1])

    Grid_Search_Metric_list.append( repeted_random_simple_validation(D=Data, k=0.75, B=50, response='price', random_seed=123) )

In [None]:
df = pd.DataFrame({'k': Search_Space_1_list, 'Distance': Search_Space_2_list, 'ECM': Grid_Search_Metric_list})

In [None]:
df.sort_values(by='ECM')

Unnamed: 0,k,Distance,ECM
74,14,cosine,2.249664e+12
8,15,cosine,2.256147e+12
0,17,manhattan,2.260744e+12
56,18,manhattan,2.264480e+12
69,12,cosine,2.266248e+12
...,...,...,...
35,48,cityblock,3.168145e+12
81,48,manhattan,3.168145e+12
28,49,manhattan,3.184855e+12
2,49,cityblock,3.184855e+12


## Grid search como metodo de optimizacion de funciones matematicas

In [None]:
values = []

for x in range(-99999 , 99999) :

    values.append( x**2 )


In [None]:
df = pd.DataFrame({'x':range(-99999 , 99999)  , 'f(x)':values})

In [None]:
df.sort_values(by='f(x)')

Unnamed: 0,x,f(x)
99999,0,0
99998,-1,1
100000,1,1
100001,2,4
99997,-2,4
...,...,...
2,-99997,9999400009
199996,99997,9999400009
199997,99998,9999600004
1,-99998,9999600004


In [None]:
values , Search_Space_list = [] , []

Search_Space_1 = range(-100 , 100)

Search_Space_2 = range(-100 , 100)

hyperparameter_combinations = list( itertools.product(Search_Space_1, Search_Space_2) )

for x in hyperparameter_combinations :

    Search_Space_list.append(x)

    values.append( x[0]**2 + x[1]**2)


In [None]:
df = pd.DataFrame({'(x_1,x_2)':Search_Space_list , 'f(x_1,x_2)':values})

In [None]:
df.sort_values(by='f(x_1,x_2)', ascending=True )

Unnamed: 0,"(x_1,x_2)","f(x_1,x_2)"
20100,"(0, 0)",0
20101,"(0, 1)",1
20300,"(1, 0)",1
20099,"(0, -1)",1
19900,"(-1, 0)",1
...,...,...
39800,"(99, -100)",19801
200,"(-99, -100)",19801
199,"(-100, 99)",19801
1,"(-100, -99)",19801
