In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import math
import sklearn
import itertools

from sklearn.utils import resample
from sklearn.neighbors import NearestNeighbors

In [3]:
Data = pd.read_csv('House_Price_Regression.csv')

Data = Data.loc[:, ['latitude', 'longitude', 'no_of_bathrooms', 'no_of_bedrooms', 'price', 'size_in_m_2', 'balcony_recode', 'private_garden_recode', 'quality_recode']]

Data.head()

Unnamed: 0,latitude,longitude,no_of_bathrooms,no_of_bedrooms,price,size_in_m_2,balcony_recode,private_garden_recode,quality_recode
0,25.113208,55.138932,2,1,2700000,100.242337,1.0,0.0,2.0
1,25.106809,55.151201,2,2,2850000,146.972546,1.0,0.0,2.0
2,25.063302,55.137728,5,3,1150000,181.253753,1.0,0.0,2.0
3,25.227295,55.341761,3,2,2850000,187.66406,1.0,0.0,1.0
4,25.114275,55.139764,1,0,1729200,47.101821,0.0,0.0,2.0


## Algoritmos de validación

In [49]:
def simple_validation_random(D, k, response, random_seed, metric, model):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process

    N = len(D)

    D_train = D.sample(frac=k, replace=False, random_state=random_seed)

    D_test = D.drop( D_train.index , )

    X_train = D_train.loc[: , D_train.columns != response]
    Y_train = D_train.loc[: , response]

    X_test = D_test.loc[: , D_test.columns != response]
    Y_test = D_test.loc[: , response]

############################################################################

    # Training the model wit train sample

    model.fit(X_train, Y_train)

    # Making predictions with test sample

    Y_predict_test = model.predict( X_test ) 

####################################################################

 # Computing the test metric

    if metric == 'ECM' :  
        
        ECM_test = np.mean( (Y_predict_test - Y_test)**2 )

        return ECM_test

    elif metric == 'TA' :  
        
        TA_test = np.mean( (Y_predict_test == Y_test) )

        return TA_test

In [50]:
def repeated_random_simple_validation(D, k, B, response, random_seed, metric, model):

    # D --> have to be a pandas data frame.

    # k --> is the proportion of observation of D that define D_train.

    # B --> number of replications of the random simple validation algorithm

    # response --> have to be a string with the name of the response variable.

    # random_seed --> seed to replicate the random process


    np.random.seed(random_seed)

    ECM_test_list , TA_test_list = [ ] , [ ]

    seed_array = np.random.randint(9999999, size=(B))


    if metric == 'ECM':

        for b in range(0,B) :

            ECM_test_list.append( simple_validation_random(D, k, response, random_seed=seed_array[b], metric=metric, model=model) )


        ECM_test = np.mean(ECM_test_list)    

        return ECM_test 


    elif metric == 'TA':

        for b in range(0,B) :

            TA_test_list.append( simple_validation_random(D, k, response, random_seed=seed_array[b], metric=metric, model=model) )


        TA_test = np.mean(TA_test_list)    

        return TA_test 

In [76]:
def repeated_K_Fold_CV(D, B, K, response, random_seed, metric, model):


    ECM_Repeated_K_Folds_vector , TA_Repeated_K_Folds_vector = [] , []

    np.random.seed(random_seed)


    for b in range(0, B):

        sample = resample(range(0, len(D)), n_samples=len(D), replace=False)

        df_sample = pd.DataFrame({'index': range(0,len(D)) , 'sample':sample})

        
        Q = []

        
        for q in np.arange(0 , 1 + 1/K , 1/K):

            Q.append( np.quantile( range(0, len(D)) , q ).round(0) )



        ECM_K_FOLDS_vector , TA_K_FOLDS_vector = [] , []

        for j in range(0, len(Q)-1):

            X_test = D.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , D.columns != response ] 
            Y_test = D.loc[df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] , D.columns == response ]

            X_train = D.loc[ : , D.columns != response ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'] )
            Y_train = D.loc[ : ,  D.columns == response ].drop(df_sample.loc[Q[j]:(math.floor(Q[j+1])-1), 'sample'])      

            Y_test = Y_test.to_numpy()

        #######################################################################
            
            # Training the model wit train sample

            model.fit(X_train, Y_train)

            # Making predictions with test sample

            Y_predict_test = model.predict( X_test )   

        #######################################################################

            if metric == 'ECM' :  ECM_K_FOLDS_vector.append( np.mean( ( Y_predict_test - Y_test )**2 ) )

            elif metric == 'TAC' :  TA_K_FOLDS_vector.append( np.mean( ( Y_predict_test == Y_test ) ) )

    
    #######################################################################
    
        if metric == 'ECM' : ECM_Repeated_K_Folds_vector.append( np.mean(ECM_K_FOLDS_vector) )

        elif metric == 'TAC' : TA_Repeated_K_Folds_vector.append( np.mean(TA_K_FOLDS_vector) )

##########################################################################################################################

    if metric == 'ECM' :
        
        ECM_Repeated_K_Folds = np.mean(ECM_Repeated_K_Folds_vector)

        return  ECM_Repeated_K_Folds
        

    elif metric == 'TAC' :
        
        TA_Repeated_K_Folds = np.mean(TA_Repeated_K_Folds_vector)

        return  TA_Repeated_K_Folds
    
    

In [32]:
knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=10 ,  p=2, metric='minkowski')

In [35]:
repeated_K_Fold_CV(D=Data, response='price', K=10, B=100, random_seed=123, metric='ECM', model=knn_regression)

2252099370675.714

In [9]:
knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=5, p=2, metric='minkowski')

In [21]:
repeated_K_Fold_CV(D=Data, response='price', K=10, B=100, random_seed=123, metric='ECM', model=knn_regression)

2396045341013.2305

## Grid search 

In [77]:
def Grid_search(Data, Search_Space, response, model, validation, metric, B, k, K, random_seed_2, random_search, random_seed_1, random_samples):

   Grid_Search_Metric_list = []

   hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

   if random_search == True : 
      
      hyperparameter_combinations = resample(hyperparameter_combinations, n_samples=random_samples, replace=False, random_state=random_seed_1)
   
   else : 
      
      pass


   if model == 'knn_regression' :

      if validation == 'repeated_random_simple_validation' :

         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0] ,  metric=h[1]) 

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_random_simple_validation(Data, k, B, response, random_seed_2, metric, model=knn_regression) )

      #######################################################################   
          
      if validation == 'repeated_K_Fold_CV' :
            
         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0], metric=h[1]) 

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_K_Fold_CV(Data, B, K, response, random_seed_2, metric, model=knn_regression) )


####################################################################################

   if model == 'knn_classification' :

      if validation == 'repeated_random_simple_validation' :

         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_classification = sklearn.neighbors.KNeighborsClassifier(n_neighbors=h[0],  metric=h[1]) 

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_random_simple_validation(Data, k, B, response, random_seed_2, metric, model=knn_classification) )

            #######################################################################   
          
      if validation == 'repeated_K_Fold_CV' :
            
         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_classification = sklearn.neighbors.KNeighborsClassifier(n_neighbors=h[0] ,  metric=h[1]) 

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_K_Fold_CV(Data, B, K, response, random_seed_2, metric, model=knn_classification) )


####################################################################################

   df = pd.DataFrame({'(k , distance)': hyperparameter_combinations, metric: Grid_Search_Metric_list})

   if metric == 'ECM' :

      df = df.sort_values(by=metric, ascending=True)

   elif metric == 'TAC' :

      df = df.sort_values(by=metric, ascending=False)


   return df

In [68]:
Grid_search(Data=Data, Search_Space=[range(1,15) , ['euclidean','cosine','cityblock','manhattan']], response='price', model='knn_regression', validation='repeated_K_Fold_CV', metric='ECM', B=2, k='no', K=10, random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

Unnamed: 0,"(k , distance)",ECM
27,"(7, manhattan)",2219238000000.0
26,"(7, cityblock)",2219238000000.0
53,"(14, cosine)",2223631000000.0
32,"(9, euclidean)",2223832000000.0
24,"(7, euclidean)",2224818000000.0
31,"(8, manhattan)",2226414000000.0
30,"(8, cityblock)",2226414000000.0
49,"(13, cosine)",2227142000000.0
50,"(13, cityblock)",2228670000000.0
51,"(13, manhattan)",2228670000000.0


In [78]:
Grid_search(Data=Data, Search_Space=[range(1,15) , ['euclidean','cosine','cityblock','manhattan']], response='quality_recode', model='knn_classification', validation='repeated_K_Fold_CV', metric='TAC', B=2, k='no', K=10, random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

Unnamed: 0,"(k , distance)",TAC
52,"(14, euclidean)",0.553324
55,"(14, manhattan)",0.552817
54,"(14, cityblock)",0.552817
49,"(13, cosine)",0.551336
53,"(14, cosine)",0.550558
48,"(13, euclidean)",0.549644
51,"(13, manhattan)",0.549039
50,"(13, cityblock)",0.549039
44,"(12, euclidean)",0.545825
47,"(12, manhattan)",0.544734


In [34]:
def Grid_search(Data, Search_Space, response, model_name, validation, metric, B, k, K, random_seed_2, random_search, random_seed_1, random_samples):

   Grid_Search_Metric_list = []

   hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

   if random_search == True : 
      
      hyperparameter_combinations = resample(hyperparameter_combinations, n_samples=random_samples, replace=False, random_state=random_seed_1)
   
   else : 
      
      pass


   if model_name == 'knn_regression' :

      if validation == 'repeated_K_Fold_CV' :
            
         for h in hyperparameter_combinations :
            
            # Setting the hyperparameters of the model

            knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0], metric=h[1]) 

            # Applying a validation algorithm on the model  

            Grid_Search_Metric_list.append( repeated_K_Fold_CV(B=B, K=K, response=response, random_seed=random_seed_2, metric=metric, model=knn_regression, D=Data) )

  
   return Grid_Search_Metric_list

In [36]:
Grid_search(Data=Data, Search_Space=[range(2,15) , ['euclidean','cosine','cityblock','manhattan']], response='price', model_name='knn_regression', validation='repeated_K_Fold_CV', metric='ECM', B=2, k='no', K=10, random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

NameError: name 'knn_classification' is not defined

In [57]:
Grid_Search_Metric_list = []

In [8]:
Search_Space=[range(1,15) , ['euclidean','cosine','cityblock','manhattan']]

In [9]:
hyperparameter_combinations = list( itertools.product(Search_Space[0], Search_Space[1]) )

In [61]:
def prueba(Data) :
    
    for h in hyperparameter_combinations :
            
        # Setting the hyperparameters of the model

        knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0], metric=h[1]) 

            # Applying a validation algorithm on the model  

        Grid_Search_Metric_list.append( repeated_K_Fold_CV(D=Data, response='price', K=10, B=2, random_seed=123, metric='ECM', model=knn_regression) )

    return Grid_Search_Metric_list


In [62]:
prueba(Data)

[2305430965671.924,
 2477111208095.8047,
 2556057153211.9136,
 2556057153211.9136,
 2034801901283.067,
 1996661041687.1255,
 1876001314138.0654,
 1876001314138.0654,
 1831316193660.9766,
 1926514737507.2744,
 1728803785240.1074,
 1728803785240.1074,
 1809848633231.8015,
 1649060633688.2112,
 1766691234985.7373,
 1766691234985.7373,
 1873484422845.3413,
 1616191110850.0898,
 1668499562114.633,
 1668499562114.633,
 1700153886884.0046,
 1734012570473.8662,
 1662408332253.9336,
 1662408332253.9336,
 1603379799447.4023,
 1617275839420.4934,
 1603149315340.9504,
 1603149315340.9504,
 1607073848244.0537,
 1520222886017.0193,
 1581814852452.4316,
 1581814852452.4316,
 1626771874945.683,
 1550872817243.0571,
 1585170706627.1313,
 1585170706627.1313,
 1626177482082.9675,
 1561387533447.0625,
 1572859404216.3606,
 1572859404216.3606,
 1643991645767.6782,
 1610738123835.3137,
 1606575528917.1885,
 1606575528917.1885,
 1665579323002.7197,
 1608822106674.9797,
 1622211259832.1704,
 1622211259832.170

In [40]:
for h in hyperparameter_combinations :
            
        # Setting the hyperparameters of the model

        knn_regression = sklearn.neighbors.KNeighborsRegressor(n_neighbors=h[0], metric=h[1]) 

        # Applying a validation algorithm on the model  

        Grid_Search_Metric_list.append( repeated_K_Fold_CV(D=Data, response='price', K=10, B=2, random_seed=123, metric='ECM', model=knn_regression) )

In [39]:
Grid_Search_Metric_list

[KNeighborsRegressor(metric='euclidean', n_neighbors=1),
 KNeighborsRegressor(metric='cosine', n_neighbors=1),
 KNeighborsRegressor(metric='cityblock', n_neighbors=1),
 KNeighborsRegressor(metric='manhattan', n_neighbors=1),
 KNeighborsRegressor(metric='euclidean', n_neighbors=2),
 KNeighborsRegressor(metric='cosine', n_neighbors=2),
 KNeighborsRegressor(metric='cityblock', n_neighbors=2),
 KNeighborsRegressor(metric='manhattan', n_neighbors=2),
 KNeighborsRegressor(metric='euclidean', n_neighbors=3),
 KNeighborsRegressor(metric='cosine', n_neighbors=3),
 KNeighborsRegressor(metric='cityblock', n_neighbors=3),
 KNeighborsRegressor(metric='manhattan', n_neighbors=3),
 KNeighborsRegressor(metric='euclidean', n_neighbors=4),
 KNeighborsRegressor(metric='cosine', n_neighbors=4),
 KNeighborsRegressor(metric='cityblock', n_neighbors=4),
 KNeighborsRegressor(metric='manhattan', n_neighbors=4),
 KNeighborsRegressor(metric='euclidean'),
 KNeighborsRegressor(metric='cosine'),
 KNeighborsRegresso

In [37]:
Grid_search_new(Data)

NameError: name 'knn_classification' is not defined

In [44]:
Grid_Search_Metric_list

[3262253051198.6616,
 3754258031261.9775,
 3429051945970.085,
 3429051945970.085,
 2724139243468.454,
 2899908308714.6875,
 2725607742855.587,
 2725607742855.587,
 2469800163385.305,
 2669454689838.459,
 2462223492641.7207,
 2462223492641.7207,
 2427819840342.958,
 2463900452096.6694,
 2466536755253.9873,
 2466536755253.9873,
 2365101838778.67,
 2383875252669.0845,
 2267152397570.5186,
 2267152397570.5186,
 2267878303436.13,
 2315524445354.6055,
 2250955076563.5625,
 2250955076563.5625,
 2224818371131.412,
 2310911799872.673,
 2219238115331.2227,
 2219238115331.2227,
 2230854907763.1006,
 2236940076321.787,
 2226413860924.6816,
 2226413860924.6816,
 2223831902934.034,
 2234147796155.2744,
 2237509268473.962,
 2237509268473.962,
 2253176066130.581,
 2244860346640.6035,
 2260754350539.7544,
 2260754350539.7544,
 2244499852529.2715,
 2231224803610.675,
 2249842327775.0464,
 2249842327775.0464,
 2244458657647.374,
 2235335627960.6846,
 2241451251273.9883,
 2241451251273.9883,
 225387925028

In [11]:
df_Grid_Search = Grid_search(Data=Data, Search_Space=[range(1,200) , ['euclidean','cosine','cityblock','manhattan']], response='price', model='knn_regression', validation='repeated_random_simple_validation', metric='ECM', B=10, k=0.75, K='no', random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

Time: 2.22 mins

In [12]:
df_Grid_Search

Unnamed: 0,"(k , distance)",ECM
57,"(15, cosine)",2.064764e+12
61,"(16, cosine)",2.067621e+12
65,"(17, cosine)",2.079897e+12
53,"(14, cosine)",2.086479e+12
58,"(15, cityblock)",2.088501e+12
...,...,...
791,"(198, manhattan)",4.357455e+12
788,"(198, euclidean)",4.359118e+12
794,"(199, cityblock)",4.362609e+12
795,"(199, manhattan)",4.362609e+12


In [60]:
df_Grid_Search = Grid_search(Data=Data, Search_Space=[range(1,100) , ['euclidean','cosine','cityblock','manhattan']], response='price', model='knn_regression', validation='repeated_random_simple_validation', metric='ECM', B=10, k=0.75, K='no', random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

Time: 25.3 seg

In [61]:
df_Grid_Search

Unnamed: 0,"(k , distance)",ECM
57,"(15, cosine)",2.064764e+12
61,"(16, cosine)",2.067621e+12
65,"(17, cosine)",2.079897e+12
53,"(14, cosine)",2.086479e+12
58,"(15, cityblock)",2.088501e+12
...,...,...
395,"(99, manhattan)",3.470547e+12
392,"(99, euclidean)",3.471920e+12
1,"(1, cosine)",3.646170e+12
2,"(1, cityblock)",3.677001e+12


In [68]:
df_Grid_Search = Grid_search(Data=Data, Search_Space=[range(1,10) , ['euclidean','cosine','cityblock','manhattan']], response='price', model='knn_regression', validation='repeated_K_Fold_CV', metric='ECM', B=2, k='no', K=10, random_search=False, random_samples=150, random_seed_1=123, random_seed_2=123)

Time: 

In [69]:
df_Grid_Search

Unnamed: 0,"(k , distance)",ECM
0,"(1, euclidean)",4074215000000.0
20,"(6, euclidean)",4074215000000.0
21,"(6, cosine)",4074215000000.0
22,"(6, cityblock)",4074215000000.0
23,"(6, manhattan)",4074215000000.0
24,"(7, euclidean)",4074215000000.0
25,"(7, cosine)",4074215000000.0
19,"(5, manhattan)",4074215000000.0
26,"(7, cityblock)",4074215000000.0
28,"(8, euclidean)",4074215000000.0


In [36]:
df_Grid_Search = Grid_search(Data=Data, Search_Space=[range(1,200) , ['euclidean','cosine','cityblock','manhattan']], response='price', model='knn_regression', validation='repeated_K_Fold_CV', metric='ECM', B=2, k='no', K=10, random_search=True, random_samples=15, random_seed_1=123, random_seed_2=123)

In [37]:
df_Grid_Search

Unnamed: 0,"(k , distance)",ECM
0,"(69, cityblock)",2253176000000.0
1,"(34, cityblock)",2253176000000.0
2,"(184, manhattan)",2253176000000.0
3,"(129, euclidean)",2253176000000.0
4,"(151, euclidean)",2253176000000.0
5,"(138, cityblock)",2253176000000.0
6,"(26, euclidean)",2253176000000.0
7,"(45, cityblock)",2253176000000.0
8,"(59, euclidean)",2253176000000.0
9,"(128, euclidean)",2253176000000.0


# Ajuste de hiperparametros con `Sklearn`

## Grid search como metodo de optimizacion de funciones matematicas

In [11]:
values = []

for x in range(-99999 , 99999) :

    values.append( x**2 )


In [12]:
df = pd.DataFrame({'x':range(-99999 , 99999)  , 'f(x)':values})

In [13]:
df.sort_values(by='f(x)')

Unnamed: 0,x,f(x)
99999,0,0
99998,-1,1
100000,1,1
100001,2,4
99997,-2,4
...,...,...
2,-99997,9999400009
199996,99997,9999400009
199997,99998,9999600004
1,-99998,9999600004


In [14]:
values , Search_Space_list = [] , []

Search_Space_1 = range(-100 , 100)

Search_Space_2 = range(-100 , 100)

hyperparameter_combinations = list( itertools.product(Search_Space_1, Search_Space_2) )

for x in hyperparameter_combinations :

    Search_Space_list.append(x)

    values.append( x[0]**2 + x[1]**2)


In [15]:
df = pd.DataFrame({'(x_1,x_2)':Search_Space_list , 'f(x_1,x_2)':values})

In [16]:
df.sort_values(by='f(x_1,x_2)', ascending=True )

Unnamed: 0,"(x_1,x_2)","f(x_1,x_2)"
20100,"(0, 0)",0
20101,"(0, 1)",1
20300,"(1, 0)",1
20099,"(0, -1)",1
19900,"(-1, 0)",1
...,...,...
39800,"(99, -100)",19801
200,"(-99, -100)",19801
199,"(-100, 99)",19801
1,"(-100, -99)",19801
