# Entrenamiento de regresores

Objetivo: Para cada hexágono encontrar un valor perceptual.

Cada hexágono tiene:

*  N>=3, donde N es el número mínimo de imágenes
*  R=10, donde R es la resolución de cada hexágono. Con un área promedio por hexágono de 0.015km2 (15,047.5m2) y largo promedio de cada lado 0.076km
* Número de regiones con estas características: 24220



### Libraries

In [None]:
# Mount Google Drive (Colab only); every data path used below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install srai[all]
!pip install contextily
!pip install alphashape

Collecting srai[all]
  Downloading srai-0.7.5-py3-none-any.whl.metadata (20 kB)
Collecting h3>=4.0.0b1 (from srai[all])
  Downloading h3-4.0.0b5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.1 kB)
Collecting geoparquet (from srai[all])
  Downloading geoparquet-0.0.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyfunctional (from srai[all])
  Downloading pyfunctional-1.5.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rtree (from srai[all])
  Downloading Rtree-1.3.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Collecting topojson (from srai[all])
  Downloading topojson-1.9-py3-none-any.whl.metadata (3.8 kB)
Collecting s2 (from srai[all])
  Downloading s2-0.1.9-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting h3ronpy>=0.18.0 (from srai[all])
  Downloading h3ronpy-0.21.0-cp38-abi3-manylinux_2_17_x86_64.manylinu

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point
import zipfile
import srai
import os
from PIL import Image
import glob
import contextily as ctx
import alphashape


from sklearn.metrics import mean_squared_error, median_absolute_error,r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import scipy.stats
import pickle
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.neural_network import MLPRegressor


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#To create embeddings
from srai.loaders import OSMOnlineLoader, OSMWayLoader, OSMPbfLoader
from srai.regionalizers import geocode_to_region_gdf, S2Regionalizer
from srai.plotting import plot_regions, plot_numeric_data
from srai.embedders import CountEmbedder, ContextualCountEmbedder,Hex2VecEmbedder, Highway2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER
from srai.neighbourhoods.h3_neighbourhood import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Create dataset and training/testing sets

### Df embeddings

In [None]:
# Load the precomputed embeddings: one row per H3 hexagon (region_id) plus 100 embedding
# columns named 0..99 (referred to later in this notebook as the hex2vec OSM columns).
df_embeddings = pd.read_csv('/content/drive/MyDrive/UC-TESIS/data/embeddings/h3_embeddings_150_75_100.csv')
df_embeddings

Unnamed: 0,region_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,8ab2c54614b7fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
1,8ab2c5409c37fff,-0.356221,0.336155,0.056644,-0.032715,0.185949,-0.196175,-0.233436,0.125683,0.293554,...,-0.012782,0.278413,0.040773,0.042589,-0.320704,-0.143026,-0.161286,-0.083865,-0.558367,-0.020263
2,8ab2c5735c27fff,0.511298,0.475341,0.060442,0.396583,-0.291473,0.012991,-0.526473,0.214151,0.395182,...,0.071864,-0.055179,0.182469,0.402972,-0.274875,-0.136615,-0.433164,-0.104150,0.073335,0.634614
3,8ab2c5470d37fff,-0.154229,-0.212532,-0.277868,0.359974,-0.432614,-0.131339,-0.035649,-0.234396,0.266802,...,0.030813,-0.076295,0.212762,-0.210441,-0.170993,0.012732,-0.051932,0.190558,0.100510,-0.181921
4,8ab2c51982d7fff,0.117171,-0.137542,0.248391,0.000995,-0.488907,-0.224059,0.048757,-0.305915,0.187887,...,-0.703700,-0.091194,0.196549,-0.125396,-0.249640,-0.063369,0.138847,0.637275,-0.073419,0.112756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24215,8ab2c5440607fff,0.129073,-0.313071,-0.266625,0.206505,0.424100,-0.071679,-0.106600,-0.243351,-0.110525,...,0.437696,0.314795,0.128654,-0.177794,0.288626,-0.032735,0.041986,0.082394,0.089014,-0.192874
24216,8ab2c5735ba7fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
24217,8ab2c519132ffff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
24218,8ab2c5573517fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010


In [None]:
# Sanity check: number of distinct hexagons that have an embedding.
df_embeddings['region_id'].nunique()

24220

In [None]:
# Use region_id as the index so only the embedding columns remain as data columns.
# Note: this cell is not idempotent; running it a second time raises KeyError because
# region_id is no longer a column after the first run.
df_embeddings = df_embeddings.set_index('region_id')
df_embeddings

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8ab2c54614b7fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,0.071382,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c5409c37fff,-0.356221,0.336155,0.056644,-0.032715,0.185949,-0.196175,-0.233436,0.125683,0.293554,0.491351,...,-0.012782,0.278413,0.040773,0.042589,-0.320704,-0.143026,-0.161286,-0.083865,-0.558367,-0.020263
8ab2c5735c27fff,0.511298,0.475341,0.060442,0.396583,-0.291473,0.012991,-0.526473,0.214151,0.395182,0.211956,...,0.071864,-0.055179,0.182469,0.402972,-0.274875,-0.136615,-0.433164,-0.104150,0.073335,0.634614
8ab2c5470d37fff,-0.154229,-0.212532,-0.277868,0.359974,-0.432614,-0.131339,-0.035649,-0.234396,0.266802,0.241064,...,0.030813,-0.076295,0.212762,-0.210441,-0.170993,0.012732,-0.051932,0.190558,0.100510,-0.181921
8ab2c51982d7fff,0.117171,-0.137542,0.248391,0.000995,-0.488907,-0.224059,0.048757,-0.305915,0.187887,-0.228582,...,-0.703700,-0.091194,0.196549,-0.125396,-0.249640,-0.063369,0.138847,0.637275,-0.073419,0.112756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8ab2c5440607fff,0.129073,-0.313071,-0.266625,0.206505,0.424100,-0.071679,-0.106600,-0.243351,-0.110525,0.429839,...,0.437696,0.314795,0.128654,-0.177794,0.288626,-0.032735,0.041986,0.082394,0.089014,-0.192874
8ab2c5735ba7fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,0.071382,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c519132ffff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,0.071382,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c5573517fff,-0.023930,0.040677,-0.017497,-0.275375,0.103468,0.078189,-0.061404,0.082823,0.071790,0.071382,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010


### Df variables perceptuales

In [None]:
# Load the perceptual scores: one row per location (lat, lon) with six scores
# (beautiful, boring, depressing, lively, safe, wealthy).
df = pd.read_csv('/content/drive/MyDrive/UC-TESIS/data/vars_perceptuales_santiago.csv')
df

Unnamed: 0,latlong,beautiful,boring,depressing,lively,safe,wealthy,lat,lon
0,"-33.323944,-70.51263428391168",-0.306948,1.565049,0.572029,-1.137733,-0.120456,-0.561887,-33.323944,-70.512634
1,"-33.323944,-70.5127291",-0.421388,0.309495,0.368965,-0.098733,-0.103042,-0.162294,-33.323944,-70.512729
2,"-33.323944,-70.51298714285714",0.116505,0.164284,-0.110312,0.063860,0.391172,0.226372,-33.323944,-70.512987
3,"-33.323944,-70.51343609999999",-0.159113,-0.500987,-0.213503,0.635165,0.300856,0.453708,-33.323944,-70.513436
4,"-33.323944,-70.51379769565217",-1.226162,1.176751,1.462015,-0.842954,-0.946355,-0.936168,-33.323944,-70.513798
...,...,...,...,...,...,...,...,...,...
121346,"-33.67884090851735,-70.68059514195582",-1.318599,2.201657,1.673377,-2.265229,-1.677807,-2.014101,-33.678841,-70.680595
121347,"-33.67884090851735,-70.69912023659306",0.827014,1.129426,-0.431715,-1.053424,0.438107,0.078089,-33.678841,-70.699120
121348,"-33.67884090851735,-70.70653027444796",0.975725,0.838188,-0.583064,-0.515637,0.957951,0.547197,-33.678841,-70.706530
121349,"-33.67884090851735,-70.7176453312303",-0.629273,0.780489,0.881986,-1.363918,-1.328026,-1.726176,-33.678841,-70.717645


In [None]:
# Turn every (lon, lat) pair into a point geometry and declare the CRS as EPSG:4326.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['lon'], df['lat']),
).set_crs(epsg=4326)

# Keep the perceptual scores and the geometry; the raw coordinate columns are now redundant.
# drop() already returns a new object, so gdf itself is left untouched.
gdf_rm = gdf.drop(columns=['lat', 'lon', 'latlong'])
gdf_rm


Unnamed: 0,beautiful,boring,depressing,lively,safe,wealthy,geometry
0,-0.306948,1.565049,0.572029,-1.137733,-0.120456,-0.561887,POINT (-70.51263 -33.32394)
1,-0.421388,0.309495,0.368965,-0.098733,-0.103042,-0.162294,POINT (-70.51273 -33.32394)
2,0.116505,0.164284,-0.110312,0.063860,0.391172,0.226372,POINT (-70.51299 -33.32394)
3,-0.159113,-0.500987,-0.213503,0.635165,0.300856,0.453708,POINT (-70.51344 -33.32394)
4,-1.226162,1.176751,1.462015,-0.842954,-0.946355,-0.936168,POINT (-70.51380 -33.32394)
...,...,...,...,...,...,...,...
121346,-1.318599,2.201657,1.673377,-2.265229,-1.677807,-2.014101,POINT (-70.68060 -33.67884)
121347,0.827014,1.129426,-0.431715,-1.053424,0.438107,0.078089,POINT (-70.69912 -33.67884)
121348,0.975725,0.838188,-0.583064,-0.515637,0.957951,0.547197,POINT (-70.70653 -33.67884)
121349,-0.629273,0.780489,0.881986,-1.363918,-1.328026,-1.726176,POINT (-70.71765 -33.67884)


Create region_id column

In [None]:
# Collect the point coordinates as an (n_points, 2) float array of (x, y) = (lon, lat).
# (The previous list(...) assignment was dead code: it was overwritten on the next line.)
points = np.column_stack((gdf_rm.geometry.x, gdf_rm.geometry.y))

# Compute the alpha shape (concave hull) that encloses all points.
alpha = 100
alpha_shape = alphashape.alphashape(points, alpha)

# Wrap the alpha shape in a single-row GeoDataFrame, tagged with a region_id label
# and declared as EPSG:4326, to use it as the study area.
gdf_alpha_shape = gpd.GeoDataFrame(geometry=[alpha_shape])
gdf_alpha_shape['region_id'] = "Santiago Metropolitan Region, Chile"
gdf_alpha_shape = gdf_alpha_shape.set_crs(epsg=4326)
study_area2 = gdf_alpha_shape
study_area2

Unnamed: 0,geometry,region_id
0,"MULTIPOLYGON (((-70.76705 -33.67659, -70.76581...","Santiago Metropolitan Region, Chile"


In [None]:
# Tessellate the study area into H3 hexagons at resolution 10.
# NOTE(review): buffer=True presumably also keeps hexagons that only partly overlap the
# study-area border; confirm against the srai H3Regionalizer documentation.
regionalizer = H3Regionalizer(resolution=10, buffer=True)
regions_gdf_rm_10 = regionalizer.transform(study_area2)
# Move region_id from the index into a regular column so it is carried through the spatial join.
regions_gdf_rm_10_no_index= regions_gdf_rm_10.reset_index()
regions_gdf_rm_10_no_index

Unnamed: 0,region_id,geometry
0,8ab2c546d727fff,"POLYGON ((-70.60364 -33.54995, -70.60427 -33.5..."
1,8ab2c55046cffff,"POLYGON ((-70.82357 -33.35232, -70.82421 -33.3..."
2,8ab2c5181d8ffff,"POLYGON ((-70.50459 -33.35980, -70.50522 -33.3..."
3,8ab2c5571537fff,"POLYGON ((-70.70536 -33.39716, -70.70599 -33.3..."
4,8ab2c5706cd7fff,"POLYGON ((-70.57196 -33.62285, -70.57259 -33.6..."
...,...,...
87105,8ab2c550ea17fff,"POLYGON ((-70.79517 -33.38967, -70.79580 -33.3..."
87106,8ab2c547072ffff,"POLYGON ((-70.73160 -33.53311, -70.73223 -33.5..."
87107,8ab2c519c9a7fff,"POLYGON ((-70.54077 -33.37709, -70.54140 -33.3..."
87108,8ab2c57a4217fff,"POLYGON ((-70.67016 -33.64302, -70.67079 -33.6..."


Add the region_id column by spatially joining each point with the H3 regions produced by the regionalizer.



In [None]:
# Tag every point with the H3 hexagon it intersects. The inner join keeps only the rows
# where both GeoDataFrames intersect; the helper column index_right added by sjoin is discarded.
df_perceptual = gpd.sjoin(
    gdf_rm,
    regions_gdf_rm_10_no_index,
    how="inner",
    predicate="intersects",
).drop(columns=['index_right'])
df_perceptual

Unnamed: 0,beautiful,boring,depressing,lively,safe,wealthy,geometry,region_id
0,-0.306948,1.565049,0.572029,-1.137733,-0.120456,-0.561887,POINT (-70.51263 -33.32394),8ab2c51a2297fff
1,-0.421388,0.309495,0.368965,-0.098733,-0.103042,-0.162294,POINT (-70.51273 -33.32394),8ab2c51a2297fff
2,0.116505,0.164284,-0.110312,0.063860,0.391172,0.226372,POINT (-70.51299 -33.32394),8ab2c51a274ffff
3,-0.159113,-0.500987,-0.213503,0.635165,0.300856,0.453708,POINT (-70.51344 -33.32394),8ab2c51a274ffff
4,-1.226162,1.176751,1.462015,-0.842954,-0.946355,-0.936168,POINT (-70.51380 -33.32394),8ab2c51a274ffff
...,...,...,...,...,...,...,...,...
121346,-1.318599,2.201657,1.673377,-2.265229,-1.677807,-2.014101,POINT (-70.68060 -33.67884),8ab2c57a30affff
121347,0.827014,1.129426,-0.431715,-1.053424,0.438107,0.078089,POINT (-70.69912 -33.67884),8ab2c57849affff
121348,0.975725,0.838188,-0.583064,-0.515637,0.957951,0.547197,POINT (-70.70653 -33.67884),8ab2c5784d37fff
121349,-0.629273,0.780489,0.881986,-1.363918,-1.328026,-1.726176,POINT (-70.71765 -33.67884),8ab2c57b1067fff


### Create final dataset antes de split

* ***Dataset full*** con regiones múltiples y con más de 1 feature por hexágono

In [None]:
# Join the perceptual points with the embeddings on region_id. pd.merge defaults to an
# inner join, so only hexagons present in both frames are kept.
dataset_full = pd.merge(df_perceptual, df_embeddings, on='region_id')
# Index by region_id and drop the geometry: from here on only numeric columns remain
# (six perceptual scores followed by the 100 embedding columns).
dataset_full = dataset_full.set_index('region_id').drop(columns=['geometry'])
dataset_full

Unnamed: 0_level_0,beautiful,boring,depressing,lively,safe,wealthy,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8ab2c51a274ffff,0.116505,0.164284,-0.110312,0.063860,0.391172,0.226372,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c51a274ffff,-0.159113,-0.500987,-0.213503,0.635165,0.300856,0.453708,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c51a274ffff,-1.226162,1.176751,1.462015,-0.842954,-0.946355,-0.936168,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c51a274ffff,-0.308952,0.491572,0.351956,-0.205486,0.098004,-0.171525,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c51a274ffff,-0.255187,0.108916,0.208224,0.178949,0.217683,0.072019,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8ab2c54dd80ffff,0.077252,0.637953,0.144422,-0.589375,-0.117685,-0.098696,-0.300282,-0.230503,0.195256,-0.364696,...,0.160869,-0.016424,-0.036420,0.376509,-0.010768,-0.141948,-0.526136,0.079622,0.009509,0.082694
8ab2c54dd80ffff,0.688408,0.694112,-0.407948,-0.622452,0.357219,0.302788,-0.300282,-0.230503,0.195256,-0.364696,...,0.160869,-0.016424,-0.036420,0.376509,-0.010768,-0.141948,-0.526136,0.079622,0.009509,0.082694
8ab2c57a268ffff,-0.065934,2.345176,0.678399,-2.201416,-0.623765,-0.908452,-0.095155,0.081839,0.058075,0.198763,...,-0.160243,-0.125825,-0.070056,0.252882,0.131426,-0.226724,-0.199880,0.119445,-0.031587,0.105807
8ab2c57a268ffff,0.411270,1.080804,0.115137,-1.929492,-1.131603,-1.081190,-0.095155,0.081839,0.058075,0.198763,...,-0.160243,-0.125825,-0.070056,0.252882,0.131426,-0.226724,-0.199880,0.119445,-0.031587,0.105807


regiones únicas = 24220 --> dataset.region_id.nunique()

**Problema**: Para cada región (hexágono) tenemos distintas características (beautiful,	boring,	depressing,	lively,	safe	y wealthy) dentro de cada región. ¿Cuál es la mejor manera de ponderar esas características y representar mejor cada región? -->  media aritmética, la geométrica, el máximo, moda, mediana, otro?

In [None]:
# The index holds repeated region_id values: one row per perceptual point inside each hexagon.
dataset_full.index

Index(['8ab2c51a274ffff', '8ab2c51a274ffff', '8ab2c51a274ffff',
       '8ab2c51a274ffff', '8ab2c51a274ffff', '8ab2c51a274ffff',
       '8ab2c51a275ffff', '8ab2c51a275ffff', '8ab2c51a275ffff',
       '8ab2c51a278ffff',
       ...
       '8ab2c57b1d6ffff', '8ab2c57a14d7fff', '8ab2c57a14d7fff',
       '8ab2c57a14d7fff', '8ab2c54dd80ffff', '8ab2c54dd80ffff',
       '8ab2c54dd80ffff', '8ab2c57a268ffff', '8ab2c57a268ffff',
       '8ab2c57a268ffff'],
      dtype='object', name='region_id', length=83728)

In [None]:
# Example hexagon: several rows with different perceptual scores but the same embedding.
dataset_full.loc['8ab2c5012c4ffff']

Unnamed: 0_level_0,beautiful,boring,depressing,lively,safe,wealthy,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8ab2c5012c4ffff,2.422687,-1.120363,-2.311725,1.064008,1.668255,1.819982,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.21377,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016
8ab2c5012c4ffff,1.339041,-2.964601,-1.85263,2.74375,1.533512,1.962026,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.21377,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016
8ab2c5012c4ffff,1.383741,-3.306867,-1.94752,3.033949,1.619152,2.060544,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.21377,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016
8ab2c5012c4ffff,0.975627,0.692605,-0.647154,-0.738592,0.483502,0.307371,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.21377,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016


Media aritmética

In [None]:
# Aggregate to one row per hexagon using the arithmetic mean of every column.
# The embedding columns are joined per region_id, so they should be constant within a
# hexagon and their mean is simply that embedding.
df_means = dataset_full.groupby('region_id').mean()
df_means

Unnamed: 0_level_0,beautiful,boring,depressing,lively,safe,wealthy,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8ab2c5012c4ffff,1.530274,-1.674806,-1.689757,1.525779,1.326105,1.537481,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.213770,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016
8ab2c5080057fff,1.797793,-0.473614,-1.603372,0.481697,1.303100,1.463700,-0.009823,-0.065363,0.021479,-0.529560,...,0.026409,-0.246902,0.014540,-0.198926,-0.076141,-0.011190,0.102937,0.077663,0.060189,-0.084835
8ab2c5080087fff,0.617557,1.008862,-0.127893,-1.063960,-0.006197,-0.005126,-0.049334,0.016889,-0.024865,-0.654026,...,-0.141553,-0.123275,0.003810,-0.035499,-0.403222,0.020489,-0.083732,0.241804,0.027533,-0.298355
8ab2c508009ffff,0.344537,0.689234,-0.113139,-0.690042,-0.006416,-0.023423,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c50800f7fff,0.658131,-0.016540,-0.538743,-0.102794,0.375745,0.415797,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8ab2c5cdeca7fff,-0.422076,1.432125,0.556406,-0.505823,0.275188,-0.190039,-0.195331,-0.210522,-0.279355,0.339834,...,-0.006257,-0.070476,0.206203,-0.153168,-0.100565,-0.045583,-0.066746,0.180717,0.064394,-0.221057
8ab2c5cdecaffff,-0.301907,0.642069,0.367796,-0.402620,-0.067361,-0.244828,-0.152469,-0.207537,-0.275766,0.355418,...,0.027344,-0.072804,0.207733,-0.203336,-0.168068,0.012886,-0.051808,0.189225,0.097021,-0.178744
8ab2c5cdecd7fff,-0.047693,0.280894,0.017444,-0.186770,-0.057487,0.011591,-0.154229,-0.212532,-0.277868,0.359974,...,0.030813,-0.076295,0.212762,-0.210441,-0.170993,0.012732,-0.051932,0.190558,0.100510,-0.181921
8ab2c5cdecdffff,-0.932705,1.460426,1.053305,-1.076747,-0.633851,-0.852946,-0.152469,-0.207537,-0.275766,0.355418,...,0.027344,-0.072804,0.207733,-0.203336,-0.168068,0.012886,-0.051808,0.189225,0.097021,-0.178744


In [29]:
# df_means is indexed by region_id. Keep the index in the file; with index=False the
# hexagon ids were dropped and the saved rows could no longer be tied to a hexagon.
# Read it back with pd.read_csv(path, index_col='region_id').
df_means.to_csv('/content/drive/MyDrive/UC-TESIS/data/df_means.csv', index=True)

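Media geométrica --> problema: genera valores NaN. La media geométrica solo está definida para valores positivos, y tanto las variables perceptuales como los embeddings contienen valores negativos, por lo que no es aplicable a estos datos.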

In [None]:
# Geometric mean per hexagon, computed column by column (axis=0); each group yields one array.
# gmean takes the log of its input (see the np.log warning in the output) and the data
# contains negative values, so many entries come out as NaN.
df_geometric_means = dataset_full.groupby('region_id').apply(lambda x: scipy.stats.gmean(x, axis=0))
df_geometric_means

  log_a = np.log(a)


Unnamed: 0_level_0,0
region_id,Unnamed: 1_level_1
8ab2c5012c4ffff,"[1.4466298704361675, nan, nan, nan, 1.18962190..."
8ab2c5080057fff,"[1.7164934053267011, nan, nan, nan, 1.28358348..."
8ab2c5080087fff,"[0.5735789661957594, 0.9077705082996049, nan, ..."
8ab2c508009ffff,"[nan, 0.4741728846110169, nan, nan, nan, nan, ..."
8ab2c50800f7fff,"[nan, nan, nan, nan, nan, nan, nan, 0.04067694..."
...,...
8ab2c5cdeca7fff,"[nan, 1.3652880381323635, 0.542970119811581, n..."
8ab2c5cdecaffff,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
8ab2c5cdecd7fff,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
8ab2c5cdecdffff,"[nan, 1.4199495536775772, 1.0392234245913927, ..."


Mediana

In [None]:
# Aggregate to one row per hexagon using the median of every column.
df_medians = dataset_full.groupby('region_id').median()
df_medians

Unnamed: 0_level_0,beautiful,boring,depressing,lively,safe,wealthy,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8ab2c5012c4ffff,1.361391,-2.042482,-1.900075,1.903879,1.576332,1.891004,0.061735,0.028381,-0.014441,0.140128,...,0.050715,-0.271304,-0.213770,0.192833,0.213708,0.039599,-0.188866,0.170294,0.169785,0.159016
8ab2c5080057fff,1.604606,-0.781635,-1.509878,0.690671,1.329464,1.497841,-0.009823,-0.065363,0.021479,-0.529560,...,0.026409,-0.246902,0.014540,-0.198926,-0.076141,-0.011190,0.102937,0.077663,0.060189,-0.084835
8ab2c5080087fff,0.534498,1.203662,-0.124734,-0.988998,-0.178066,-0.009102,-0.049334,0.016889,-0.024865,-0.654026,...,-0.141553,-0.123275,0.003810,-0.035499,-0.403222,0.020489,-0.083732,0.241804,0.027533,-0.298355
8ab2c508009ffff,0.523684,0.534084,-0.243849,-0.727690,-0.063302,-0.097517,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
8ab2c50800f7fff,0.338217,-0.086866,-0.254180,-0.074327,0.160953,0.149841,-0.023930,0.040677,-0.017497,-0.275375,...,0.180155,-0.046677,0.126445,-0.061428,-0.006454,-0.030310,0.056697,-0.153097,0.123646,-0.004010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8ab2c5cdeca7fff,-0.426769,1.631752,0.593024,-0.516723,0.275822,-0.118063,-0.195331,-0.210522,-0.279355,0.339834,...,-0.006257,-0.070476,0.206203,-0.153168,-0.100565,-0.045583,-0.066746,0.180717,0.064394,-0.221057
8ab2c5cdecaffff,-0.180813,0.702974,0.187430,-0.467125,-0.044658,-0.363213,-0.152469,-0.207537,-0.275766,0.355418,...,0.027344,-0.072804,0.207733,-0.203336,-0.168068,0.012886,-0.051808,0.189225,0.097021,-0.178744
8ab2c5cdecd7fff,-0.537953,0.905214,0.774207,-0.618593,-0.771503,-0.753049,-0.154229,-0.212532,-0.277868,0.359974,...,0.030813,-0.076295,0.212762,-0.210441,-0.170993,0.012732,-0.051932,0.190558,0.100510,-0.181921
8ab2c5cdecdffff,-0.974256,1.550554,1.053681,-1.071634,-0.626345,-0.882865,-0.152469,-0.207537,-0.275766,0.355418,...,0.027344,-0.072804,0.207733,-0.203336,-0.168068,0.012886,-0.051808,0.189225,0.097021,-0.178744


In [30]:
# df_medians is indexed by region_id. Keep the index in the file; with index=False the
# hexagon ids were dropped and the saved rows could no longer be tied to a hexagon.
# Read it back with pd.read_csv(path, index_col='region_id').
df_medians.to_csv('/content/drive/MyDrive/UC-TESIS/data/df_medians.csv', index=True)

## Models



### Split

80% for training

20% for testing

*  Matriz X: embeddings OSM hex2vec
*  Vector objetivo y: variables perceptuales de las imágenes. Probar vector objetivo por separado: beautiful y boring
```
# hex2vec columns OSM
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
# Perceptual columns
y = df_means[['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy']]

```



## 1.Random forest


In [None]:
#X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.2, random_state=25)
#y_pred_1 = rf_model_df_means_1.predict(X_test_1)

Probar vector objetivo por separado
  - beautiful
  - boring

10 variaciones del modelo, cada una entrenada y evaluada en diferentes subconjuntos aleatorios

**cross_val_score**

'neg_mean_squared_error':  especifica que la métrica  para evaluar el modelo en cada fold es el MSE negativo. La razón? cross_val_score espera una función de puntuación donde las puntuaciones más altas son mejores. Como el MSE es una medida de error (valores más bajos son mejores), se niega para alinearlo con la convención.

El signo menos (-) del cross_val_score invierte los valores para obtener el MSE positivo. cv_scores_mse contendrá una matriz de valores MSE para cada fold.

versión rápida

In [None]:
# Dataset
# hex2vec columns OSM
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['beautiful']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

#model.fit(X_train, y_train.values.ravel())
#model.score(X_test, y_test)


0.06384447039093544

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a percentage.

    Note: this definition shadows the function of the same name imported from
    sklearn.metrics at the top of the notebook; unlike that one, the result
    here is multiplied by 100.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Flatten both inputs so a column vector (n, 1) against a flat (n,) prediction
    # is compared element-wise instead of silently broadcasting to (n, n).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # Guard against division by zero: targets equal to 0 are replaced by machine
    # epsilon in the denominator, so the result stays finite instead of inf/NaN.
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# Wrap MAPE as a scikit-learn scorer. With greater_is_better=False the scorer negates the
# metric, so the MAPE values returned by cross_validate are negative.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

### 1.1 Random forest con df media, y= beautiful


In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['beautiful']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


comenzó?
Training MSE: 0.2990755039418754 +/- 0.0015920921575398133
Training R2: 0.477019513335118 +/- 0.002867505622326969
Training MAPE: -136.4722317010928 +/- 7.598235807914923
--------------------
Testing MSE: 0.5383165136282135 +/- 0.0031010682261656693
Testing R2: 0.05854525266550774 +/- 0.005942037954608193
Testing MAPE: -222.94139053122944 +/- 36.490511369565446
CPU times: user 10min 47s, sys: 527 ms, total: 10min 48s
Wall time: 10min 53s


media beautiful, matriz x todo menos vector objetivo




In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_means.drop(['beautiful'], axis=1)
y = df_means[['beautiful']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.0006461406486378944
Training R2: 0.9988701166054783
Training MAPE: -19.80243618573134
--------------------
Testing MSE: 0.004582040811147992
Testing R2: 0.9919862309687117
Testing MAPE: -53.445674918233046
CPU times: user 7min 32s, sys: 330 ms, total: 7min 32s
Wall time: 7min 30s


### 1.2 Random forest con df media, y= boring

In [None]:
%%time

# Dataset
# hex2vec columns OSM
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['boring']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.22749716527303826
Training R2: 0.5995714113218684
Training MAPE: -168.9882096975019
--------------------
Testing MSE: 0.44503807207197943
Testing R2: 0.21614548591504973
Testing MAPE: -274.608294140321
CPU times: user 8min 11s, sys: 360 ms, total: 8min 11s
Wall time: 8min 9s


media boring, matriz x todo menos vector objetivo


In [None]:
%%time

# hex2vec columns OSM
X = df_means.drop(['boring'], axis=1)
y = df_means[['boring']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.0017129007928285904
Training R2: 0.9969849595100347
Training MAPE: -31.764050107570245
--------------------
Testing MSE: 0.01198772059847137
Testing R2: 0.9788861267159821
Testing MAPE: -81.81927489303503
CPU times: user 7min 40s, sys: 347 ms, total: 7min 40s
Wall time: 7min 38s


### 1.3 Random forest con df mediana, y=beautiful

In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['beautiful']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.3350880487219855
Training R2: 0.47693380538351315
Training MAPE: -154.85022124423978
--------------------
Testing MSE: 0.6094812959010738
Testing R2: 0.04846288436441706
Testing MAPE: -221.40668141788927
CPU times: user 7min 40s, sys: 336 ms, total: 7min 40s
Wall time: 7min 38s


mediana beautiful, matriz x todo menos vector objetivo


In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_medians.drop(['beautiful'], axis=1)
y = df_medians[['beautiful']]  #y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.0028303437078941035
Training R2: 0.9955818963946633
Training MAPE: -41.250841896543065
--------------------
Testing MSE: 0.02013260221135966
Testing R2: 0.9685652584864073
Testing MAPE: -112.94245068107068
CPU times: user 7min 57s, sys: 368 ms, total: 7min 57s
Wall time: 7min 55s


### 1.4 Random forest con df mediana, y=boring

In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = -cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = -cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.27004648874564585
Training R2: 0.5872991278545594
Training MAPE: 195.4184330499715
--------------------
Testing MSE: 0.5310282134574151
Testing R2: 0.1880893514830976
Testing MAPE: 325.4297824875017
CPU times: user 8min 15s, sys: 354 ms, total: 8min 16s
Wall time: 8min 14s


mediana boring, matriz x todo menos vector objetivo


In [None]:
%%time
# Dataset
# hex2vec columns OSM
X = df_medians.drop(['boring'], axis=1)
y = df_medians[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
model = RandomForestRegressor(n_estimators=100, random_state=25)

# Define the scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'mape': mape_scorer
}

# Perform cross-validation
cv_results = cross_validate(model, X_train, y_train.values.ravel(), cv=5, scoring=scoring, return_train_score=True)

# Access training scores
train_mse = cv_results['train_mse']
train_r2 = cv_results['train_r2']
train_mape = -cv_results['train_mape']

# Access testing scores
test_mse = cv_results['test_mse']
test_r2 = cv_results['test_r2']
test_mape = -cv_results['test_mape']

# Dado que greater_is_better=False, los scores son negativos, así que multiplicamos por -1
#cv_results = -cv_results

# Print the results
print("Training MSE:", train_mse.mean())
print("Training R2:", train_r2.mean())
print("Training MAPE:", train_mape.mean())
print("--------------------")
print("Testing MSE:", test_mse.mean())
print("Testing R2:", test_r2.mean())
print("Testing MAPE:", test_mape.mean())


Training MSE: 0.007808303651528628
Training R2: 0.9880670310154482
Training MAPE: 64.58186172513442
--------------------
Testing MSE: 0.05595059779769775
Testing R2: 0.9144157383334257
Testing MAPE: 166.1330836945861
CPU times: user 8min 3s, sys: 258 ms, total: 8min 4s
Wall time: 8min 2s


## Métricas

* **R²**  mide qué tan bien el modelo se ajusta a los datos observados, indica la
proporción de la variabilidad en la variable dependiente que es explicada por el modelo.
  - R² de 1 indica un ajuste perfecto, donde el modelo explica toda la variabilidad de los datos.
  - R² de 0 indica que el modelo no explica ninguna variabilidad y es tan bueno como simplemente predecir la media de la variable dependiente.
  - R² negativos indican que el modelo se ajusta peor a los datos que una línea horizontal (la media).

* **MAPE** es el promedio de los errores porcentuales absolutos entre las predicciones y los valores reales. Un MAPE más bajo indica un mejor ajuste del modelo a los datos. Ej.: un MAPE del 10% significa que, en promedio, las predicciones del modelo se desvían un 10% de los valores reales. Cuando los valores reales son cercanos a 0, el MAPE puede superar ampliamente el 100%.

`MAPE = (1/n) * Σ(|(Valor Real - Valor Predicho)| / |Valor Real|) * 100%`

    n es el número de observaciones.

    Σ indica la suma de los errores porcentuales absolutos.

* **MSE** mide la precisión de las predicciones especialmente cuando se desea penalizar más los errores grandes. Se calcula como el promedio de los cuadrados de las diferencias entre las predicciones y los valores reales.

`MSE = (1/n) * Σ(Valor Real - Valor Predicho)²`

    n es el número de observaciones.

    Σ indica la suma de los cuadrados de las diferencias





---



---



## 2.SVR

In [35]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV

In [27]:
# Baseline SVR setup: mean scores, target = 'beautiful', features = df_means
# without the six perception columns; 80/20 hold-out split.
X = df_means.drop(columns=['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'])
y = df_means[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=25
)

### 2.1 SVR con df media, y= beautiful


SVR Mean - beautiful

In [36]:
%%time
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

Grid search




CPU times: user 4min 46s, sys: 176 ms, total: 4min 46s
Wall time: 4min 45s


In [38]:
# Estimator refitted by the search with the selected hyper-parameters.
best_model_scaled = random_search.best_estimator_

print("making predictions")
y_train_pred_scaled, y_test_pred_scaled = (
    best_model_scaled.predict(split) for split in (X_train_scaled, X_test_scaled)
)

print("Calculating metrics")
# NOTE: sklearn's mean_absolute_percentage_error returns a fraction
# (1.0 == 100%), not a value already multiplied by 100.
r2_train_scaled, r2_test_scaled = (
    r2_score(y_train, y_train_pred_scaled),
    r2_score(y_test, y_test_pred_scaled),
)
mse_train_scaled, mse_test_scaled = (
    mean_squared_error(y_train, y_train_pred_scaled),
    mean_squared_error(y_test, y_test_pred_scaled),
)
mape_train_scaled, mape_test_scaled = (
    mean_absolute_percentage_error(y_train, y_train_pred_scaled),
    mean_absolute_percentage_error(y_test, y_test_pred_scaled),
)

# One-row summary table: train metrics, test metrics, selected parameters.
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_],
}

metrics_df_scaled = pd.DataFrame(data=metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

making predictions
Calculating metrics


In [39]:
# Display the SVR metrics table (metrics_df_scaled) as this cell's output.
metrics_df_scaled

Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.091946,0.519291,2.253059,0.082098,0.51296,2.070631,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


media beautiful, matriz x todo menos vector objetivo

In [43]:
%%time
X = df_means.drop(['beautiful'], axis=1)
y = df_means[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 2min, sys: 57.3 ms, total: 2min 1s
Wall time: 2min


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.994286,0.003268,0.457351,0.993872,0.003424,0.409809,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


### 2.2 SVR con df media, y= boring

In [44]:
%%time
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 5min 23s, sys: 187 ms, total: 5min 24s
Wall time: 5min 22s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.241423,0.43101,2.195015,0.219002,0.435089,2.937845,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


media boring, matriz x todo menos vector objetivo

In [45]:
%%time
X = df_means.drop(['boring'], axis=1)
y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 4min 33s, sys: 164 ms, total: 4min 33s
Wall time: 4min 32s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.990331,0.005494,0.626182,0.990196,0.005462,0.665219,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


### 2.3 SVR con df mediana, y= beautiful

In [46]:
%%time
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 4min 59s, sys: 178 ms, total: 4min 59s
Wall time: 4min 58s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.084712,0.586365,2.26202,0.073011,0.591206,2.239065,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


mediana beautiful, matriz x todo menos vector objetivo

In [47]:
%%time
X = df_medians.drop(['beautiful'], axis=1)
y = df_medians[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 8min 52s, sys: 391 ms, total: 8min 52s
Wall time: 8min 50s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.970961,0.018603,1.077149,0.969233,0.019622,0.886234,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


### 2.4 SVR con df mediana, y= boring

In [50]:
%%time
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified grid search parameters
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# Grid search with cross-validation
#grid_search_scaled = GridSearchCV(svr, param_grid_simplified, cv=3, scoring='r2')
#grid_search_scaled.fit(X_train_scaled, y_train.values.ravel())

random_search = RandomizedSearchCV(svr, param_distributions=param_grid_simplified, n_iter=10, cv=3, scoring='r2', random_state=25)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model
#best_model_scaled = grid_search_scaled.best_estimator_
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a DataFrame
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

#import ace_tools as tools; tools.display_dataframe_to_user(name="SVR Model Metrics with Standard Scaling", dataframe=metrics_df_scaled)

Grid search




making predictions
Calculating metrics
CPU times: user 5min 4s, sys: 216 ms, total: 5min 4s
Wall time: 5min 3s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.216287,0.512842,2.379681,0.187049,0.522004,1.902147,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


mediana boring, matriz x todo menos vector objetivo

In [51]:
%%time
# SVR on df_medians: predict 'boring' from every other column of the frame.
X = df_medians.drop(['boring'], axis=1)
y = df_medians[['boring']]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Scaling statistics are learned from the training split only and reused on the test split.
# NOTE(review): the scaler is fitted before cross-validation, so the CV folds below share
# scaling statistics; wrapping scaler + SVR in a Pipeline would remove that small leak.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Simplified search space: a single candidate.
# 'gamma' has no effect with the linear kernel; it is kept so best_params_ stays unchanged.
param_grid_simplified = {
    'kernel': ['linear'],
    'C': [1],
    'gamma': ['auto'],
    'epsilon': [0.1]
}

# Initialize the SVR model
svr = SVR()

print("Grid search")
# RandomizedSearchCV warns (and silently runs fewer iterations) when n_iter is larger than
# the number of distinct candidates in an all-list search space, so cap n_iter at that size.
n_candidates = int(np.prod([len(values) for values in param_grid_simplified.values()]))
random_search = RandomizedSearchCV(
    svr,
    param_distributions=param_grid_simplified,
    n_iter=min(10, n_candidates),
    cv=3,
    scoring='r2',
    random_state=25,
)
random_search.fit(X_train_scaled, y_train.values.ravel())

# Best model (refitted on the whole training split by the search)
best_model_scaled = random_search.best_estimator_

print("making predictions")
# Making predictions
y_train_pred_scaled = best_model_scaled.predict(X_train_scaled)
y_test_pred_scaled = best_model_scaled.predict(X_test_scaled)

print("Calculating metrics")
# Calculating metrics
# NOTE(review): MAPE is reported as a fraction; values above 1 in the recorded output
# suggest targets close to zero, where MAPE is unstable — confirm it is the right metric.
r2_train_scaled = r2_score(y_train, y_train_pred_scaled)
mse_train_scaled = mean_squared_error(y_train, y_train_pred_scaled)
mape_train_scaled = mean_absolute_percentage_error(y_train, y_train_pred_scaled)

r2_test_scaled = r2_score(y_test, y_test_pred_scaled)
mse_test_scaled = mean_squared_error(y_test, y_test_pred_scaled)
mape_test_scaled = mean_absolute_percentage_error(y_test, y_test_pred_scaled)

# Storing metrics in a one-row DataFrame (each value wrapped in a list -> one row)
metrics_scaled = {
    'R2 (Train)': [r2_train_scaled],
    'MSE (Train)': [mse_train_scaled],
    'MAPE (Train)': [mape_train_scaled],
    'R2 (Test)': [r2_test_scaled],
    'MSE (Test)': [mse_test_scaled],
    'MAPE (Test)': [mape_test_scaled],
    'Best Parameters': [random_search.best_params_]
}

metrics_df_scaled = pd.DataFrame(metrics_scaled)
metrics_df_scaled

Grid search




making predictions
Calculating metrics
CPU times: user 9min 38s, sys: 448 ms, total: 9min 39s
Wall time: 9min 36s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test),Best Parameters
0,0.92983,0.045917,1.51809,0.930068,0.044904,1.399212,"{'kernel': 'linear', 'gamma': 'auto', 'epsilon..."


## 3. MLP

### 3.1 MLP con df media, y= beautiful

In [68]:
%%time
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 5.18 s, sys: 9.3 s, total: 14.5 s
Wall time: 2.12 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.140466,0.491544,2.167723,0.076599,0.516033,2.499319


Media, y = beautiful: la matriz X contiene todas las columnas excepto el vector objetivo (es decir, incluye las demás puntuaciones perceptuales).

In [69]:
%%time
X = df_means.drop(['beautiful'], axis=1)
y = df_means[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 17 s, sys: 27.2 s, total: 44.2 s
Wall time: 5.67 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.994604,0.003086,0.436805,0.991849,0.004555,0.473586


### 3.2 MLP con df media, y= boring

In [70]:
%%time
X = df_means.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 5.68 s, sys: 9.4 s, total: 15.1 s
Wall time: 1.94 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.301948,0.396621,2.186733,0.210844,0.439634,3.620486


Media, y = boring: la matriz X contiene todas las columnas excepto el vector objetivo (es decir, incluye las demás puntuaciones perceptuales).

In [71]:
%%time
X = df_means.drop(['boring'], axis=1)
y = df_means[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 18.7 s, sys: 38.4 s, total: 57.1 s
Wall time: 8.94 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.990856,0.005195,0.584625,0.987188,0.007137,0.64001


### 3.3 MLP con df mediana, y= beautiful

In [72]:
%%time
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 4.19 s, sys: 7.28 s, total: 11.5 s
Wall time: 1.5 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.134875,0.554229,2.163657,0.062002,0.598227,2.015932


Mediana, y = beautiful: la matriz X contiene todas las columnas excepto el vector objetivo (es decir, incluye las demás puntuaciones perceptuales).

In [73]:
%%time
X = df_medians.drop(['beautiful'], axis=1)
y = df_medians[['beautiful']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 11 s, sys: 18.7 s, total: 29.7 s
Wall time: 3.89 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.972859,0.017387,0.991538,0.963007,0.023593,1.001058


### 3.4 MLP con df mediana, y= boring

In [74]:
%%time
X = df_medians.drop(['beautiful', 'boring', 'depressing', 'lively', 'safe', 'wealthy'], axis=1)
y = df_medians[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 8.55 s, sys: 13.8 s, total: 22.4 s
Wall time: 2.86 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.295227,0.461186,2.660396,0.177673,0.528023,2.471492


Mediana, y = boring: la matriz X contiene todas las columnas excepto el vector objetivo (es decir, incluye las demás puntuaciones perceptuales).

In [75]:
%%time
X = df_medians.drop(['boring'], axis=1)
y = df_medians[['boring']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Step 1: Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the MLP Model
mlp = MLPRegressor(hidden_layer_sizes=(50,), activation='relu',
                   solver='adam', alpha=0.001,
                   early_stopping=True, validation_fraction=0.1,
                   n_iter_no_change=10,
                   max_iter=1000, random_state=25)

# Step 3: Train the Model
mlp.fit(X_train_scaled, y_train.values.ravel())

# Step 4: Evaluate the Model
y_train_pred = mlp.predict(X_train_scaled)
y_test_pred = mlp.predict(X_test_scaled)

# Calculate metrics for training data
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

# Calculate metrics for testing data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Print or display the metrics
metrics = {
    'R2 (Train)': r2_train,
    'MSE (Train)': mse_train,
    'MAPE (Train)': mape_train,
    'R2 (Test)': r2_test,
    'MSE (Test)': mse_test,
    'MAPE (Test)': mape_test
}

metrics_df = pd.DataFrame([metrics])
metrics_df

CPU times: user 15.3 s, sys: 27.7 s, total: 43 s
Wall time: 6.39 s


Unnamed: 0,R2 (Train),MSE (Train),MAPE (Train),R2 (Test),MSE (Test),MAPE (Test)
0,0.936824,0.041341,1.54345,0.922052,0.050051,1.613023
