In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [4]:
base = pd.read_csv('Belize_datoslimpios.csv')
base.head()

Unnamed: 0.1,Unnamed: 0,source,name,neighborhood_overview,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,...,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0,city scrape,Mo's Cozy Cabana,The cabana is just a moments walk to 'The Spli...,Monique,03/07/10,Belize,within an hour,100%,90%,...,3.8,0.3,4.82,4.83,4.88,4.0,4.0,0.0,0.0,0.6
1,1,city scrape,"Seaside Villas 4 2nd Flr - Pool, Beach, Jacuzzi","Right on the beach, this building is in a prim...",Bobbi,19/05/11,"Belize City, Belize",within an hour,100%,100%,...,11.0,0.0,4.81,4.91,4.71,19.0,19.0,0.0,0.0,0.73
2,2,city scrape,Valarosa - Sweet cottage,Valarosa is on Avenida Chechem in a popular re...,Bobbi,19/05/11,"Belize City, Belize",within an hour,100%,100%,...,15.0,0.0,4.59,4.72,4.58,19.0,19.0,0.0,0.0,0.79
3,3,city scrape,"Toucan-kit, ac, wifi, bike, tropical gardens, ...","LOVE this area of Placencia; quiet, day & nigh...",Jacqueline Ann,23/06/11,"Placencia, Belize",within an hour,100%,100%,...,2.0,0.0,4.86,4.89,4.86,5.0,4.0,0.0,0.0,0.46
4,4,city scrape,rent a private room in a house,"Quiet neighbourhood, close to shopping centre,...",Jennifer,23/06/11,"Belmopan, Belize",a few days or more,0%,17%,...,1.0,0.0,4.58,4.92,4.83,1.0,0.0,1.0,0.0,0.11


In [5]:
# Convertimos variables a formato dicotómico si es necesario
# Como ejemplo, creamos variables binarias de algunas columnas

base['host_is_superhost_bin'] = (base['host_is_superhost'] == 't').astype(int)
base['instant_bookable_bin'] = (base['instant_bookable'] == 't').astype(int)
base['room_type_bin'] = (base['room_type'] == 'Entire home/apt').astype(int)
base['superhost_response_bin'] = (base['host_response_time'] == 'within an hour').astype(int)

# Limpiar la columna 'price' eliminando el signo de dólar y convirtiendo a número
base['price'] = base['price'].replace('[\$,]', '', regex=True).astype(float)

  base['price'] = base['price'].replace('[\$,]', '', regex=True).astype(float)


In [6]:
# Definimos 10 combinaciones de variables para analizar
# Variables dependientes y sus variables independientes

casos = [
    ('host_is_superhost_bin', ['host_total_listings_count', 'accommodates']),
    ('instant_bookable_bin', ['price', 'accommodates']),
    ('room_type_bin', ['price', 'accommodates']),
    ('host_is_superhost_bin', ['reviews_per_month', 'availability_365']),
    ('instant_bookable_bin', ['number_of_reviews', 'availability_365']),
    ('superhost_response_bin', ['host_total_listings_count', 'accommodates']),
    ('room_type_bin', ['number_of_reviews', 'availability_365']),
    ('host_is_superhost_bin', ['review_scores_rating', 'price']),
    ('instant_bookable_bin', ['review_scores_rating', 'price']),
    ('room_type_bin', ['reviews_per_month', 'price'])
]

In [7]:
# Lista para guardar resultados
resultados = []

# Análisis de los 10 casos
for i, (dependiente, independientes) in enumerate(casos):
    X = base[independientes]
    y = base[dependiente]
    
    # Separar datos
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)
    
    # Escalar datos
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Crear modelo
    modelo = LogisticRegression()
    modelo.fit(X_train, y_train)
    
    # Predicciones
    y_pred = modelo.predict(X_test)
    
    # Calcular métricas
    precision = precision_score(y_test, y_pred)
    exactitud = accuracy_score(y_test, y_pred)
    sensibilidad = recall_score(y_test, y_pred)
    
    # Guardar resultados
    resultados.append({
        'Caso': i + 1,
        'Dependiente': dependiente,
        'Independientes': ', '.join(independientes),
        'Precisión': precision,
        'Exactitud': exactitud,
        'Sensibilidad': sensibilidad
    })

In [8]:
# Crear DataFrame de resultados
tabla_resultados = pd.DataFrame(resultados)

In [9]:
# Mostrar resultados
tabla_resultados

Unnamed: 0,Caso,Dependiente,Independientes,Precisión,Exactitud,Sensibilidad
0,1,host_is_superhost_bin,"host_total_listings_count, accommodates",0.66,0.626984,0.3
1,2,instant_bookable_bin,"price, accommodates",0.671627,0.671627,1.0
2,3,room_type_bin,"price, accommodates",0.727728,0.728175,0.997257
3,4,host_is_superhost_bin,"reviews_per_month, availability_365",0.572438,0.598214,0.363229
4,5,instant_bookable_bin,"number_of_reviews, availability_365",0.66369,0.66369,1.0
5,6,superhost_response_bin,"host_total_listings_count, accommodates",0.872024,0.872024,1.0
6,7,room_type_bin,"number_of_reviews, availability_365",0.696429,0.696429,1.0
7,8,host_is_superhost_bin,"review_scores_rating, price",0.598592,0.560516,0.180467
8,9,instant_bookable_bin,"review_scores_rating, price",0.684524,0.684524,1.0
9,10,room_type_bin,"reviews_per_month, price",0.713294,0.713294,1.0
