<a href="https://colab.research.google.com/github/CaroliCosas/Bootcamp_Data_Science/blob/main/Mod2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

In [23]:
# Cargar datos
def cargar_datos(filepath):
    """Load the listings CSV and split it into per-column NumPy arrays.

    The file is read as UTF-8, its first row is treated as a header and
    skipped, and the first five columns of every remaining row are taken to
    be: price, bedrooms, bathrooms, m2, address.

    The file is parsed with the ``csv`` module rather than
    ``np.genfromtxt(..., dtype=None)``: for columns of mixed type genfromtxt
    returns a 1-D structured array, which cannot be indexed as ``data[:, 0]``,
    and it does not understand quoted fields such as ``"Retiro, Madrid"``.

    Args:
        filepath: Path to the CSV file.

    Returns:
        Tuple ``(price, bedrooms, bathrooms, m2, addresses)``. The first four
        are float arrays (empty cells become NaN). ``addresses`` is a string
        array with every ``", Madrid"`` occurrence removed.

    Raises:
        ValueError: If a data row has fewer than five columns, or a numeric
            cell cannot be converted to float.
    """
    import csv  # local import: only this loader needs it

    rows = []
    with open(filepath, newline="", encoding="utf-8") as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                continue  # csv yields [] for blank lines
            if len(row) < 5:
                raise ValueError(
                    f"{filepath}: line {reader.line_num} has {len(row)} columns, expected at least 5"
                )
            rows.append(row)

    def _numeric_column(index):
        # Empty cells are mapped to NaN so one missing value does not abort the load.
        cells = (row[index].strip() for row in rows)
        return np.array([float(cell) if cell else np.nan for cell in cells], dtype=float)

    price = _numeric_column(0)
    bedrooms = _numeric_column(1)
    bathrooms = _numeric_column(2)
    m2 = _numeric_column(3)
    # If the address was unquoted, the csv split already dropped " Madrid"
    # into column 5 and the replace below is simply a no-op.
    addresses = np.array([row[4].replace(", Madrid", "") for row in rows], dtype=str)
    return price, bedrooms, bathrooms, m2, addresses

In [24]:
# Estadística básica
def estadistica_basica(price):
    """Print the mean, median, maximum and minimum of ``price``.

    Nothing is returned; the four statistics are only written to stdout.
    """
    # Extremes first, then the central-tendency measures.
    min_price, max_price = np.min(price), np.max(price)
    mean_price, median_price = np.mean(price), np.median(price)
    print(f"Media: {mean_price}, Mediana: {median_price}, Máximo: {max_price}, Mínimo: {min_price}")

In [25]:
# Histograma y curva de densidad
def histograma_curva_densidad(price):
    """Display a 30-bin histogram of ``price`` with a KDE curve overlaid."""
    # Explicit Axes handle instead of the implicit pyplot "current axes".
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(price, kde=True, bins=30, color="blue", alpha=0.6, ax=ax)
    ax.set_title("Histograma y Curva de Densidad de Precios")
    ax.set_xlabel("Precio (€)")
    ax.set_ylabel("Frecuencia")
    ax.grid()
    plt.show()

In [26]:
# Cuartiles e IQR
def cuartiles_iqr(price):
    """Print the three quartiles and the interquartile range of ``price``.

    Returns:
        Tuple ``(q1, q3, iqr)``. The median (``q2``) is printed but is not
        part of the return value.
    """
    # One vectorised call yields all three cut points at once.
    q1, q2, q3 = np.percentile(price, [25, 50, 75])
    iqr = q3 - q1
    print(f"Q1: {q1}, Q2: {q2}, Q3: {q3}, IQR: {iqr}")
    return q1, q3, iqr

In [27]:
# Filtrar por porcentaje
def filtrar_por_porcentaje(price):
    """Split off the most and least expensive fifths of ``price``.

    The prices are sorted ascending. The top slice starts at index
    ``int(len(price) * 0.8)`` and the bottom slice ends at
    ``int(len(price) * 0.2)``. The size of each slice is printed.

    Returns:
        Tuple ``(top_20_percent, bottom_20_percent)`` of sorted arrays.
    """
    price_sorted = np.sort(price)
    total = len(price)
    top_start = int(total * 0.8)
    bottom_stop = int(total * 0.2)

    bottom_20_percent = price_sorted[:bottom_stop]
    top_20_percent = price_sorted[top_start:]
    print(f"Top 20% más caro: {len(top_20_percent)}, Bottom 20% más barato: {len(bottom_20_percent)}")
    return top_20_percent, bottom_20_percent

In [28]:
# Moda
def moda(addresses):
    """Print the most frequent entry of ``addresses`` with its count.

    The output is the ``Counter.most_common(1)`` list, i.e. a list holding at
    most one ``(value, count)`` pair. Nothing is returned.
    """
    frequencies = Counter()
    frequencies.update(addresses)
    address_mode = frequencies.most_common(1)
    print(f"Moda de las direcciones: {address_mode}")

In [29]:
# Varianza y desviación estándar
def varianza_desviacion(price):
    """Print and return the variance and standard deviation of ``price``.

    Both come from ``np.var`` / ``np.std`` called with their default
    arguments.

    Returns:
        Tuple ``(variance, std_dev)``.
    """
    std_dev = np.std(price)
    variance = np.var(price)
    print(f"Varianza: {variance}, Desviación estándar: {std_dev}")
    return variance, std_dev

In [30]:
# Detectar outliers con Tukey
def filtro_outliers_tukey(price, q1, q3, iqr):
    """Find the values of ``price`` outside the Tukey fences.

    The fences sit 1.5 * ``iqr`` below ``q1`` and above ``q3``. Values
    strictly beyond either fence are treated as outliers, and their count is
    printed.

    Returns:
        Tuple ``(upper_limit, lower_limit, outliers)``.
    """
    whisker = 1.5 * iqr
    lower_limit = q1 - whisker
    upper_limit = q3 + whisker

    is_outlier = np.logical_or(price > upper_limit, price < lower_limit)
    outliers = price[is_outlier]
    print(f"Outliers detectados: {len(outliers)}")
    return upper_limit, lower_limit, outliers

In [31]:
# Graficar outliers
def graficar_outliers(price, upper_limit, lower_limit):
    """Display the price histogram with both outlier limits marked.

    Each limit is drawn as a dashed vertical line (red for the upper one,
    blue for the lower one) and named in the legend.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(price, bins=30, color="blue", alpha=0.6, kde=True, ax=ax)

    fence_style = {"linestyle": "dashed", "linewidth": 1}
    ax.axvline(upper_limit, color='red', label="Límite Superior", **fence_style)
    ax.axvline(lower_limit, color='blue', label="Límite Inferior", **fence_style)

    ax.legend()
    ax.set_title("Outliers con límites marcados")
    ax.grid()
    plt.show()

In [32]:
# Matriz de correlación
def matriz_correlacion(price, bedrooms, bathrooms, m2):
    """Display an annotated heatmap of the correlation matrix of the inputs.

    The matrix comes from ``np.corrcoef`` over the four series, in the order
    price, bedrooms, bathrooms, m2, which is also the order of the tick labels.
    """
    labels = ["price", "bedrooms", "bathrooms", "m2"]
    series = [price, bedrooms, bathrooms, m2]
    data_matrix = np.corrcoef(series)
    sns.heatmap(
        data_matrix,
        annot=True,
        xticklabels=labels,
        yticklabels=list(labels),
    )
    plt.title("Matriz de Correlación")
    plt.show()

In [33]:
# Estandarizar
def estandarizar(price):
    """Return the z-scores of ``price``: ``(price - mean) / std``.

    Improvements over a bare division:
      * any array-like input (e.g. a plain list) is accepted;
      * an empty input returns an empty array without touching ``np.mean``;
      * a constant input (standard deviation 0) returns zeros instead of the
        NaN values and RuntimeWarning produced by 0/0.

    Args:
        price: Array-like of numeric values.

    Returns:
        Float ndarray with the same shape as ``price``.
    """
    values = np.asarray(price, dtype=float)
    if values.size == 0:
        return values.copy()

    mean_price = np.mean(values)
    std_dev_price = np.std(values)
    if std_dev_price == 0:
        # No spread: every value equals the mean, so its z-score is 0.
        return np.zeros_like(values)

    return (values - mean_price) / std_dev_price

In [34]:
# Asimetría y curtosis
def asimetria_curtosis(price):
    """Print the skewness and kurtosis of ``price``.

    Both values come from ``scipy.stats`` called with its default settings.
    Nothing is returned.
    """
    kurtosis = stats.kurtosis(price)
    skewness = stats.skew(price)
    print(f"Asimetría: {skewness}, Curtosis: {kurtosis}")

In [35]:
# Transformar distribuciones
def transformar_distribuciones(price):
    """Display a histogram (with KDE) of the natural log of the prices.

    Only strictly positive entries of ``price`` are log-transformed; the rest
    are dropped before plotting.
    """
    positive_mask = price > 0
    price_log = np.log(price[positive_mask])

    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(price_log, kde=True, bins=30, color="green", alpha=0.6, ax=ax)
    ax.set_title("Transformación Logarítmica de Precios")
    ax.set_xlabel("Log(Precio)")
    ax.set_ylabel("Frecuencia")
    ax.grid()
    plt.show()

In [36]:
# Contraste de hipótesis
def contraste_hipotesis_barrio(price, addresses, barrio_x, barrio_y):
    """Print an unequal-variance t-test comparing prices of two neighbourhoods.

    Rows are selected where ``addresses`` equals ``barrio_x`` or ``barrio_y``
    respectively; the two price groups are passed to ``stats.ttest_ind`` with
    ``equal_var=False``. Nothing is returned.
    """
    in_barrio_x = addresses == barrio_x
    in_barrio_y = addresses == barrio_y
    result = stats.ttest_ind(price[in_barrio_x], price[in_barrio_y], equal_var=False)
    t_stat, p_value = result
    print(f"T-stat: {t_stat}, P-value: {p_value}")

def contraste_hipotesis_baños(price, bathrooms):
    """Print an unequal-variance t-test of prices by bathroom count.

    The first group holds prices where ``bathrooms >= 3``; the second holds
    prices where ``bathrooms`` is exactly 1 or exactly 2. Both groups are
    passed to ``stats.ttest_ind`` with ``equal_var=False``. Nothing is
    returned.
    """
    many_mask = bathrooms >= 3
    few_mask = np.logical_or(bathrooms == 1, bathrooms == 2)
    result = stats.ttest_ind(price[many_mask], price[few_mask], equal_var=False)
    t_stat, p_value = result
    print(f"T-stat: {t_stat}, P-value: {p_value}")

In [37]:
# Load the file and run the analysis.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so
# "madrid_idealista.csv" must be present in the working directory (e.g.
# uploaded to the Colab session) before this cell is run.
filepath = "madrid_idealista.csv"
price, bedrooms, bathrooms, m2, addresses = cargar_datos(filepath)
# Descriptive statistics and distribution of the raw prices.
estadistica_basica(price)
histograma_curva_densidad(price)
q1, q3, iqr = cuartiles_iqr(price)
# The returned top/bottom slices are discarded here; only the printed sizes are used.
filtrar_por_porcentaje(price)
moda(addresses)
variance, std_dev = varianza_desviacion(price)
# Outlier detection reuses the quartiles computed above.
upper_limit, lower_limit, outliers = filtro_outliers_tukey(price, q1, q3, iqr)
graficar_outliers(price, upper_limit, lower_limit)
matriz_correlacion(price, bedrooms, bathrooms, m2)
price_std = estandarizar(price)
asimetria_curtosis(price)
transformar_distribuciones(price)
# NOTE(review): contraste_hipotesis_barrio and contraste_hipotesis_baños are
# defined above but are not called anywhere in this cell.

FileNotFoundError: madrid_idealista.csv not found.