In [1]:
import pandas as pd
import scipy.stats as st
import numpy as np

In [3]:
# Remote CSV holding the Pokémon stats table.
pokemon_csv_url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv"
df = pd.read_csv(pokemon_csv_url)
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,Charmander,Fire,,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...
795,Diancie,Rock,Fairy,50,100,150,100,150,50,6,True
796,Mega Diancie,Rock,Fairy,50,160,110,160,110,110,6,True
797,Hoopa Confined,Psychic,Ghost,80,110,60,150,130,70,6,True
798,Hoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True


Pokémon Data
Pregunta 1: ¿Los Pokémon tipo Dragon tienen, en promedio, más HP que los tipo Grass?
Hipótesis:
H0 (hipótesis nula): μ_HP_Dragon ≤ μ_HP_Grass
H1 (hipótesis alternativa): μ_HP_Dragon > μ_HP_Grass
Prueba adecuada: Two Sample T-Test (one-tailed) para comparar medias de dos grupos independientes.

In [6]:
from scipy.stats import ttest_ind

# HP samples for each group; only the primary type column ('Type 1') is consulted.
dragon_hp = df.loc[df['Type 1'] == 'Dragon', 'HP']
grass_hp = df.loc[df['Type 1'] == 'Grass', 'HP']

# Default (two-sided) independent-samples t-test: no 'alternative' argument is passed.
t_stat, p_value_two_tailed = ttest_ind(dragon_hp, grass_hp)

# Derive the one-sided p-value for H1 "Dragon mean HP > Grass mean HP":
# half of the two-sided value when t is positive, its complement otherwise.
p_value_one_tailed = (
    p_value_two_tailed / 2 if t_stat > 0 else 1 - (p_value_two_tailed / 2)
)

print("t-statistic:", t_stat)
print("p-value (one-tailed):", p_value_one_tailed)


# If p-value < 0.05: reject H0 → Dragon-type Pokémon have more HP.
# If p-value ≥ 0.05: there is not enough evidence to say they have more HP.

t-statistic: 3.590444254130357
p-value (one-tailed): 0.0002567969150153481


# ¿Los Pokémon legendarios tienen estadísticas diferentes a los no legendarios?
# T-Test para dos muestras independientes (Two-Sample T-Test, two-tailed)
# Hipótesis por cada variable:
H0: No hay diferencia en la media de la estadística entre legendarios y no legendarios.
H1: Sí hay una diferencia en la media de la estadística entre legendarios y no legendarios.

In [7]:
# Partition the frame on the 'Legendary' flag.
legendary = df[df['Legendary'].eq(True)]
non_legendary = df[df['Legendary'].eq(False)]

# Columns to compare; each one gets its own default (two-sided) two-sample t-test.
stats = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for stat in stats:
    outcome = st.ttest_ind(legendary[stat], non_legendary[stat])
    t, p = outcome.statistic, outcome.pvalue
    print(f"{stat}: p-value = {p}")


HP: p-value = 3.330647684846191e-15
Attack: p-value = 7.827253003205333e-24
Defense: p-value = 1.5842226094427255e-12
Sp. Atk: p-value = 6.314915770427266e-41
Sp. Def: p-value = 1.8439809580409333e-26
Speed: p-value = 2.3540754436897763e-21


# Dataset de California Housing
# ¿Las casas cercanas a una escuela o un hospital son más caras?
Hipótesis:
H0: El valor medio de casas cercanas es igual o menor al de casas lejanas.
H1: El valor medio de casas cercanas es mayor.

In [9]:
# Remote CSV holding the California housing table.
# Note: this rebinds `df`, so the Pokémon frame loaded earlier is no longer
# reachable under that name once this cell has run.
housing_csv_url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv"
df = pd.read_csv(housing_csv_url)
# Preview only the first rows.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [11]:
from scipy.stats import ttest_ind
from numpy.linalg import norm
import numpy as np

# Reference points, ordered as (longitude, latitude) to match how rows are
# passed to distance() further down.
school, hospital = (-118, 34), (-122, 37)


def distance(coord1, coord2):
    """Return the Euclidean (L2) distance between two coordinate sequences."""
    offset = np.asarray(coord1) - np.asarray(coord2)
    return norm(offset)

# Add proximity flags: a row is "near" a reference point when the Euclidean
# distance between its (longitude, latitude) and that point is below 0.5.
# The distance is computed column-wise with np.hypot instead of calling a
# Python function once per row through DataFrame.apply(axis=1).
# NOTE(review): the threshold is in raw coordinate units, so longitude and
# latitude differences are weighted equally — confirm this is intended.
near_radius = 0.5
df['near_school'] = np.hypot(df['longitude'] - school[0], df['latitude'] - school[1]) < near_radius
df['near_hospital'] = np.hypot(df['longitude'] - hospital[0], df['latitude'] - hospital[1]) < near_radius
df['near_either'] = df['near_school'] | df['near_hospital']

# Comparison groups: house values for rows near either point vs. all other rows.
close = df.loc[df['near_either'], 'median_house_value']
far = df.loc[~df['near_either'], 'median_house_value']

# Default (two-sided) independent-samples t-test: no 'alternative' argument is passed.
t_stat, p_value_two_tailed = ttest_ind(close, far)

# One-sided p-value for H1 "nearby houses are more expensive":
# half of the two-sided value when t is positive, its complement otherwise.
if t_stat > 0:
    p_value_one_tailed = p_value_two_tailed / 2
else:
    p_value_one_tailed = 1 - (p_value_two_tailed / 2)

print("t-statistic:", t_stat)
print("p-value (one-tailed):", p_value_one_tailed)



t-statistic: 38.04632342033554
p-value (one-tailed): 2.408917945663922e-304
