In [None]:
# Challenge 1: Pokemon Data
import pandas as pd
import scipy.stats as st
import numpy as np

# Load the Pokémon dataset straight from the course repository
df = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv")

# Show first few rows of the dataset.
# A bare `df.head()` only renders when it is the last expression of a cell;
# here more code follows, so the preview is printed explicitly.
print(df.head())

# 1. We posit that Dragon-type Pokémon have more HP than Grass-type Pokémon.
#    H0: mean HP of Dragon types <= mean HP of Grass types
#    H1: mean HP of Dragon types >  mean HP of Grass types
alpha = 0.05  # significance level used for the decision below

# Select the HP values for each group (matched on primary type, 'Type 1', only)
dragon_hp = df.loc[df['Type 1'] == 'Dragon', 'HP']
grass_hp = df.loc[df['Type 1'] == 'Grass', 'HP']

# One-sided independent-samples t-test.
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance (they also need not have the same size).
t_stat, p_val = st.ttest_ind(dragon_hp, grass_hp, equal_var=False, alternative='greater')

# Display the results
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_val}")

# Interpretation of results at 5% significance.
# The test is one-sided, so failing to reject only means there is no evidence
# that Dragon HP is *higher*; it says nothing about a two-sided difference.
if p_val < alpha:
    print("Reject the null hypothesis. Dragon-type Pokémon have more HP than Grass-type Pokémon.")
else:
    print("Fail to reject the null hypothesis. No significant evidence that Dragon-type Pokémon have more HP than Grass-type Pokémon.")

# 2. We posit that Legendary Pokémon have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) compared to Non-Legendary Pokémon.
#    For each stat: H0: the two group means are equal; H1: they differ (two-sided).
alpha = 0.05  # per-test significance level

# Split the data into Legendary and Non-Legendary Pokémon
legendary = df[df['Legendary'].eq(True)]
non_legendary = df[df['Legendary'].eq(False)]

# Run one two-sided t-test per stat.
# equal_var=False selects Welch's t-test, which does not assume the two groups
# have equal variances (or equal sizes).
# NOTE(review): six tests are each judged at `alpha`, so the chance of at least
# one false positive across the family is higher than 5%; consider a
# multiple-comparison correction if a family-wise guarantee is needed.
stats_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for stat in stats_columns:
    t_stat, p_val = st.ttest_ind(legendary[stat], non_legendary[stat], equal_var=False)

    print(f"Stat: {stat}")
    print(f"T-Statistic: {t_stat}")
    print(f"P-Value: {p_val}")

    if p_val < alpha:
        print(f"Reject the null hypothesis. There is a significant difference in {stat} between Legendary and Non-Legendary Pokémon.\n")
    else:
        print(f"Fail to reject the null hypothesis. No significant difference in {stat} between Legendary and Non-Legendary Pokémon.\n")


In [None]:
# Challenge 2: California Housing Data
import pandas as pd
import numpy as np
import scipy.stats as st

# Load the California housing dataset straight from the course repository
df = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv")

# Show the first few rows.
# A bare `df.head()` only renders when it is the last expression of a cell;
# here more code follows, so the preview is printed explicitly.
print(df.head())

# School and Hospital coordinates, stored as (longitude, latitude):
# the distance code reads index 1 as the latitude and index 0 as the longitude.
school_coords = (-118, 34)
hospital_coords = (-122, 37)

# Straight-line (Euclidean) distance between two latitude/longitude points
def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the Euclidean distance between (lat1, lon1) and (lat2, lon2).

    The coordinates are treated as plain planar values: the result is
    sqrt((lat1 - lat2)^2 + (lon1 - lon2)^2), expressed in the same units as
    the inputs. Because the arithmetic goes through numpy, scalars and
    array-like inputs that support subtraction and `**` both work.
    """
    delta_lat = lat1 - lat2
    delta_lon = lon1 - lon2
    squared_sum = delta_lat**2 + delta_lon**2
    return np.sqrt(squared_sum)

# Calculate the distance to school and hospital.
# The coordinate tuples are (longitude, latitude), so unpack them by name
# instead of indexing with [1] / [0].
school_lon, school_lat = school_coords
hospital_lon, hospital_lat = hospital_coords

# euclidean_distance only uses subtraction, ** and np.sqrt, so it can be applied
# to whole columns at once instead of row by row with df.apply(axis=1).
df['distance_to_school'] = euclidean_distance(df['latitude'], df['longitude'], school_lat, school_lon)
df['distance_to_hospital'] = euclidean_distance(df['latitude'], df['longitude'], hospital_lat, hospital_lon)

# A house counts as "close" when it is within this distance of either landmark
# (same units as the latitude/longitude columns).
close_threshold = 0.50
df['close_to_school_or_hospital'] = (
    (df['distance_to_school'] < close_threshold)
    | (df['distance_to_hospital'] < close_threshold)
)

# Separate the dataset into two groups: close to school/hospital and far from school/hospital.
# The flag column is boolean (built from comparisons above), so it can index directly.
is_close = df['close_to_school_or_hospital']
close = df[is_close]
far = df[~is_close]

# Two-sided t-test comparing the median house values of the two groups.
# equal_var=False selects Welch's t-test, which does not assume the two groups
# have equal variances (or equal sizes).
t_stat, p_val = st.ttest_ind(close['median_house_value'], far['median_house_value'], equal_var=False)

# Display the results
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_val}")

# Interpretation of results at 5% significance
alpha = 0.05
if p_val < alpha:
    print("Reject the null hypothesis. Houses close to a school or hospital have a different median house value than those farther away.")
else:
    print("Fail to reject the null hypothesis. No significant difference in median house values between houses close and far from a school or hospital.")
