# 📓 Data Science Homework
Fill in the code blocks and markdown cells as instructed below.

## Descriptive Statistics

In [None]:
# TODO: Load the dataset and compute mean, median, mode, std, IQR
import pandas as pd
import numpy as np

# Read the 2013 accidents CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/ACCIDENTS_GU_BCN_2013.csv')

# Tip: uncomment the next line to list the exact column labels found in the file.
# print(df.columns)

# Select the victim-count column by its exact label.
# NOTE(review): the label contains U+FFFD replacement characters where accented
# letters ('ú', 'í') would be expected -- presumably an encoding artifact in the
# CSV header; confirm against df.columns before changing it.
victims = df['N�mero de v�ctimes']

# Measures of centre and spread, printed in a fixed order as "<label> <value>".
summary = [
    ("Mean:", victims.mean()),
    ("Median:", victims.median()),
    ("Mode:", victims.mode().values),  # .mode() can return several values, so show the array
    ("Standard Deviation:", victims.std()),
]
for label, value in summary:
    print(label, value)

# Interquartile range = 75th percentile minus 25th percentile.
q1 = victims.quantile(0.25)
q3 = victims.quantile(0.75)
iqr = q3 - q1
print("IQR:", iqr)

# Summary: this cell reports the mean, median, mode, standard deviation and
# interquartile range of the number of victims per record.

Mean: 1.2070424024532094
Median: 1.0
Mode: [1]
Standard Deviation: 0.8005795130298843
IQR: 0.0


## Visualizations (Histogram, KDE, Boxplot, Heatmap)

In [None]:
# TODO: Plot histogram, KDE, boxplot, and correlation heatmap
import matplotlib.pyplot as plt
import seaborn as sns

# Each seaborn call below draws on the current Axes and returns it, so titles
# and axis labels are set on that returned object.

# 1) Histogram: counts per bin (15 bins), without a density overlay.
hist_ax = sns.histplot(victims, bins=15, kde=False)
hist_ax.set_title("Histogram of Victims")
hist_ax.set_xlabel("Number of Victims")
hist_ax.set_ylabel("Frequency")
plt.show()

# 2) Kernel density estimate of the non-missing values, with the area filled.
kde_ax = sns.kdeplot(victims.dropna(), fill=True)
kde_ax.set_title("KDE Plot of Victims")
kde_ax.set_xlabel("Number of Victims")
plt.show()

# 3) Horizontal boxplot of the victim counts.
box_ax = sns.boxplot(x=victims)
box_ax.set_title("Boxplot of Victims")
box_ax.set_xlabel("Number of Victims")
plt.show()

# 4) Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 6))
corr_matrix = df.corr(numeric_only=True)
heat_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heat_ax.set_title("Correlation Heatmap")
plt.show()

# Summary: the histogram, KDE plot and boxplot each show the distribution of the
# victim counts in a different way; the heatmap shows how the numeric columns
# of the dataset correlate with one another.

## Bootstrap Confidence Interval

In [None]:
# TODO: Bootstrap sample mean and CI (e.g., for 'Número de víctimes')
import numpy as np

# This cell reuses `victims` from the descriptive-statistics cell; in a fresh
# session, run the cell that loads `df` and defines `victims` first.

# Remove missing values before resampling.
victims_clean = victims.dropna()

# Draw 1000 resamples with replacement, each the same size as the cleaned data
# (frac=1), and record the mean of every resample.
n_resamples = 1000
boot_means = [
    victims_clean.sample(frac=1, replace=True).mean()
    for _ in range(n_resamples)
]

# Percentile method: the 2.5th and 97.5th percentiles of the bootstrap means
# bound the 95% confidence interval.
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])

print(f'95% Confidence Interval for mean number of victims: ({ci_lower:.2f}, {ci_upper:.2f})')

# Summary: 1000 bootstrap resamples of the data are drawn, and the 95% confidence
# interval for the mean is read off the distribution of their means.


## Hypothesis Testing

In [None]:
# TODO: Perform a t-test (e.g., is mean != 1?) and interpret the result
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp

# Re-read the data so this cell defines everything it uses.
df = pd.read_csv('../data/ACCIDENTS_GU_BCN_2013.csv')

# Victim-count column, selected by its exact label.
# NOTE(review): the label contains U+FFFD replacement characters where accented
# letters would be expected -- confirm it matches df.columns.
victims = df['N�mero de v�ctimes']

# Keep only the non-missing observations.
victims_clean = victims.dropna()

# --- Bootstrap confidence interval for the mean ---
# 1000 same-size resamples drawn with replacement; keep the mean of each.
n_resamples = 1000
boot_means = [
    victims_clean.sample(frac=1, replace=True).mean()
    for _ in range(n_resamples)
]

# The 2.5th and 97.5th percentiles of the bootstrap means give the 95% interval.
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5])

print(f'95% Confidence Interval for mean number of victims: ({ci_lower:.2f}, {ci_upper:.2f})')

# --- One-sample t-test against a hypothesised mean of 1 ---
test_result = ttest_1samp(victims_clean, popmean=1)
t_stat, p_val = test_result.statistic, test_result.pvalue

print(f"t-statistic: {t_stat:.2f}")
print(f"p-value: {p_val:.4f}")

# Decide at the 5% significance level.
alpha = 0.05
verdict = (
    "Reject the null hypothesis: the mean is significantly different from 1."
    if p_val < alpha
    else "Fail to reject the null hypothesis: the mean is not significantly different from 1."
)
print(verdict)

# Summary: a one-sample t-test checks whether the mean number of victims is
# significantly different from 1.

95% Confidence Interval for mean number of victims: (1.19, 1.22)
t-statistic: 25.15
p-value: 0.0000
Reject the null hypothesis: the mean is significantly different from 1.
