In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler

# Silence every warning for the rest of the session.
# NOTE: with no category/module filter this is a blanket "ignore", so it also
# hides deprecation and runtime warnings raised by the cells below.
import warnings

warnings.filterwarnings('ignore')

from scripts.load_data import load_ufo_data
from scripts.clean_data import clean_columns, preprocess_datetime, standardize_values

# Load the CSV and pass it through the project helpers, applied in this
# order: clean_columns -> preprocess_datetime -> standardize_values.
df = standardize_values(
    preprocess_datetime(
        clean_columns(
            load_ufo_data("../data/scrubbed.csv")
        )
    )
)

In [3]:
# Keep only the categorical columns and drop every row that has a missing
# value in any of them.
# NOTE(review): "State" takes part in the dropna but is not used by the cells
# visible below -- confirm it is meant to filter rows here.
df_cor = df[["City", "State", "Country", "Shape"]].dropna()

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_cor.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66524 entries, 0 to 80331
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   City     66524 non-null  object
 1   State    66524 non-null  object
 2   Country  66524 non-null  object
 3   Shape    66524 non-null  object
dtypes: object(4)
memory usage: 2.5+ MB
None


In [4]:
# Observed-count contingency tables: one row per city / country, one column
# per reported shape.
cont_city_shape = pd.crosstab(index=df_cor["City"], columns=df_cor["Shape"])
cont_country_shape = pd.crosstab(index=df_cor["Country"], columns=df_cor["Shape"])

In [6]:
# Chi-square test of independence, City vs Shape.
chi1, p1, dof1, _ = stats.chi2_contingency(observed=cont_city_shape)
print(f"City vs Shape → χ²={chi1:.2f}, p={p1:.4f}, dof={dof1}")

# Same test, Country vs Shape.
chi2, p2, dof2, _ = stats.chi2_contingency(observed=cont_country_shape)
print(f"Country vs Shape → χ²={chi2:.2f}, p={p2:.4f}, dof={dof2}")

City vs Shape → χ²=265824.51, p=1.0000, dof=298053
Country vs Shape → χ²=57.29, p=0.9788, dof=81


In [7]:
# Repeat both independence tests with the log-likelihood ratio statistic
# (lambda_="log-likelihood") instead of Pearson's chi-square.
chi1_ll, p1_ll, *_ = stats.chi2_contingency(
    observed=cont_city_shape, lambda_="log-likelihood"
)
print(f"City vs Shape (Log-Likelihood) → p={p1_ll:.4f}")

chi2_ll, p2_ll, *_ = stats.chi2_contingency(
    observed=cont_country_shape, lambda_="log-likelihood"
)
print(f"Country vs Shape (Log-Likelihood) → p={p2_ll:.4f}")

City vs Shape (Log-Likelihood) → p=1.0000
Country vs Shape (Log-Likelihood) → p=0.9902


In [8]:
def cramers_v(conf_matrix, chi_stat):
    """Compute Cramér's V effect size for a contingency table.

    V = sqrt(chi_stat / (n * (min(n_rows, n_cols) - 1))), where n is the sum
    of all cells in the table.

    Parameters
    ----------
    conf_matrix : array-like, two-dimensional
        Contingency table of observed counts (DataFrame, ndarray or nested
        list).
    chi_stat : float
        Chi-square statistic computed for the same table.

    Returns
    -------
    float
        Cramér's V, or ``nan`` when it is undefined: the table is empty
        (total count of zero) or has only a single row or column.
    """
    table = np.asarray(conf_matrix)
    n = table.sum()
    min_dim = min(table.shape) - 1
    # Guard the denominator: a one-row/one-column or empty table would
    # otherwise divide by zero and yield inf (or a misleading value).
    if n <= 0 or min_dim <= 0:
        return float("nan")
    return np.sqrt(chi_stat / (n * min_dim))

# Effect size for each table, computed from the chi-square statistics
# obtained earlier (chi1 for city, chi2 for country).
cv_city = cramers_v(cont_city_shape, chi1)
print(f"Cramér’s V (City vs Shape): {cv_city:.4f}")

cv_country = cramers_v(cont_country_shape, chi2)
print(f"Cramér’s V (Country vs Shape): {cv_country:.4f}")

Cramér’s V (City vs Shape): 0.3847
Cramér’s V (Country vs Shape): 0.0169


In [9]:
# Shared scaler instance; (0, 1) is MinMaxScaler's default output range,
# spelled out here for the reader.
scaler = MinMaxScaler(feature_range=(0, 1))

In [10]:
# Min-Max scale every column of each contingency table.
# The scaled values go into fresh float DataFrames instead of being assigned
# into copies of the integer count tables: writing floats into integer
# columns through ``.iloc[:, :] = ...`` depends on silent dtype upcasting,
# which pandas has deprecated (and which the blanket warning filter hides).
scaled_city = pd.DataFrame(
    scaler.fit_transform(cont_city_shape),
    index=cont_city_shape.index,
    columns=cont_city_shape.columns,
)

# The scaler is refit here, so each table is scaled by its own column ranges.
scaled_country = pd.DataFrame(
    scaler.fit_transform(cont_country_shape),
    index=cont_country_shape.index,
    columns=cont_country_shape.columns,
)

In [11]:
# Shift every cell by a tiny constant.
# NOTE(review): presumably this keeps the expected frequencies non-zero so
# chi2_contingency accepts the scaled table -- confirm. Re-running this cell
# adds the offset again.
scaled_country = scaled_country + 1e-10

In [12]:
# Re-run the independence tests on the Min-Max-scaled tables.
# NOTE(review): the scaled cells are no longer observed counts, which is what
# the chi-square test assumes -- treat these numbers with caution.
chi3, p3, dof3, _ = stats.chi2_contingency(observed=scaled_city)
print(f"Scaled City vs Shape → χ²={chi3:.2f}, p={p3:.4f}, dof={dof3}")

chi4, p4, dof4, _ = stats.chi2_contingency(observed=scaled_country)
print(f"Scaled Country vs Shape → χ²={chi4:.2f}, p={p4:.4f}, dof={dof4}")

Scaled City vs Shape → χ²=19797.50, p=1.0000, dof=298053
Scaled Country vs Shape → χ²=0.64, p=1.0000, dof=81


- A chi-square test found **no statistically significant association** between UFO shape and either city or country (p ≈ 1.0).
- The large number of unique cities creates a **sparse matrix**, likely invalidating the test despite a large χ².
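- **Cramér’s V** indicates a **negligible effect size** at the country level (V ≈ 0.02). The city-level value (V ≈ 0.38) looks moderate, but it comes from a very sparse table whose χ² (265,824.51) is below its degrees of freedom (298,053), so it should not be read as evidence of a real association.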
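- Repeating the test on Min-Max-scaled tables also gave p ≈ 1.0. Note, however, that the chi-square test assumes raw counts, so the scaled results are not a valid test in their own right; the conclusion rests on the unscaled results above.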

📌 **Conclusion:** There is **no strong evidence** to support that reported UFO shapes vary meaningfully by city or country.