# 🧼 Newborn Screening Data Analysis - Jupyter Notebook
This notebook covers data cleaning, exploration, visualization, and regression modeling on newborn screening data.

## 🔹 Step 1: Primary Cleaning

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Read the "source" worksheet of the workbook into the raw frame.
df = pd.read_excel("newborn.xlsx", sheet_name="source")

# Raw headers of the fields used by the rest of the notebook
# (several of them contain embedded line breaks).
columns_to_keep = [
    'California Region', 'Disease Type',
    'Case\nCount', 'Number\nScreened',
    'Percent\nof All\nDisorders\nin Region',
]

# Take an independent copy so later edits never touch the raw frame.
df_main = df.loc[:, columns_to_keep].copy()

# Normalize headers to snake_case: line breaks -> spaces, trim, lowercase,
# then collapse every run of whitespace into a single underscore.
df_main.columns = (
    df_main.columns
    .str.replace(r'\n', ' ', regex=True)
    .str.strip()
    .str.lower()
    .str.replace(r'\s+', '_', regex=True)
)

# Preview the first rows of the cleaned frame.
df_main.head()


## 🔍 Step 2: Missing Values and Data Types

In [None]:

# Per-column count of missing entries, followed by each column's dtype.
(df_main.isna().sum(), df_main.dtypes)


## ⚠️ Step 3: Outlier Detection

In [None]:

def find_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.
    column : str
        Name of the column whose values are tested.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (all columns, original index labels) whose
        ``column`` value falls outside the fences; empty when nothing does.
    """
    values = data[column]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Comparisons against NaN evaluate to False, so missing values are
    # never reported as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return data[is_outlier]

# Gather the flagged rows for every numeric column, tagging each batch with
# the column that triggered it. The pieces are collected in a list and
# concatenated once, instead of repeatedly concatenating onto an initially
# empty DataFrame inside the loop.
outlier_frames = []
for col in ['case_count', 'number_screened', 'percent_of_all_disorders_in_region']:
    outliers = find_outliers_iqr(df_main, col)
    if not outliers.empty:
        # assign() returns a new frame, so df_main itself is left unchanged.
        outlier_frames.append(outliers.assign(outlier_column=col))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# column produced any outlier.
outlier_rows = (
    pd.concat(outlier_frames, ignore_index=True)
    if outlier_frames
    else pd.DataFrame()
)
outlier_rows


## 🔢 Step 4: Unique Value Checks

In [None]:

# Number of distinct regions and distinct disease types, in that order.
tuple(df_main[name].nunique() for name in ('california_region', 'disease_type'))


## 📊 Step 5: Descriptive Statistics

In [None]:

# Summary statistics, transposed so each described column becomes a row.
df_main.describe().transpose()


## 📈 Step 6: Visualizations

In [None]:

# Each chart gets its own explicitly sized figure. Previously only the first
# chart set a size, so the other two fell back to the default canvas, where
# the rotated region labels are cramped. Axis labels are spelled out so the
# figures do not show raw column names.

# Chart 1: case counts summed per region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_main, x="california_region", y="case_count", estimator=sum, ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set(title="Total Case Count by Region", xlabel="California region", ylabel="Total case count")
fig.tight_layout()
plt.show()

# Chart 2: mean of the per-row "percent of all disorders" value per region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_main,
    x="california_region",
    y="percent_of_all_disorders_in_region",
    estimator=np.mean,
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=45)
ax.set(
    title="Average Disorder % by Region",
    xlabel="California region",
    ylabel="Mean % of all disorders in region",
)
fig.tight_layout()
plt.show()

# Chart 3: screened volume against case count, coloured by region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_main, x="number_screened", y="case_count", hue="california_region", ax=ax)
ax.set(title="Number Screened vs Case Count", xlabel="Number screened", ylabel="Case count")
fig.tight_layout()
plt.show()


## 📋 Step 7: Grouping & Categorization

In [None]:

# Number of distinct disease types reported in each region.
diseases_per_region = df_main.groupby('california_region')['disease_type'].nunique()
# Number of distinct regions in which each disease type appears.
regions_per_disease = df_main.groupby('disease_type')['california_region'].nunique()

# A cell only echoes its final expression, so the first result used to be
# computed and silently discarded. Show both explicitly (`display` is
# provided by IPython inside notebook kernels).
display(diseases_per_region, regions_per_disease)


## 📉 Step 8: Simple Linear Regression

In [None]:

# Response variable: cases found.
y = df_main.loc[:, 'case_count']
# Single predictor (newborns screened) plus an intercept column.
X = sm.add_constant(df_main.loc[:, 'number_screened'])

# Ordinary least squares fit of case_count on number_screened.
model = sm.OLS(endog=y, exog=X).fit()
model.summary()


## 📊 Step 9: Multiple Linear Regression

In [None]:

# One-hot encode the region, dropping the first level so it serves as the
# baseline category. dtype=float is requested explicitly: pandas >= 2.0
# returns boolean dummy columns by default, and mixing those with the numeric
# predictors yields an object-dtype design matrix that sm.OLS rejects
# ("Pandas data cast to numpy dtype of object").
df_encoded = pd.get_dummies(
    df_main,
    columns=['california_region'],
    drop_first=True,
    dtype=float,
)

# Predictors: every remaining column except the response and the free-text
# disease label, plus an intercept.
X_multi = sm.add_constant(df_encoded.drop(columns=['case_count', 'disease_type']))
y_multi = df_encoded['case_count']

multi_model = sm.OLS(y_multi, X_multi).fit()
multi_model.summary()
