In [24]:
"""
Titanic Data Analysis and JSON Export
Author: [Cindy Lund]
Description: Analyze Titanic passenger data, engineer features, and export to JSON
"""

import pandas as pd
import numpy as np
import json
from pathlib import Path

# Set up paths
DATA_DIR = Path("data")
CSV_FILE = DATA_DIR / "titanic.csv"
JSON_FILE = DATA_DIR / "titanic_data.json"

# Create data directory if it doesn't exist
DATA_DIR.mkdir(exist_ok=True)

print("Project setup complete!")
print(f"Data directory: {DATA_DIR}")
print(f"CSV file location: {CSV_FILE}")





Project setup complete!
Data directory: data
CSV file location: data\titanic.csv


In [26]:

df = pd.read_csv(CSV_FILE)

print(f"Dataset loaded successfully! Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())


Dataset loaded successfully! Shape: (891, 12)

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 175

In [27]:
# Calculating Descriptive Statistics


# Select numeric columns only
numeric_columns = df.select_dtypes(include=['number'])

# Calculate statistics (.mean, median, std)
# Calculate mean
mean_values = numeric_columns.mean()

# Calculate median
median_values = numeric_columns.median()

# Calculate standard deviation
std_values = numeric_columns.std()

print(numeric_columns.agg(['mean', 'median', 'std']))
# print(mean_values, median_values, std_values)


        PassengerId  Survived    Pclass        Age     SibSp     Parch  \
mean     446.000000  0.383838  2.308642  29.699118  0.523008  0.381594   
median   446.000000  0.000000  3.000000  28.000000  0.000000  0.000000   
std      257.353842  0.486592  0.836071  14.526497  1.102743  0.806057   

             Fare  
mean    32.204208  
median  14.454200  
std     49.693429  


In [28]:
# Count missing values
print("\n" + "="*50)
print("MISSING VALUES ANALYSIS")
print("="*50)

missing_data = {}

for col in df.columns:
    missing_count = df[col].isna().sum()
    missing_percent = (missing_count / len(df)) * 100

    missing_data[col] = {
        "missing_count": missing_count,
        "missing_percent": missing_percent
    }
# Create a DataFrame for better visualization   
missing_df = pd.DataFrame(missing_data).T
# 3. Sort and inspect
missing_df = missing_df.sort_values(
    by="missing_count", 
    ascending=False
)

print(missing_df)




MISSING VALUES ANALYSIS
             missing_count  missing_percent
Cabin                687.0        77.104377
Age                  177.0        19.865320
Embarked               2.0         0.224467
PassengerId            0.0         0.000000
Name                   0.0         0.000000
Pclass                 0.0         0.000000
Survived               0.0         0.000000
Sex                    0.0         0.000000
Parch                  0.0         0.000000
SibSp                  0.0         0.000000
Fare                   0.0         0.000000
Ticket                 0.0         0.000000


In [29]:
# Create a copy of the dataframe for feature engineering
df_features = df.copy()

# Feature 1: Family Size
df_features['FamilySize'] = df_features['SibSp'] + df_features['Parch'] + 1
print(df_features[['SibSp', 'Parch', 'FamilySize']].head(10))

# Feature 2: Is Alone
df_features['IsAlone'] = (df_features['FamilySize'] == 1).astype(int)
print(df_features[['FamilySize', 'IsAlone']].head(10))

# Feature 3: Age Groups
def categorize_age(age):
    """Categorize age into groups"""
    if pd.isna(age):
        return 'Unknown'
    elif age < 18:
        return 'Child'
    elif age < 30:
        return 'Young Adult'
    elif age < 50:
        return 'Middle Age Adult'
    else:
        return 'Senior'

df_features['AgeGroup'] = df_features['Age'].apply(categorize_age)
print(df_features[['Age', 'AgeGroup']].head(10))

# Analyze feature differences between survivors and non-survivors 
print("\n" + "="*50)
print("FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED")
print("="*50)

# Family Size by Survival - Do survivors tend to have larger or smaller families?
print("\nFamily Size by Survival:")
family_survival = df_features.groupby('Survived')['FamilySize'].agg(['mean', 'median', 'std'])
print(family_survival)

# Feature Differentiation Analysis: Statistical test: Do these features help differentiate?
print("\n" + "="*50)
print("FEATURE DIFFERENTIATION ANALYSIS")
print("="*50)

survived = df_features[df_features['Survived'] == 1]
not_survived = df_features[df_features['Survived'] == 0]

print("\nFamily Size:")
print(f"  Survived mean: {survived['FamilySize'].mean():.2f}")
print(f"  Not Survived mean: {not_survived['FamilySize'].mean():.2f}")
print(f"  Difference: {abs(survived['FamilySize'].mean() - not_survived['FamilySize'].mean()):.2f}")


   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
5      0      0           1
6      0      0           1
7      3      1           5
8      0      2           3
9      1      0           2
   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1
5           1        1
6           1        1
7           5        0
8           3        0
9           2        0
    Age          AgeGroup
0  22.0       Young Adult
1  38.0  Middle Age Adult
2  26.0       Young Adult
3  35.0  Middle Age Adult
4  35.0  Middle Age Adult
5   NaN           Unknown
6  54.0            Senior
7   2.0             Child
8  27.0       Young Adult
9  14.0             Child

FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED

Family Size by Survival:
              mean  median       std
Survived                            
0        

In [None]:
import json
from datetime import datetime
import pandas as pd
#Attempt 23:36 on 2024-06-05

# Step 3: Create Classes for JSON Export

class Passenger:
    """
    Represents a passenger with all their information.
    """
    def __init__(self, passenger_id, name, age, sex, survived, pclass,
                 fare, embarked=None, family_size=None, is_alone=None, title=None):

        self.passenger_id = int(passenger_id) if pd.notna(passenger_id) else None
        self.name = str(name) if pd.notna(name) else None
        self.age = float(age) if pd.notna(age) else None
        self.sex = str(sex) if pd.notna(sex) else None
        self.survived = int(survived) if pd.notna(survived) else None
        self.pclass = int(pclass) if pd.notna(pclass) else None
        self.fare = float(fare) if pd.notna(fare) else None
        self.embarked = str(embarked) if pd.notna(embarked) else None
        self.family_size = int(family_size) if pd.notna(family_size) else None

        if pd.notna(is_alone):
            self.is_alone = bool(int(is_alone)) if str(is_alone).isdigit() else bool(is_alone)
        else:
            self.is_alone = None

        self.title = str(title) if pd.notna(title) else None

    def to_dict(self):
        """Convert passenger to dictionary for JSON serialization."""
        return {
            'passenger_id': self.passenger_id,
            'name': self.name,
            'age': self.age,
            'sex': self.sex,
            'survived': self.survived,
            'pclass': self.pclass,
            'fare': self.fare,
            'embarked': self.embarked,
            'family_size': self.family_size,
            'is_alone': self.is_alone,
            'title': self.title,
        }


class TitanicDataset:
    """
    Represents the entire Titanic dataset with methods for JSON export.
    """
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.passengers = []
        self._create_passengers()

    def _create_passengers(self):
        """Create Passenger objects from dataframe."""
        for idx, row in self.dataframe.iterrows():
            passenger = Passenger(
                passenger_id=row.get('PassengerId', idx),
                name=row.get('Name'),
                age=row.get('Age'),
                sex=row.get('Sex'),
                survived=row.get('Survived'),
                pclass=row.get('Pclass'),
                fare=row.get('Fare'),
                embarked=row.get('Embarked'),
                family_size=row.get('FamilySize'),  # ✅ updated
                is_alone=row.get('IsAlone'),          # ✅ updated
                title=row.get('Title')
            )
            self.passengers.append(passenger)
    
    def analyze_missing_values(self):
        """Return missing value counts per column."""
        return self.dataframe.isna().sum().to_dict()

    def to_json(self, filename='titanic_data.json'):
        """Export dataset to JSON file."""
        survival_rate = (
            float(self.dataframe['Survived'].mean())
            if 'Survived' in self.dataframe.columns
            else None
        )

        data = {
            'metadata': {
                'dataset_name': 'Titanic Passenger Dataset',
                'export_date': datetime.now().isoformat(),
                'total_passengers': len(self.passengers),
                'survival_rate': survival_rate,
                'columns': list(self.dataframe.columns),
                'missing_values_by_column': self.analyze_missing_values(),

            },
            'passengers': [p.to_dict() for p in self.passengers]
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"Data exported to {filename}")
        return data

    def get_summary_stats(self):
        """Get summary statistics."""
        total = len(self.passengers)

        survived = sum(1 for p in self.passengers if p.survived == 1)
        not_survived = sum(1 for p in self.passengers if p.survived == 0)

        ages = [p.age for p in self.passengers if p.age is not None]
        fares = [p.fare for p in self.passengers if p.fare is not None]

        return {
            'total_passengers': total,
            'survived': survived,
            'did_not_survive': not_survived,
            'survival_rate': survived / total if total else None,
            'average_age': sum(ages) / len(ages) if ages else None,
            'average_fare': sum(fares) / len(fares) if fares else None,
        }


# ---- Run & Checkpoint ----
# if 'df_engineered' in locals() and not df_engineered.empty:
if 'df_features' in locals() and not df_features.empty:
    # dataset = TitanicDataset(df_engineered)
    dataset = TitanicDataset(df_features)
    
    print("Dataset loaded successfully.")
    print(f"Passengers: {len(dataset.passengers)}")

    stats = dataset.get_summary_stats()
    print("\nSummary statistics:")
    for key, value in stats.items():
        print(f"{key}: {value}")

    filename = 'titanic_data.json'
    dataset.to_json(filename)

    with open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
        print(f"\nLoaded JSON data with {len(json_data['passengers'])} passengers.")
        print(f"JSON Metadata: {json_data['metadata']}")
        print(f"First passenger in JSON: {json_data['passengers'][0]}")
        print(f"Last passenger in JSON: {json_data['passengers'][-1]}")
        
        assert len(json_data['passengers']) == len(dataset.passengers), "Passenger count mismatch!"
        print('verification passed.')

    print("\nJSON validation: PASSED ✅")
else:
    print("df_features not found or empty.") # df_engineered


Dataset loaded successfully.
Passengers: 891

Summary statistics:
total_passengers: 891
survived: 342
did_not_survive: 549
survival_rate: 0.3838383838383838
average_age: 29.69911764705882
average_fare: 32.204207968574636
Data exported to titanic_data.json

Loaded JSON data with 891 passengers.
JSON Metadata: {'dataset_name': 'Titanic Passenger Dataset', 'export_date': '2026-02-05T00:43:35.285652', 'total_passengers': 891, 'survival_rate': 0.3838383838383838, 'columns': ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone', 'AgeGroup'], 'missing_values_by_column': {'PassengerId': 0, 'Survived': 0, 'Pclass': 0, 'Name': 0, 'Sex': 0, 'Age': 177, 'SibSp': 0, 'Parch': 0, 'Ticket': 0, 'Fare': 0, 'Cabin': 687, 'Embarked': 2, 'FamilySize': 0, 'IsAlone': 0, 'AgeGroup': 0}}
First passenger in JSON: {'passenger_id': 1, 'name': 'Braund, Mr. Owen Harris', 'age': 22.0, 'sex': 'male', 'survived': 0, 'pclass': 3, 'fa