# Titanic — Data Overview

**Purpose:** Load the Titanic train and test datasets, run sanity checks, and build and save a short metadata summary.  
**Outcome:** A clear list of missing values, data types, and basic anomalies, plus a short action list for EDA.


In [2]:
# Setup 

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil

# Use seaborn's "whitegrid" style for any plots drawn in this notebook.
sns.set(style="whitegrid")
# Let pandas show up to 100 columns and 200 rows before truncating displayed tables.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 200)

In [20]:
# Read the three CSV files from the data folder (relative path).
data_dir = "../data"
train, test, gender_sub = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

# Report (rows, columns) of each frame as a first sanity check.
print("Train shape:", train.shape)
print("Test shape :", test.shape)
print("Gender sub:", gender_sub.shape)


Train shape: (891, 12)
Test shape : (418, 11)
Gender sub: (418, 2)


In [9]:
# Peek at the first five and last three rows of the training data.
display(train.head())
display(train.tail(3))

# Schema overview, then summary statistics for numeric and object columns separately.
print("\nTrain info:")
train.info()
print("\nTrain describe (numerical):")
display(train.describe().transpose())
print("\nTrain describe (object):")
display(train.describe(include='object').transpose())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q



Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Train describe (numerical):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292



Train describe (object):


Unnamed: 0,count,unique,top,freq
Name,891,891,"Dooley, Mr. Patrick",1
Sex,891,2,male,577
Ticket,891,681,347082,7
Cabin,204,147,G6,4
Embarked,889,3,S,644


In [10]:
def missing_table(df):
    """Return per-column missing-value counts and percentages for ``df``.

    The result is indexed by column name, has the columns ``missing_count``
    and ``percent_missing`` (share of rows, 0-100), and is sorted so the
    columns with the most missing values come first.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    table = pd.DataFrame({
        'missing_count': null_counts,
        'percent_missing': (null_counts / n_rows) * 100,
    })
    return table.sort_values('missing_count', ascending=False)

# Missing counts and percentages per column for the training set, most-missing first.
print("Train missing values:")
display(missing_table(train))

# Same table for the test set, so gaps in the two splits can be compared.
print("\nTest missing values:")
display(missing_table(test))


Train missing values:


Unnamed: 0,missing_count,percent_missing
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
PassengerId,0,0.0
Name,0,0.0
Pclass,0,0.0
Survived,0,0.0
Sex,0,0.0
Parch,0,0.0
SibSp,0,0.0



Test missing values:


Unnamed: 0,missing_count,percent_missing
Cabin,327,78.229665
Age,86,20.574163
Fare,1,0.239234
Name,0,0.0
Pclass,0,0.0
PassengerId,0,0.0
Sex,0,0.0
Parch,0,0.0
SibSp,0,0.0
Ticket,0,0.0


In [11]:
# Schema comparison: which columns exist in only one of the two splits?
train_cols = set(train.columns)
test_cols  = set(test.columns)
print("Columns in train but not in test:", sorted(train_cols.difference(test_cols)))
print("Columns in test but not in train:", sorted(test_cols.difference(train_cols)))

# Target distribution: absolute counts (including any NaN) first, then class shares.
print("\nTarget distribution (train):")
display(
    train['Survived'].value_counts(dropna=False),
    train['Survived'].value_counts(normalize=True).rename("proportion"),
)


Columns in train but not in test: ['Survived']
Columns in test but not in train: []

Target distribution (train):


Survived
0    549
1    342
Name: count, dtype: int64

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [12]:
# Identifier check: count PassengerId values that repeat an earlier row within each split.
print("Duplicate PassengerIds in train:", train.duplicated(subset='PassengerId').sum())
print("Duplicate PassengerIds in test :", test.duplicated(subset='PassengerId').sum())

# Whole-row check: count rows that exactly repeat an earlier row.
print("Fully duplicated rows in train:", train.duplicated().sum())
print("Fully duplicated rows in test :", test.duplicated().sum())


Duplicate PassengerIds in train: 0
Duplicate PassengerIds in test : 0
Fully duplicated rows in train: 0
Fully duplicated rows in test : 0


In [13]:
def build_metadata(df, n_samples=5):
    """Summarise every column of ``df`` in a one-row-per-column metadata table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    n_samples : int, default 5
        Maximum number of distinct non-null example values listed per column.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``dtype`` (as a string),
        ``n_non_null``, ``n_missing``, ``pct_missing`` (0-100, rounded to two
        decimals), ``n_unique`` and ``sample_values`` (example values joined
        with ", ").

    Raises
    ------
    ValueError
        If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    # Compute the null mask once and reuse it for both the count and the percentage.
    is_missing = df.isnull()
    meta = pd.DataFrame({
        'dtype': df.dtypes.astype(str),
        'n_non_null': df.count(),
        'n_missing': is_missing.sum(),
        'pct_missing': (is_missing.mean() * 100).round(2),
        'n_unique': df.nunique(),
    })
    # Sample values (up to n_samples distinct non-null ones) for quick inspection.
    # The list follows df.columns order, which is also the row order of `meta`.
    meta['sample_values'] = [
        ", ".join(map(str, df[col].dropna().unique()[:n_samples]))
        for col in df.columns
    ]
    return meta

# Build the per-column metadata summary for the training set and show it.
train_meta = build_metadata(train)
display(train_meta)


Unnamed: 0,dtype,n_non_null,n_missing,pct_missing,n_unique,sample_values
PassengerId,int64,891,0,0.0,891,"1, 2, 3, 4, 5"
Survived,int64,891,0,0.0,2,"0, 1"
Pclass,int64,891,0,0.0,3,"3, 1, 2"
Name,object,891,0,0.0,891,"Braund, Mr. Owen Harris, Cumings, Mrs. John Br..."
Sex,object,891,0,0.0,2,"male, female"
Age,float64,714,177,19.87,88,"22.0, 38.0, 26.0, 35.0, 54.0"
SibSp,int64,891,0,0.0,7,"1, 0, 3, 4, 2"
Parch,int64,891,0,0.0,7,"0, 1, 2, 5, 3"
Ticket,object,891,0,0.0,681,"A/5 21171, PC 17599, STON/O2. 3101282, 113803,..."
Fare,float64,891,0,0.0,248,"7.25, 71.2833, 7.925, 53.1, 8.05"


In [14]:
# List every numeric column for reference.
num_cols = list(train.select_dtypes(include=np.number).columns)
print("Numeric columns:", num_cols)

# Range and centre of Age and Fare, plus counts of zero and negative entries.
for col in ['Age','Fare']:
    if col not in train.columns:
        continue
    print(f"\n{col} stats:")
    print("  min:", train[col].min(), " max:", train[col].max(), " mean:", train[col].mean(), " median:", train[col].median())
    print("  zeros:", train[col].eq(0).sum(), " negative values:", train[col].lt(0).sum())

Numeric columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Age stats:
  min: 0.42  max: 80.0  mean: 29.69911764705882  median: 28.0
  zeros: 0  negative values: 0

Fare stats:
  min: 0.0  max: 512.3292  mean: 32.204207968574636  median: 14.4542
  zeros: 15  negative values: 0


In [15]:
# Example rows where Age was not recorded.
if 'Age' in train.columns:
    print("\nRows with missing Age (train):", train['Age'].isna().sum())
    display(train.loc[train['Age'].isna()].head())

# Example rows where a Cabin value is present (the minority of rows).
if 'Cabin' in train.columns:
    print("\nRows with Cabin present (train):", train['Cabin'].notna().sum())
    display(train.loc[train['Cabin'].notna()].head())


Rows with missing Age (train): 177


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q



Rows with Cabin present (train): 204


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [16]:
# List the object-dtype columns for reference.
cat_cols = list(train.select_dtypes(include='object').columns)
print("Categorical columns:", cat_cols)

# Level counts (NaN included) for three low-cardinality features.
for col in ['Sex','Pclass','Embarked']:
    if col not in train.columns:
        continue
    print(f"\n{col} value counts:")
    display(train[col].value_counts(dropna=False))

Categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

Sex value counts:


Sex
male      577
female    314
Name: count, dtype: int64


Pclass value counts:


Pclass
3    491
1    216
2    184
Name: count, dtype: int64


Embarked value counts:


Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64

In [19]:
# Save the metadata summary for future reference, creating the outputs folder if needed.
output_dir = "../outputs"
os.makedirs(output_dir, exist_ok=True)
train_meta.to_csv(os.path.join(output_dir, "train_metadata.csv"))

# Keep a copy of the source CSVs for provenance. The target is anchored on
# data_dir so the copies land in <data_dir>/raw, consistent with the other
# "../"-relative paths, rather than in a new data/ folder created in the
# notebook's working directory.
raw_dir = os.path.join(data_dir, "raw")
os.makedirs(raw_dir, exist_ok=True)
for csv_name in ("train.csv", "test.csv", "gender_submission.csv"):
    shutil.copy(os.path.join(data_dir, csv_name), os.path.join(raw_dir, csv_name))

print("Saved metadata to outputs/train_metadata.csv and copied raw files to data/raw/")


Saved metadata to outputs/train_metadata.csv and copied raw files to data/raw/


### 📝 Quick Findings

- The training set has **891 rows and 12 columns**; the test set has 418 rows and 11 columns (it lacks `Survived`).  
- **No duplicated rows** and no duplicate `PassengerId` values were found in train or test.  
- **Data types**:  
  - Five columns are `int64` and two (`Age`, `Fare`) are `float64` (numeric); the other five are `object` (categorical/string).  
- **Missing values**:  
  - `Cabin`: ~77% missing.  
  - `Age`: ~20% missing.  
  - `Embarked`: ~0.2% missing (2 rows).  
  - Other train columns are complete; in the test set, `Fare` has 1 missing value.  
- **Unique values**:  
  - `PassengerId` is unique (identifier).  
  - `Sex`: {male, female}.  
  - `Embarked`: {S, C, Q}.  
  - `Pclass`: values {1, 2, 3}.  
  - `Name`, `Ticket`, and `Cabin` have many unique entries.  
- **Important notes**:  
  - Target column is **`Survived`** (0 = did not survive, 1 = survived).  
  - Columns like `Name`, `Ticket`, and `Cabin` may need feature engineering before modeling.  

