In [13]:
# ==============================
# SECTION 1: ENVIRONMENT SETUP
# ==============================

import pandas as pd
import numpy as np

# Report that the shared analysis libraries are available to the cells below.
setup_message = "[INFO] Core libraries (pandas, numpy) imported successfully."
print(setup_message)


[INFO] Core libraries (pandas, numpy) imported successfully.


In [14]:
# ==============================
# SECTION 2: DATASET LOADING
# ==============================
# Reads the Titanic CSV into `df`, then checks that the frame is non-empty and
# carries every expected column before reporting its structure as valid.

DATASET_NAME = "Titanic Dataset"
DATASET_PATH = "../Datasets/titanic.csv"

# The twelve columns that the later sections of this notebook describe.
EXPECTED_COLUMNS = [
    "PassengerId",
    "Survived",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
    "Embarked",
]

# Only the read itself is guarded, so the "[ERROR] Failed to load" message is
# printed for load failures and not for structural problems found afterwards.
try:
    df = pd.read_csv(DATASET_PATH)
except Exception:
    print("[ERROR] Failed to load dataset.")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

print(f"[SUCCESS] {DATASET_NAME} loaded successfully from: {DATASET_PATH}")

# Perform the validation that the message below claims, instead of printing it
# unconditionally: fail loudly if the file is empty or columns are missing.
missing_columns = [name for name in EXPECTED_COLUMNS if name not in df.columns]
if df.empty or missing_columns:
    raise ValueError(
        f"{DATASET_NAME} at {DATASET_PATH} does not have the expected structure "
        f"(rows: {len(df)}, missing columns: {missing_columns})."
    )
print("[VALIDATION] Dataset loaded into DataFrame with expected structure.")


[SUCCESS] Titanic Dataset loaded successfully from: ../Datasets/titanic.csv
[VALIDATION] Dataset loaded into DataFrame with expected structure.


In [15]:
# ==============================
# SECTION 3: DATASET STRUCTURE
# ==============================
# Report the dimensions of the loaded DataFrame, followed by a short reading
# of what those dimensions mean for the analysis.

rows = df.shape[0]
cols = df.shape[1]

print(
    "[STRUCTURE] Dataset dimensionality:",
    f"  ➤ Number of Rows    : {rows}",
    f"  ➤ Number of Columns : {cols}",
    sep="\n",
)

print(
    "[INTERPRETATION]",
    "- Each row represents a passenger.",
    "- Dataset size is sufficient for exploratory analysis and baseline supervised ML models.",
    sep="\n",
)


[STRUCTURE] Dataset dimensionality:
  ➤ Number of Rows    : 891
  ➤ Number of Columns : 12
[INTERPRETATION]
- Each row represents a passenger.
- Dataset size is sufficient for exploratory analysis and baseline supervised ML models.


In [16]:
# ==============================
# SECTION 3: DATASET STRUCTURE
# ==============================
# NOTE(review): this cell repeats the Section 3 cell directly above it
# (same code, same output); consider deleting one of the two.

rows, cols = df.shape

# Assemble the whole report first, then emit it in one write.
structure_report = "\n".join(
    [
        "[STRUCTURE] Dataset dimensionality:",
        f"  ➤ Number of Rows    : {rows}",
        f"  ➤ Number of Columns : {cols}",
        "[INTERPRETATION]",
        "- Each row represents a passenger.",
        "- Dataset size is sufficient for exploratory analysis and baseline supervised ML models.",
    ]
)
print(structure_report)


[STRUCTURE] Dataset dimensionality:
  ➤ Number of Rows    : 891
  ➤ Number of Columns : 12
[INTERPRETATION]
- Each row represents a passenger.
- Dataset size is sufficient for exploratory analysis and baseline supervised ML models.


In [17]:
# ==============================
# SECTION 4: DATA PREVIEW
# ==============================
# Show the first and last five rows of the DataFrame, each under its own caption.

print("[PREVIEW] First 5 records:")
leading_rows = df.head(5)
display(leading_rows)

print("[PREVIEW] Last 5 records:")
trailing_rows = df.tail(5)
display(trailing_rows)


[PREVIEW] First 5 records:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


[PREVIEW] Last 5 records:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [18]:
# ==============================
# SECTION 5: SCHEMA INFORMATION
# ==============================
# Print the DataFrame's column names, dtypes, and non-null counts, followed by
# the author's reading of that summary.

print("[SCHEMA] Column names, data types, and non-null counts:")
df.info()  # writes its summary directly to standard output

print(
    "[DATA QUALITY INTERPRETATION]",
    "- Numerical and categorical features are present.",
    "- Missing values detected in Age, Cabin, and Embarked.",
    "- Target variable 'Survived' has no missing values.",
    sep="\n",
)


[SCHEMA] Column names, data types, and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
[DATA QUALITY INTERPRETATION]
- Numerical and categorical features are present.
- Missing values detected in Age, Cabin, and Embarked.
- Target variable 'Survived' has no missing values.


In [19]:
# ==============================
# SECTION 6: STATISTICAL SUMMARY
# ==============================
# Display descriptive statistics for the DataFrame, followed by the author's
# interpretation of the Age and Fare columns.

print("[STATISTICS] Descriptive statistics:")
summary_table = df.describe()
display(summary_table)

interpretation_lines = (
    "[INTERPRETATION]",
    "- Age shows a wide range requiring normalization.",
    "- Fare distribution is right-skewed and may benefit from transformation.",
)
print(*interpretation_lines, sep="\n")


[STATISTICS] Descriptive statistics:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


[INTERPRETATION]
- Age shows a wide range requiring normalization.
- Fare distribution is right-skewed and may benefit from transformation.


In [20]:
# ==============================
# SECTION 7: FEATURE TYPE CLASSIFICATION
# ==============================
# Hand-assigned type label for each column, listed in the dataset's column order.

feature_types = dict(
    [
        ("PassengerId", "Numerical (Identifier)"),
        ("Survived", "Binary (Target)"),
        ("Pclass", "Ordinal"),
        ("Name", "Categorical"),
        ("Sex", "Categorical"),
        ("Age", "Numerical"),
        ("SibSp", "Numerical"),
        ("Parch", "Numerical"),
        ("Ticket", "Categorical"),
        ("Fare", "Numerical"),
        ("Cabin", "Categorical"),
        ("Embarked", "Categorical"),
    ]
)

print("[FEATURE TYPES]")
# Column names are padded to 12 characters so the labels line up in one column.
for column_name, type_label in feature_types.items():
    print(f"  ➤ {column_name:12s}: {type_label}")


[FEATURE TYPES]
  ➤ PassengerId : Numerical (Identifier)
  ➤ Survived    : Binary (Target)
  ➤ Pclass      : Ordinal
  ➤ Name        : Categorical
  ➤ Sex         : Categorical
  ➤ Age         : Numerical
  ➤ SibSp       : Numerical
  ➤ Parch       : Numerical
  ➤ Ticket      : Categorical
  ➤ Fare        : Numerical
  ➤ Cabin       : Categorical
  ➤ Embarked    : Categorical


In [21]:
# ==============================
# SECTION 8: DATA QUALITY CHECK
# ==============================
# Count missing values per column and show only the columns that have any.

missing = df.isna().sum()
print("[DATA QUALITY] Missing values per column:")
has_missing = missing.gt(0)
display(missing.loc[has_missing])

print(
    "[INTERPRETATION]",
    "- Cabin has extensive missing values and may be excluded.",
    "- Age requires imputation.",
    "- Embarked has minimal missing values.",
    sep="\n",
)


[DATA QUALITY] Missing values per column:


Age         177
Cabin       687
Embarked      2
dtype: int64

[INTERPRETATION]
- Cabin has extensive missing values and may be excluded.
- Age requires imputation.
- Embarked has minimal missing values.


In [22]:
# ==============================
# SECTION 9: TARGET VARIABLE ANALYSIS
# ==============================
# State the target column and problem type, then show the relative frequency
# of each value in the 'Survived' column.

print(
    "[TARGET ANALYSIS]",
    "- Target Variable: Survived",
    "- Problem Type: Binary Classification",
    sep="\n",
)

print("[TARGET DISTRIBUTION]")
# normalize=True returns proportions rather than raw counts.
survived_share = df["Survived"].value_counts(normalize=True)
display(survived_share)

print("[NOTE] Target variable is complete and reliable for supervised learning.")


[TARGET ANALYSIS]
- Target Variable: Survived
- Problem Type: Binary Classification
[TARGET DISTRIBUTION]


Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

[NOTE] Target variable is complete and reliable for supervised learning.


In [23]:
# ==============================
# SECTION 10: ML READINESS ASSESSMENT
# ==============================
# Fixed-text summary of the author's readiness assessment; nothing here is
# computed from the DataFrame.

readiness_summary = "\n".join(
    [
        "[ML READINESS SUMMARY]",
        "- Supervised Learning     : Supported",
        "- Missing Value Handling  : Required",
        "- Categorical Encoding    : Required",
        "- Class Imbalance         : Moderate",
        "- Dataset Size            : Suitable for baseline models",
    ]
)
print(readiness_summary)

print(
    "[ARCHITECTURAL CONCLUSION]",
    "The Titanic dataset is suitable for supervised learning after standard preprocessing steps.",
    sep="\n",
)


[ML READINESS SUMMARY]
- Supervised Learning     : Supported
- Missing Value Handling  : Required
- Categorical Encoding    : Required
- Class Imbalance         : Moderate
- Dataset Size            : Suitable for baseline models
[ARCHITECTURAL CONCLUSION]
The Titanic dataset is suitable for supervised learning after standard preprocessing steps.


In [24]:
# ==============================
# SECTION 11: CONCLUSION
# ==============================
# Closing remarks, printed as fixed text.

print(
    "[FINAL CONCLUSION]",
    "- Dataset structure and data types are well understood.",
    "- Key data quality issues have been identified.",
    "- Dataset is ready for further ML experimentation.",
    sep="\n",
)


[FINAL CONCLUSION]
- Dataset structure and data types are well understood.
- Key data quality issues have been identified.
- Dataset is ready for further ML experimentation.
