In [5]:
# ================================
# TASK 1: DATASET UNDERSTANDING
# Titanic Dataset | One-Shot Program
# ================================

import pandas as pd
import numpy as np
import os

# Announce the start of the run. The emoji in the status messages had been
# stored as mis-decoded bytes (mojibake); they are restored to the intended
# characters here.
print("🚀 Task 1: Understanding Dataset & Data Types Started")

# -------------------------------
# 1. CREATE PROJECT STRUCTURE
# -------------------------------
# Directories that the rest of the script writes into:
#   data/    - local copy of the downloaded CSV
#   outputs/ - generated CSV / TXT artefacts
#   report/  - Markdown analysis report
folders = ["data", "outputs", "report"]
for folder in folders:
    # exist_ok=True keeps the cell re-runnable: an existing directory is
    # left untouched instead of raising FileExistsError.
    os.makedirs(folder, exist_ok=True)

print("📁 Project folders created")

# -------------------------------
# 2. LOAD DATASET
# -------------------------------
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
local_csv = "data/titanic.csv"

try:
    # Primary path: download the CSV straight into a DataFrame.
    df = pd.read_csv(url)
except OSError:
    # Network failures surface as OSError subclasses (urllib's URLError /
    # HTTPError, socket timeouts, SSL errors). If an earlier run already
    # saved a copy, use it so the analysis still works offline; otherwise
    # re-raise the original error unchanged.
    if not os.path.exists(local_csv):
        raise
    df = pd.read_csv(local_csv)
    print("Download failed; using cached copy at data/titanic.csv")
else:
    # Keep a local copy of the raw data next to the generated outputs.
    df.to_csv(local_csv, index=False)

print("📊 Titanic dataset loaded and saved")

# -------------------------------
# 3. HEAD & TAIL RECORDS
# -------------------------------
# Stack the first five and the last five rows into one preview table and
# save it without the index column.
head_tail = pd.concat((df.iloc[:5], df.iloc[-5:]))
head_tail.to_csv(path_or_buf="outputs/head_tail.csv", index=False)

# -------------------------------
# 4. DATASET INFO
# -------------------------------
from io import StringIO

# DataFrame.info() prints its summary rather than returning it, so it is
# directed into an in-memory text buffer that can then be saved.
buffer = StringIO()
df.info(buf=buffer)

# Write with an explicit encoding: without it the file's encoding depends on
# the platform locale, which can fail on non-ASCII column names.
with open("outputs/dataset_info.txt", "w", encoding="utf-8") as f:
    f.write(buffer.getvalue())

# -------------------------------
# 5. STATISTICAL SUMMARY
# -------------------------------
# include="all" summarises every column, not only the numeric ones.
# The index (the statistic names) is kept in the CSV so each row stays
# labelled.
describe_df = df.describe(include="all")
describe_df.to_csv(path_or_buf="outputs/describe.csv", index=True)

# -------------------------------
# 6. FEATURE TYPE IDENTIFICATION
# -------------------------------
# "number" selects every numeric dtype. Listing only "int64"/"float64"
# silently skipped columns stored as int32, float32, uint8, nullable
# integers, etc.
numerical = df.select_dtypes(include="number").columns.tolist()

# Text columns: either plain object dtype or pandas' dedicated string dtype.
# Checking with the pandas.api.types predicates avoids depending on which of
# the two the installed pandas version uses when reading CSV text.
categorical = [
    col
    for col, dtype in df.dtypes.items()
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
]

# Columns with exactly two distinct non-missing values (nunique ignores NaN).
binary = [col for col in df.columns if df[col].nunique() == 2]

# Ordered categories cannot be detected from dtype alone, so the ordinal
# list is declared by hand.
ordinal = ["Pclass"]

# Save feature classification (explicit UTF-8 so the result does not depend
# on the platform's default encoding).
with open("outputs/feature_types.txt", "w", encoding="utf-8") as f:
    f.write(f"Numerical Features:\n{numerical}\n\n")
    f.write(f"Categorical Features:\n{categorical}\n\n")
    f.write(f"Binary Features:\n{binary}\n\n")
    f.write(f"Ordinal Features:\n{ordinal}\n")

# -------------------------------
# 7. UNIQUE VALUES IN CATEGORICAL
# -------------------------------
# One section per categorical column: the column name followed by the array
# of its distinct values (missing values included, as returned by unique()).
# The file is opened as UTF-8 explicitly because the values are free text
# and the locale's default encoding may be unable to represent them.
with open("outputs/categorical_unique_values.txt", "w", encoding="utf-8") as f:
    for col in categorical:
        f.write(f"{col}:\n{df[col].unique()}\n\n")

# -------------------------------
# 8. TARGET & ML FEATURES
# -------------------------------
# The label column, and every other column as a candidate model input.
target = "Survived"
# Dropping from the column index raises KeyError if the target is absent.
features = df.columns.drop([target]).tolist()

# -------------------------------
# 9. DATASET SIZE & QUALITY
# -------------------------------
rows, cols = df.shape
# Per-column count of missing entries.
missing_values = df.isnull().sum()

# Series.to_string() renders every column's count. Interpolating the Series
# directly used its repr, which appends a "dtype: ..." footer and is
# truncated once the number of rows exceeds pandas' display limit.
ml_readiness_report = f"""
DATASET SIZE
------------
Rows: {rows}
Columns: {cols}

TARGET VARIABLE
---------------
{target}

INPUT FEATURES
--------------
{features}

MISSING VALUES
--------------
{missing_values.to_string()}

ML SUITABILITY
--------------
- Supervised Classification Dataset
- Missing value handling required
- Categorical encoding needed
- Suitable for ML models after preprocessing
"""

# Explicit UTF-8 so the file does not depend on the platform's locale.
with open("outputs/ml_readiness.txt", "w", encoding="utf-8") as f:
    f.write(ml_readiness_report)

# -------------------------------
# 10. DATASET ANALYSIS REPORT
# -------------------------------
# Hand-written Markdown summary. Its content is static text and is not
# derived from the values computed earlier in the script.
# The en dash in the title had been stored as mis-decoded bytes and is
# restored here.
report_text = """
# Dataset Analysis Report – Titanic Dataset

## Dataset Overview
The Titanic dataset consists of passenger details such as age, gender, ticket class, fare, and survival outcome.

## Feature Classification
- Numerical: Age, Fare, SibSp, Parch
- Categorical: Name, Sex, Ticket, Cabin, Embarked
- Binary: Survived, Sex
- Ordinal: Pclass

## Target Variable
Survived (Binary Classification)

## Data Quality Issues
- Missing values in Age, Cabin, and Embarked
- Cabin column has high missing percentage
- Class imbalance in survival data

## Machine Learning Readiness
The dataset is suitable for supervised ML tasks after preprocessing steps such as:
- Handling missing values
- Encoding categorical variables
- Feature scaling if needed

## Conclusion
This dataset is ideal for understanding data types, feature roles, and ML preparation.
"""

# The text contains a non-ASCII character, so the encoding is stated
# explicitly; relying on the locale default can fail or produce a file that
# is not valid UTF-8 Markdown.
with open("report/Dataset_Analysis_Report.md", "w", encoding="utf-8") as f:
    f.write(report_text)

# -------------------------------
# 11. README FILE
# -------------------------------
# Static project README. The en dash in the title had been stored as
# mis-decoded bytes and is restored here.
readme_text = """
# Task 1 – Understanding Dataset & Data Types

## Objective
Analyze dataset structure, data types, feature roles, and ML readiness.

## Dataset
Titanic Dataset

## Tools Used
- Python
- Pandas
- NumPy
- Google Colab

## Deliverables
- Jupyter Notebook
- Dataset Analysis Report
- Saved outputs (CSV & TXT files)

## Key Learning
Understanding dataset structure is the foundation of Machine Learning.
"""

# The text contains a non-ASCII character, so write it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_text)

# -------------------------------
# FINAL MESSAGE
# -------------------------------
# Closing status lines. The emoji had been stored as mis-decoded bytes and
# are restored to the intended characters.
print("✅ TASK COMPLETED SUCCESSFULLY")
print("📂 Outputs saved automatically")
print("📄 Report & README generated")
print("🚀 Ready for GitHub push")


🚀 Task 1: Understanding Dataset & Data Types Started
📁 Project folders created
📊 Titanic dataset loaded and saved
✅ TASK COMPLETED SUCCESSFULLY
📂 Outputs saved automatically
📄 Report & README generated
🚀 Ready for GitHub push
