<a href="https://colab.research.google.com/github/A-purv-Ai/isro-analysis/blob/main/02_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Block 15: Setup & Load Raw Data

# Mount Google Drive at /content/drive so files stored in Drive can be read
# through the regular filesystem (Colab-only; requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import pandas as pd
import numpy as np
import warnings
# Blanket filter: silences every warning category from this point on,
# including deprecation notices emitted by pandas/numpy.
warnings.filterwarnings('ignore')

print("=" * 80)
print("BLOCK 15: SETUP & LOAD RAW DATA")
print("=" * 80)

# Define paths
BASE_DIR = '/content/drive/MyDrive/ISRO_Launch_Risk_Prediction'
RAW_DATA_PATH = f'{BASE_DIR}/data/raw/isro_launch_history_raw.csv'

# Load data.
# Only the read itself sits inside `try`, so an error raised while printing
# the summary is not misreported as a load failure.
# NOTE: on failure the error is printed and execution continues (best-effort),
# which leaves `df_raw` undefined; later cells that use it will raise NameError.
try:
    df_raw = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Raw data file not found at {RAW_DATA_PATH}")
    print("Please run 01_Data_Acquisition.ipynb first!")
except Exception as e:
    # Any other read problem (parse error, empty file, permissions, ...).
    print(f"ERROR: {e}")
else:
    print("\n Successfully loaded raw data")
    # The multiplication sign was previously mojibake ("Ã—") in the output.
    print(f"   Shape: {df_raw.shape[0]} rows × {df_raw.shape[1]} columns")
    # deep=True also counts the contents of object (string) columns.
    print(f"   Memory usage: {df_raw.memory_usage(deep=True).sum() / 1024:.2f} KB")

    # Display first few rows
    print("\n First 3 records:")
    print(df_raw.head(3).to_string())

print("\n" + "=" * 80)


In [None]:
# @title Block 16: Initial Data Inspection

# Section banner: an 80-character rule above and below the title.
print("=" * 80, "BLOCK 16: INITIAL DATA INSPECTION", "=" * 80, sep="\n")

# Column dtypes as inferred when the CSV was read.
print("\n COLUMN INFORMATION:")
print(df_raw.dtypes)

# Missing-value report: null count per column plus its share of all rows (in %).
print("\n MISSING VALUES:")
missing = df_raw.isnull().sum()
missing_pct = (missing / len(df_raw) * 100).round(2)

# Build the summary table, keep only columns that actually have gaps,
# and list the worst offenders first.
missing_df = (
    pd.DataFrame({
        'Column': missing.index,
        'Missing_Count': missing.values,
        'Missing_Percentage': missing_pct.values,
    })
    .loc[lambda frame: frame['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found! ")
else:
    print(missing_df.to_string(index=False))

# Count rows that are exact repeats of an earlier row.
duplicates = df_raw.duplicated().sum()
print(f"\n DUPLICATE ROWS: {duplicates}")

# Cardinality of every column (nunique() leaves NaN out of the count).
print("\n UNIQUE VALUE COUNTS:")
for col, column_values in df_raw.items():
    n_unique = column_values.nunique()
    print("   {:30s}: {:4d} unique values".format(col, n_unique))

# Frequency tables for the two headline categorical columns.
print("\n KEY CATEGORICAL DISTRIBUTIONS:")
# Headings are emitted exactly as before (their leading spacing differs).
for heading, column in (
    ("\n Rocket Type:", 'rocket_type'),
    ("\n   Launch Outcome:", 'launch_outcome'),
):
    print(heading)
    print(df_raw[column].value_counts().to_string())

# Closing rule for this block's output.
print(f"\n{'=' * 80}")
