In [31]:
import yaml

# Parse the YAML configuration file into `config`.
# NOTE(review): '/config.yaml' is an absolute path at the filesystem root —
# confirm that is intended rather than a path relative to the project.
with open('/config.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Example usage: read the value stored under the 'data' -> 'raw' keys.
raw_data_path = config['data']['raw']


ModuleNotFoundError: No module named 'yaml'

In [32]:
# Cell 1: Import libraries and load data
import pandas as pd
import numpy as np
import warnings

# Install a blanket 'ignore' filter so no warnings are shown from here on.
warnings.filterwarnings('ignore')

# Read the raw flight CSV; the path is relative to the current working directory.
df = pd.read_csv('../data/raw/flight_data.csv')

# Confirm the load and report the (rows, columns) shape of the frame.
print(
    "Dataset loaded successfully!",
    f"Dataset shape: {df.shape}",
    sep="\n",
)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Cell 2: Basic data inspection
# Report the frame's dimensions (row count with thousands separators),
# followed by the header for the column listing.
print(
    "=== BASIC DATA INFORMATION ===",
    f"Number of rows: {df.shape[0]:,}",
    f"Number of columns: {df.shape[1]}",
    "\n=== COLUMN NAMES ===",
    sep="\n",
)

# Column labels as a plain Python list.
print(df.columns.tolist())


: 

In [None]:
# Cell 3: Data types and info
print("=== DATA TYPES AND INFO ===")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after the report.
df.info()

print("\n=== FIRST 5 ROWS ===")
# Left as the cell's last expression so the notebook renders it as a table.
df.head()


: 

In [None]:
# Cell 4: Missing values analysis
print("=== MISSING VALUES ANALYSIS ===")

# Null count per column, and that count expressed as a percentage of len(df).
missing_values = df.isnull().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# One summary row per column of df, ordered by descending missing count.
missing_df = pd.DataFrame(
    {
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percentage': missing_percentage.values,
    }
)
missing_df = missing_df.sort_values('Missing_Count', ascending=False)

# Display only the columns that have at least one missing entry.
print(missing_df.loc[missing_df['Missing_Count'] > 0])


: 

In [None]:
# Cell 5: Basic statistics
print("=== BASIC STATISTICS ===")

# Labels of the columns whose dtype is numeric.
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Summary statistics restricted to those numeric columns.
print(df.loc[:, numerical_cols].describe())


: 

In [None]:
# Cell 6: Check for duplicates
print("=== DUPLICATE ANALYSIS ===")

# Number of rows flagged by DataFrame.duplicated().
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Only rebind `df` when there is something to drop; note that this replaces
# the frame loaded earlier, so later cells see the de-duplicated data.
if duplicates > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")


: 

In [None]:
# Cell 7: Data quality checks
print("=== DATA QUALITY CHECKS ===")

# Flight counts below zero.
print("Negative values in arr_flights:", df['arr_flights'].lt(0).sum())

# Delays beyond +/- 1440 (the messages describe 1440 minutes as 24 hours).
print("Extremely high delays (>1440 minutes - 24 hours):", df['arr_delay'].gt(1440).sum())
print("Extremely low delays (<-1440 minutes):", df['arr_delay'].lt(-1440).sum())

# Span of the 'year' and 'month' columns.
print(f"Year range: {df['year'].min()} to {df['year'].max()}")
print(f"Month range: {df['month'].min()} to {df['month'].max()}")


: 

In [None]:
# Cell 8: Handle missing values and basic cleaning
print("=== DATA CLEANING ===")

# Delay-related columns (a missing value is assumed to mean no delay).
delay_columns = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
# Count columns, filled the same way.
count_columns = ['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']

# Single pass over both groups: replace nulls with 0 in each column that is
# actually present in the frame; absent columns are skipped.
for col in delay_columns + count_columns:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Handle missing arr_delay values: report how many there are, then zero-fill.
if df['arr_delay'].isnull().any():
    print(f"Filling {df['arr_delay'].isnull().sum()} missing arr_delay values with 0")
    df['arr_delay'] = df['arr_delay'].fillna(0)

print("Data cleaning completed!")


: 

In [None]:
# Cell 9: Save cleaned data
import os

# Make sure the output directory is there; exist_ok=True means an already
# existing directory is not treated as an error.
os.makedirs('../data/processed', exist_ok=True)

# Write the cleaned frame without its index, then report the destination and
# the final (rows, columns) shape.
df.to_csv('../data/processed/cleaned_flight_data.csv', index=False)
print(
    "Cleaned data saved to '../data/processed/cleaned_flight_data.csv'",
    f"Final dataset shape: {df.shape}",
    sep="\n",
)


: 