In [None]:
"/Users/alexo/Desktop/BigDataAnalytics/project/raw_dataset/2008_airline.csv"

# create_january_subset.py
# Run this ONCE on your Mac (outside Docker) to create a manageable dataset

import pandas as pd
import os
from pathlib import Path

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
DATA_DIR = Path("/Users/alexo/Desktop/BigDataAnalytics/project/raw_dataset/")
INPUT_FILE = DATA_DIR / "2008_airline.csv"  # Your original full file
OUTPUT_FILE = "/Users/alexo/Desktop/BigDataAnalytics/BigDataProject/flight-delay-forecasting/data/jan_2008.csv"  # Final result

# Create data folder if missing
DATA_DIR.mkdir(exist_ok=True)

print("Reading full 2008 dataset with pandas (this takes ~10-15 seconds)...")
print(f"Looking for: {INPUT_FILE.resolve()}")

if not INPUT_FILE.exists():
    raise FileNotFoundError(
        f"Cannot find {INPUT_FILE}. Make sure 2008.csv is in the data/ folder!"
    )

# Read in chunks to be memory-friendly (even though pandas can handle 7M easily on modern Macs)
chunksize = 500_000
total_rows = 0
jan_rows = 0

# We'll write directly to the output file with header only once
header_written = False

print("Filtering January 2008 (Month == 1)...")
for i, chunk in enumerate(
    pd.read_csv(INPUT_FILE, chunksize=chunksize, dtype=str)
):  # dtype=str avoids type guessing issues
    total_rows += len(chunk)

    # Filter January
    jan_chunk = chunk[chunk["Month"] == "1"]
    jan_rows += len(jan_chunk)

    # Append to output file
    jan_chunk.to_csv(OUTPUT_FILE, mode="a", header=not header_written, index=False)
    header_written = True

    if (i + 1) % 5 == 0:
        print(f"   Processed {total_rows:,} rows so far...")

print("\n" + "=" * 60)
print("SUCCESS! January 2008 subset created")
print("=" * 60)
print(f"Input file : {INPUT_FILE.name} → {total_rows:,} total rows")
print(f"Output file: {OUTPUT_FILE.name} → {jan_rows:,} rows (Month == 1)")
print(f"Size       : ~180–220 MB (perfect for Spark workers!)")
print("=" * 60)
print(f"File saved to: {OUTPUT_FILE.resolve()}")

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
DATA_PATH = Path("./jan_2008.csv")  # <-- Change only if you moved the file

if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

print("Loading January 2008 dataset...")
df = pd.read_csv(DATA_PATH, low_memory=False)
print(f"Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns\n")

# ------------------------------------------------------------------
# 1. Basic Info
# ------------------------------------------------------------------
print("=" * 60)
print("1. BASIC INFORMATION")
print("=" * 60)
print(df.info())
print("\nFirst 5 rows:")
print(df.head())

# ------------------------------------------------------------------
# 2. Missing Values
# ------------------------------------------------------------------
print("\n" + "=" * 60)
print("2. MISSING VALUES (sorted)")
print("=" * 60)
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame(
    {"Missing Count": missing, "Missing %": missing_pct.round(2)}
).sort_values(by="Missing %", ascending=False)
print(missing_df[missing_df["Missing Count"] > 0])

# ------------------------------------------------------------------
# 3. Target Variable: ArrDelay
# ------------------------------------------------------------------
print("\n" + "=" * 60)
print("3. TARGET VARIABLE: ArrDelay (Arrival Delay in minutes)")
print("=" * 60)
print(df["ArrDelay"].describe())
print(
    f"Flights on time or early: {(df['ArrDelay'] <= 0).sum():,} ({(df['ArrDelay'] <= 0).mean()*100:.1f}%)"
)
print(
    f"Delayed flights (>15 min):  {(df['ArrDelay'] > 15).sum():,} ({(df['ArrDelay'] > 15).mean()*100:.1f}%)"
)

# ------------------------------------------------------------------
# 4. Key Categorical Features
# ------------------------------------------------------------------
print("\n" + "=" * 60)
print("4. CATEGORICAL FEATURES SUMMARY")
print("=" * 60)
cat_cols = ["UniqueCarrier", "Origin", "Dest", "DayOfWeek", "Month"]
for col in cat_cols:
    print(
        f"{col:15}: {df[col].nunique():4} unique values | Top: {df[col].value_counts().index[0]} ({df[col].value_counts().iloc[0]:,})"
    )

# ------------------------------------------------------------------
# 5. Visualizations (saved to eda_plots/ folder)
# ------------------------------------------------------------------
PLOT_DIR = Path("eda_plots")
PLOT_DIR.mkdir(exist_ok=True)
sns.set(style="whitegrid", font_scale=1.1)
plt.rcParams["figure.figsize"] = (12, 6)

# Plot 1: Arrival Delay Distribution
plt.figure()
sns.histplot(data=df, x="ArrDelay", bins=100, kde=True, color="skyblue")
plt.xlim(-100, 200)
plt.title("Distribution of Arrival Delay (minutes) - January 2008")
plt.xlabel("Arrival Delay (negative = early)")
plt.ylabel("Number of Flights")
plt.axvline(0, color="green", linestyle="--", label="On Time")
plt.axvline(15, color="red", linestyle="--", label="Delayed >15 min")
plt.legend()
plt.tight_layout()
plt.savefig(PLOT_DIR / "01_arrdelay_distribution.png", dpi=150)
plt.close()

# Plot 2: Delay by Day of Week
plt.figure()
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_map = {
    1: "Monday",
    2: "Tuesday",
    3: "Wednesday",
    4: "Thursday",
    5: "Friday",
    6: "Saturday",
    7: "Sunday",
}
df_day = df.copy()
df_day["DayName"] = df_day["DayOfWeek"].map(day_map)
sns.boxplot(data=df_day, x="DayName", y="ArrDelay", order=order, palette="Set2")
plt.title("Arrival Delay by Day of Week")
plt.xticks(rotation=45)
plt.ylabel("Arrival Delay (min)")
plt.ylim(-50, 100)
plt.tight_layout()
plt.savefig(PLOT_DIR / "02_delay_by_dayofweek.png", dpi=150)
plt.close()

# Plot 3: Top 10 Most Delayed Carriers
plt.figure()
top_carriers = (
    df.groupby("UniqueCarrier")["ArrDelay"].mean().sort_values(ascending=False).head(10)
)
sns.barplot(x=top_carriers.values, y=top_carriers.index, palette="Reds_r")
plt.title("Average Arrival Delay by Carrier (Top 10 Worst)")
plt.xlabel("Average Delay (min)")
plt.tight_layout()
plt.savefig(PLOT_DIR / "03_worst_carriers.png", dpi=150)
plt.close()

# Plot 4: Correlation Heatmap (numeric features only)
print("\n" + "=" * 60)
print("5. CORRELATION MATRIX (numeric features)")
print("=" * 60)
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df[numeric_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(corr, annot=True, cmap="coolwarm", center=0, fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix of Numeric Features")
plt.tight_layout()
plt.savefig(PLOT_DIR / "04_correlation_heatmap.png", dpi=150)
plt.close()

print(f"\nAll plots saved to: {PLOT_DIR.resolve()}")
print("\nEDA COMPLETED SUCCESSFULLY!")
print("You can now use these insights for:")
print("   • Feature selection")
print("   • Handling missing values (especially CancellationCode, CarrierDelay, etc.)")
print(
    "   • Deciding which columns to drop (e.g., TailNum, ActualElapsedTime if redundant)"
)
print("   • Justifying your preprocessing steps in the report")

Loading January 2008 dataset...
Dataset loaded: 605,765 rows × 29 columns

1. BASIC INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605765 entries, 0 to 605764
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ActualElapsedTime  587130 non-null  float64
 1   AirTime            587130 non-null  float64
 2   ArrDelay           587130 non-null  float64
 3   ArrTime            587130 non-null  float64
 4   CRSArrTime         605765 non-null  int64  
 5   CRSDepTime         605765 non-null  int64  
 6   CRSElapsedTime     605659 non-null  float64
 7   CancellationCode   17308 non-null   object 
 8   Cancelled          605765 non-null  int64  
 9   CarrierDelay       148807 non-null  float64
 10  DayOfWeek          605765 non-null  int64  
 11  DayofMonth         605765 non-null  int64  
 12  DepDelay           588457 non-null  float64
 13  DepTime            588457 non-null  float64
 14  Dest


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df_day, x="DayName", y="ArrDelay", order=order, palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_carriers.values, y=top_carriers.index, palette="Reds_r")



5. CORRELATION MATRIX (numeric features)

All plots saved to: /Users/alexo/Desktop/BigDataAnalytics/BigDataProject/flight-delay-forecasting/data/eda_plots

EDA COMPLETED SUCCESSFULLY!
You can now use these insights for:
   • Feature selection
   • Handling missing values (especially CancellationCode, CarrierDelay, etc.)
   • Deciding which columns to drop (e.g., TailNum, ActualElapsedTime if redundant)
   • Justifying your preprocessing steps in the report
