In [1]:
import polars as pl

class FlightDataProcessor:
    def __init__(self, file_path):
        self.df = pl.read_csv(file_path)
    
    def convert_hhmm_to_decimal(self, columns):
        self.df = self.df.with_columns([
            ((pl.col(col) // 100) + (pl.col(col) % 100) / 60).round(2).alias(col)
            for col in columns
        ])
    
    def convert_minutes_to_hours(self, columns):
        self.df = self.df.with_columns([
            (pl.col(col) / 60).round(2).alias(col)
            for col in columns
        ])
    
    def drop_unwanted_columns(self, columns):
        self.df = self.df.drop(columns)
    
    def encode_categorical_columns(self, columns):
        for col in columns:
            unique_values = self.df[col].unique().to_list()
            encoding_map = {value: idx for idx, value in enumerate(unique_values)}
            self.df = self.df.with_columns(pl.col(col).replace(encoding_map).cast(pl.Int64).alias(col))
    
    def handle_missing_values(self):
        self.df = self.df.drop_nulls()
    
    def remove_duplicates(self):
        self.df = self.df.unique()
    
    def process_data(self):
        hhmm_columns = ["SCHEDULED_DEPARTURE", "DEPARTURE_TIME", "WHEELS_OFF", 
                        "WHEELS_ON", "SCHEDULED_ARRIVAL", "ARRIVAL_TIME"]
        minute_columns = ["DEPARTURE_DELAY", "TAXI_OUT", "SCHEDULED_TIME", "ELAPSED_TIME",
                          "AIR_TIME", "TAXI_IN", "ARRIVAL_DELAY", "AIR_SYSTEM_DELAY",
                          "SECURITY_DELAY", "AIRLINE_DELAY", "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"]
        categorical_columns = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"]
        columns_to_drop = ["CANCELLATION_REASON", "TAIL_NUMBER"]
        
        self.convert_hhmm_to_decimal(hhmm_columns)
        self.convert_minutes_to_hours(minute_columns)
        self.drop_unwanted_columns(columns_to_drop)
        self.encode_categorical_columns(categorical_columns)
        self.handle_missing_values()
        self.remove_duplicates()
    
    def save_to_csv(self, output_path):
        self.df.write_csv(output_path)
    
    def display(self):
        print(self.df)

# Usage example
processor = FlightDataProcessor("flights.csv")
processor.process_data()
processor.display()
processor.save_to_csv("flights_cleaned.csv")



shape: (1_063_439, 29)
┌──────┬───────┬─────┬─────────────┬───┬────────────────┬───────────────┬─────────────────────┬───────────────┐
│ YEAR ┆ MONTH ┆ DAY ┆ DAY_OF_WEEK ┆ … ┆ SECURITY_DELAY ┆ AIRLINE_DELAY ┆ LATE_AIRCRAFT_DELAY ┆ WEATHER_DELAY │
│ ---  ┆ ---   ┆ --- ┆ ---         ┆   ┆ ---            ┆ ---           ┆ ---                 ┆ ---           │
│ i64  ┆ i64   ┆ i64 ┆ i64         ┆   ┆ f64            ┆ f64           ┆ f64                 ┆ f64           │
╞══════╪═══════╪═════╪═════════════╪═══╪════════════════╪═══════════════╪═════════════════════╪═══════════════╡
│ 2015 ┆ 10    ┆ 2   ┆ 5           ┆ … ┆ 0.0            ┆ 0.0           ┆ 0.87                ┆ 0.0           │
│ 2015 ┆ 1     ┆ 20  ┆ 2           ┆ … ┆ 0.0            ┆ 0.0           ┆ 0.0                 ┆ 0.0           │
│ 2015 ┆ 4     ┆ 21  ┆ 2           ┆ … ┆ 0.0            ┆ 0.57          ┆ 0.0                 ┆ 0.0           │
│ 2015 ┆ 7     ┆ 24  ┆ 5           ┆ … ┆ 0.0            ┆ 0.0           ┆ 1.88   