In [15]:
# Import necessary libraries
import pandas as pd
import re

# Location of the raw flight-price CSV
flight_price_path = r"C:\Users\ashwi\GUVI_Projects\Flight Project\Flight_Price.csv"

# Read the raw data into a DataFrame
flight_price_df = pd.read_csv(flight_price_path)

# Missing 'Route' / 'Total_Stops' entries are replaced by the placeholder "Unknown"
for column_name in ('Route', 'Total_Stops'):
    flight_price_df[column_name] = flight_price_df[column_name].fillna("Unknown")

# Parse 'Date_of_Journey' (day/month/year text) once, then derive day, month and year columns from it
journey_date = pd.to_datetime(flight_price_df['Date_of_Journey'], format='%d/%m/%Y')
flight_price_df['Date_of_Journey'] = journey_date
flight_price_df['Day_of_Journey'] = journey_date.dt.day
flight_price_df['Month_of_Journey'] = journey_date.dt.month
flight_price_df['Year_of_Journey'] = journey_date.dt.year
# Simplify 'Route' to show only start and end destinations and replace '?' with 'to'
def process_route(route):
    """Split a route string into an end-to-end label and its intermediate stops.

    The stops of a route are separated by '?' (presumably a mis-decoded
    arrow in the raw file -- confirm against the source data) or by the
    arrow character itself. Whitespace around the separator is optional, so
    "BLR ? DEL", "BLR?DEL" and "BLR ?DEL" are all treated the same way.

    Args:
        route: Route text, e.g. "CCU ? IXR ? BBI ? BLR". A value without any
            separator (such as the placeholder "Unknown") is returned as is.

    Returns:
        A tuple ``(end_to_end, intermediate_stops_text)`` where
        ``end_to_end`` is "<first> to <last>" (or the single stop itself when
        there is no separator) and ``intermediate_stops_text`` is the
        comma-separated list of stops in between, or "" when there are none.
    """
    # '\u2192' is the rightwards arrow; the re module resolves the escape itself.
    # Empty pieces (leading/trailing separator, blank input) are discarded.
    stops = [part for part in re.split(r'\s*[?\u2192]\s*', route.strip()) if part]

    if not stops:
        return "", ""
    if len(stops) == 1:
        # No separator present: nothing to join and no intermediate stops.
        return stops[0], ""

    end_to_end = f"{stops[0]} to {stops[-1]}"
    intermediate_stops_text = ", ".join(stops[1:-1])
    return end_to_end, intermediate_stops_text

# Run process_route on every route, then unzip the resulting pairs into two columns
route_pairs = flight_price_df['Route'].apply(process_route)
end_to_end_labels, intermediate_labels = zip(*route_pairs)
flight_price_df['Route'] = end_to_end_labels
flight_price_df['Intermediate_Stops'] = intermediate_labels

# Rows whose intermediate-stops text is empty get the marker "NIL" instead
intermediate_column = flight_price_df['Intermediate_Stops']
flight_price_df['Intermediate_Stops'] = intermediate_column.replace("", "NIL")

# Create the 'Duration_Days' column up front with a default of 0
flight_price_df['Duration_Days'] = 0

def parse_duration(duration):
    """Convert a duration string such as "2h 50m" into whole days and leftover minutes.

    Args:
        duration: Text with an optional hours part ending in 'h' and an
            optional minutes part ending in 'm' (e.g. "19h", "5m", "25h 30m").

    Returns:
        A tuple ``(days, remaining_minutes)``: the number of full 24-hour days
        in the duration and the minutes left after removing those days.

    Raises:
        ValueError: If the hours or minutes part is not a valid integer.
    """
    # Text before the first 'h' is the hour count (only when an 'h' is present)
    hour_text, h_marker, _ = duration.partition('h')
    hours = int(hour_text) if h_marker else 0

    # Text after the last 'h' (or the whole string when there is none) holds the minutes
    minutes = 0
    if 'm' in duration:
        minute_text = duration.rpartition('h')[2]
        minutes = int(minute_text.replace('m', '').strip())

    # 1440 minutes make one day
    return divmod(hours * 60 + minutes, 1440)

# Split every 'Duration' into (days, leftover minutes) and store the two parts as separate columns
parsed_durations = [parse_duration(text) for text in flight_price_df['Duration']]
flight_price_df[['Duration_Days', 'Duration_Minutes']] = pd.DataFrame(
    parsed_durations, index=flight_price_df.index
)

# Translate the 'Total_Stops' labels to numbers; the "Unknown" placeholder becomes -1
stop_labels = ['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']
stops_mapping = {label: stop_count for stop_count, label in enumerate(stop_labels)}
stops_mapping['Unknown'] = -1
flight_price_df['Total_Stops'] = flight_price_df['Total_Stops'].map(stops_mapping)

# Keep only the time part in 'Arrival_Time' in HH:MM format
def extract_time(arrival_time):
    """Return the first HH:MM time found in *arrival_time*, or None.

    Args:
        arrival_time: Arrival text that may carry extra content after the
            time (e.g. "01:10 22 Mar"). Non-string values are accepted and
            yield None.

    Returns:
        The first two-digit-hour, two-digit-minute substring ("HH:MM"), or
        None when the value is not a string or contains no such substring
        (which includes empty and whitespace-only strings).
    """
    # Anything that is not text has no time to extract.
    if not isinstance(arrival_time, str):
        return None

    match = re.search(r'(\d{2}:\d{2})', arrival_time)
    return match.group(1) if match else None

# Reduce 'Arrival_Time' to its HH:MM part; values without a time become None
flight_price_df['Arrival_Time'] = flight_price_df['Arrival_Time'].apply(extract_time)

# 'Additional_Info' is kept as text, so it needs no transformation

# Columns that make up the cleaned dataset, in output order
output_columns = [
    'Airline', 'Day_of_Journey', 'Month_of_Journey', 'Year_of_Journey', 'Source', 'Destination', 'Route',
    'Intermediate_Stops', 'Dep_Time', 'Arrival_Time', 'Duration_Days', 'Duration_Minutes', 'Total_Stops',
    'Additional_Info', 'Price',
]
flight_price_cleaned = flight_price_df[output_columns]

# Save cleaned data (without the DataFrame index)
cleaned_path = r"C:\Users\ashwi\GUVI_Projects\Flight Project\Flight_Price_Cleaned.csv"
flight_price_cleaned.to_csv(cleaned_path, index=False)

# Print confirmation, including where the file was written
print(f"Flight Price dataset cleaned and saved as {cleaned_path}")


Flight Price dataset cleaned and saved as C:\Users\ashwi\GUVI_Projects\Flight Project\Flight_Price_Cleaned.csv
