In [257]:
import pandas as pd
import numpy as np

In [259]:
# Read the raw flight listings CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='goibibo_flights_data.csv')

In [260]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,flight date,airline,flight_num,class,from,dep_time,to,arr_time,duration,price,stops,Unnamed: 11,Unnamed: 12
0,26-06-2023,SpiceJet,SG-8709,economy,Delhi,18:55,Mumbai,21:05,02h 10m,6013,non-stop,,
1,26-06-2023,SpiceJet,SG-8157,economy,Delhi,06:20,Mumbai,08:40,02h 20m,6013,non-stop,,
2,26-06-2023,AirAsia,I5-764,economy,Delhi,04:25,Mumbai,06:35,02h 10m,6016,non-stop,,
3,26-06-2023,Vistara,UK-995,economy,Delhi,10:20,Mumbai,12:35,02h 15m,6015,non-stop,,
4,26-06-2023,Vistara,UK-963,economy,Delhi,08:50,Mumbai,11:10,02h 20m,6015,non-stop,,


In [263]:
# Remove the 'Unnamed: 11' / 'Unnamed: 12' columns in place; a column that
# does not exist is skipped instead of raising.
df.drop(["Unnamed: 11", "Unnamed: 12"], axis="columns", inplace=True, errors='ignore')

In [265]:
# Parse the day-month-year strings in 'flight date' into datetime values
parsed_dates = pd.to_datetime(df["flight date"], format="%d-%m-%Y")
df["flight date"] = parsed_dates

In [267]:
# Derive calendar features from the parsed flight date.
# Columns are created in the order day, month, year, day_of_week.
date_parts = df['flight date'].dt
for part in ('day', 'month', 'year'):
    df[part] = getattr(date_parts, part)
df["day_of_week"] = date_parts.day_name()

In [269]:
# Convert price to integers when it was read as text (e.g. "6,013").
# A numeric-dtype test is used instead of comparing against "object" so the
# conversion is not skipped when pandas stores text in a dedicated string dtype.
if not pd.api.types.is_numeric_dtype(df["price"]):
    df["price"] = (
        df["price"]
        .str.replace(",", "", regex=False)  # literal comma, never a regex pattern
        .astype(int)
    )

In [271]:
# Convert 'duration' to total minutes

def convert_duration(duration):
    """Convert a duration string such as '02h 10m' into whole minutes.

    Accepted forms (case-insensitive, whitespace ignored) are '<H>h',
    '<M>m' and '<H>h<M>m'. Fractional numbers are truncated toward zero
    before being combined, so '1.03h' yields 60.

    Parameters
    ----------
    duration : object
        Raw cell value; only ``str`` values are parsed.

    Returns
    -------
    int or float
        Total minutes as an ``int``, or ``np.nan`` when the value is missing,
        is not a string, carries no 'h'/'m' unit, has text after the hour
        part that is not a minute count, or holds an unparseable number.
    """
    if pd.isna(duration) or not isinstance(duration, str):
        return np.nan  # Handle missing or incorrect values

    # Drop every whitespace character so '02h 10m' and '02h10m' parse alike.
    text = ''.join(duration.lower().split())

    hours_text, has_hours, minutes_text = text.partition('h')
    if not has_hours:
        # No hour marker: the whole string has to be the minutes part.
        hours_text, minutes_text = '', text

    has_minutes = minutes_text.endswith('m')
    if has_minutes:
        minutes_text = minutes_text[:-1]
    elif minutes_text:
        # Leftover text without an 'm' unit (e.g. '2h 10' or '120'): reject it
        # rather than silently dropping it or reporting 0 minutes.
        return np.nan

    if not (has_hours or has_minutes):
        return np.nan  # Empty string: nothing to parse

    try:
        # float() first so values like '1.03' are accepted, then truncate.
        hours = int(float(hours_text)) if has_hours else 0
        minutes = int(float(minutes_text)) if has_minutes else 0
    except (ValueError, OverflowError):
        # ValueError: empty / non-numeric / NaN text; OverflowError: 'inf'.
        return np.nan

    return hours * 60 + minutes

# Parse only while the column still holds raw text: once it is numeric,
# convert_duration would map every value to NaN and the dropna below would
# discard all rows if this cell were executed a second time.
if not pd.api.types.is_numeric_dtype(df['duration']):
    df['duration'] = df['duration'].apply(convert_duration)
df.dropna(subset=['duration'], inplace=True)  # Remove rows with invalid durations
df['duration'] = df['duration'].astype(int)

In [273]:
# Function to convert stops to numerical values
def convert_stops(value):
    """Map a stops label to a stop count.

    Parameters
    ----------
    value : object
        Raw cell value, e.g. 'non-stop', '1-stop' or '2+-stop'.

    Returns
    -------
    int
        * 1 when the value is missing (NaN/None),
        * the value itself when it is already a whole number,
        * 0 when the label contains 'non-stop' (case-insensitive),
        * the leading number when the text before the first '-' is made of
          decimal digits,
        * 2 for any other label (e.g. '2+-stop').
    """
    if pd.isna(value):
        return 1  # Missing labels are counted as one stop

    # Values that are already whole numbers (e.g. the cell was re-run on a
    # converted column) are returned as ints instead of raising TypeError on
    # the substring test below.
    if isinstance(value, (int, float, np.integer, np.floating)) and float(value).is_integer():
        return int(value)

    # Normalise case and surrounding whitespace so 'Non-Stop' and ' 1-stop'
    # are not misread as the 2-stop fallback.
    label = str(value).strip().lower()
    if "non-stop" in label:
        return 0

    leading = label.split('-')[0].strip()
    # isdecimal() only accepts characters that int() can parse.
    return int(leading) if leading.isdecimal() else 2

# Replace each stops label with its numeric stop count
df["stops"] = df["stops"].map(convert_stops)

In [275]:
# Preview the first three rows after cleaning
df.head(n=3)

Unnamed: 0,flight date,airline,flight_num,class,from,dep_time,to,arr_time,duration,price,stops,day,month,year,day_of_week
0,2023-06-26,SpiceJet,SG-8709,economy,Delhi,18:55,Mumbai,21:05,130,6013,0,26,6,2023,Monday
1,2023-06-26,SpiceJet,SG-8157,economy,Delhi,06:20,Mumbai,08:40,140,6013,0,26,6,2023,Monday
2,2023-06-26,AirAsia,I5-764,economy,Delhi,04:25,Mumbai,06:35,130,6016,0,26,6,2023,Monday


In [277]:
# Rename columns for clarity

# Old column name -> presentation name, listed as (old, new) pairs.
new_column_names = dict([
    ("flight date", "Flight Date"),
    ("airline", "Airline"),
    ("flight_num", "Flight Number"),
    ("class", "Class"),
    ("to", "Destination"),
    ("from", "Origin"),
    ("dep_time", "Departure Time"),
    ("arr_time", "Arrival Time"),
    ("price", "Price (₹)"),
    ("stops", "Number of Stops"),
    ("day", "Date"),
    ("month", "Month"),
    ("year", "Year"),
    ("duration", "Duration (Minutes)"),
    ("day_of_week", "Day"),
])

# Relabel the column axis in place using the mapping above.
df.rename(mapper=new_column_names, axis="columns", inplace=True)
print("Columns renamed successfully!")

Columns renamed successfully!


In [279]:
# Preview the first three rows with the renamed columns
df.head(n=3)

Unnamed: 0,Flight Date,Airline,Flight Number,Class,Origin,Departure Time,Destination,Arrival Time,Duration (Minutes),Price (₹),Number of Stops,Date,Month,Year,Day
0,2023-06-26,SpiceJet,SG-8709,economy,Delhi,18:55,Mumbai,21:05,130,6013,0,26,6,2023,Monday
1,2023-06-26,SpiceJet,SG-8157,economy,Delhi,06:20,Mumbai,08:40,140,6013,0,26,6,2023,Monday
2,2023-06-26,AirAsia,I5-764,economy,Delhi,04:25,Mumbai,06:35,130,6016,0,26,6,2023,Monday


In [281]:
# Write the cleaned frame to disk without the row index
df.to_csv(path_or_buf="processed_flight_prices.csv", index=False)
print("CSV file saved successfully!")

CSV file saved successfully!


In [283]:
# Show the dtype of each column after cleaning and renaming
df.dtypes

Flight Date           datetime64[ns]
Airline                       object
Flight Number                 object
Class                         object
Origin                        object
Departure Time                object
Destination                   object
Arrival Time                  object
Duration (Minutes)             int32
Price (₹)                      int32
Number of Stops                int64
Date                           int32
Month                          int32
Year                           int32
Day                           object
dtype: object