In [19]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [20]:
# Load the cleaned flight data, then preview the top rows as the cell output
df = pd.read_csv(filepath_or_buffer='flight-price-prediction/clean_data.csv')
df.head(n=5)

Unnamed: 0,Airline,Source,Departure Time,Stops,Arrival Time,Destination,Class,Duration,Days Left,Price
0,SpiceJet,Delhi,Evening,0,Night,Mumbai,Economy,130,1,5953
1,SpiceJet,Delhi,Early_Morning,0,Morning,Mumbai,Economy,140,1,5953
2,AirAsia,Delhi,Early_Morning,0,Early_Morning,Mumbai,Economy,130,1,5956
3,Vistara,Delhi,Morning,0,Afternoon,Mumbai,Economy,135,1,5955
4,Vistara,Delhi,Morning,0,Morning,Mumbai,Economy,140,1,5955


In [21]:
# Split the frame into the regression target and the feature columns.
target_col = 'Price'
y = df[target_col]

# 'Unnamed: 0' is the row index that was written into the CSV, not a flight
# attribute. Drop it explicitly so it can never reach the model; errors='ignore'
# keeps this cell working if the CSV is later re-exported without that column.
X = (
    df
    .drop(columns=[target_col])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [22]:
# Feature groups consumed by the preprocessing step:
# text-valued columns on one side, numeric columns on the other.
categorical_cols = [
    'Airline',
    'Source',
    'Departure Time',
    'Arrival Time',
    'Destination',
    'Class',
]
numerical_cols = [
    'Stops',
    'Duration',
    'Days Left',
]

In [23]:
# Preprocessing stage: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept categories
# at predict time that it did not see during fit.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numerical_cols),
        ('cat', category_encoder, categorical_cols),
    ]
)

In [24]:
# Random forest regressor, deliberately constrained in size.
forest_regressor = RandomForestRegressor(
    n_estimators=30,       # small ensemble
    max_depth=15,          # cap how deep each tree may grow
    min_samples_split=10,  # a node needs at least 10 samples before it may split
    random_state=42,       # fixed seed for repeatable fits
    n_jobs=-1,             # build trees in parallel on all available cores
)

# Full model: preprocessing followed by the regressor, fitted and saved as one object.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest_regressor),
    ]
)

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessing and regressor together, on the training rows only.
print("Training the pipeline (this might take a minute)...")
model_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows so the test split is actually used and the
# model's quality is known before it gets saved. For a regressor, score()
# reports the coefficient of determination (R^2).
test_r2 = model_pipeline.score(X_test, y_test)
print(f"Hold-out R^2: {test_r2:.4f}")

Training the pipeline (this might take a minute)...


In [26]:
# Persist the fitted pipeline (preprocessing + regressor) as a single file.
# The path is defined once so the dump call and the message cannot drift apart.
model_path = 'models/flight_pipeline.pkl'

# Create the target directory if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_pipeline, model_path)
print(f"Pipeline saved successfully to {model_path}!")

Pipeline saved successfully to models/flight_pipeline.pkl!
