## Importing Labraries

In [1]:
import pandas as pd
import numpy as np

# Load and Explore Dataset

In [2]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,Airline Name,Flight Number,Number of Stops,Flight Duration,Ticket Class,Refundable Ticket,Baggage Allowance,Season,Weather Condition,Flight Price
0,GoAir,AI1190,0,2h 5m,Business,Yes,30kg,Spring,Stormy,12258
1,GoAir,AI1132,2,4h 30m,Economy,Yes,15kg,Summer,Rainy,8564
2,AirAsia India,AI1457,1,2h 49m,Economy,Yes,25kg,Spring,Clear,6730
3,AirAsia India,AI1450,0,2h 18m,Economy,No,30kg,Autumn,Foggy,4528
4,Vistara,AI1174,2,4h 19m,Economy,Yes,25kg,Spring,Foggy,4708
...,...,...,...,...,...,...,...,...,...,...
1995,AirAsia India,AI1275,1,3h 18m,Economy,No,25kg,Autumn,Stormy,5336
1996,Vistara,AI1498,0,2h 8m,Economy,No,20kg,Winter,Rainy,7191
1997,AirAsia India,AI1133,1,3h 11m,Economy,Yes,20kg,Spring,Foggy,4889
1998,Akasa Air,AI1015,2,4h 34m,Economy,No,30kg,Summer,Clear,4137


In [3]:
print("Shape of dataset:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing values:\n", df.isnull().sum())
df.describe()

Shape of dataset: (2000, 10)
Columns: ['Airline Name', 'Flight Number', 'Number of Stops', 'Flight Duration', 'Ticket Class', 'Refundable Ticket', 'Baggage Allowance', 'Season', 'Weather Condition', 'Flight Price']
Missing values:
 Airline Name         0
Flight Number        0
Number of Stops      0
Flight Duration      0
Ticket Class         0
Refundable Ticket    0
Baggage Allowance    0
Season               0
Weather Condition    0
Flight Price         0
dtype: int64


Unnamed: 0,Number of Stops,Flight Price
count,2000.0,2000.0
mean,0.9815,8524.861
std,0.824923,3501.550717
min,0.0,2751.0
25%,0.0,5886.0
50%,1.0,7761.5
75%,2.0,10708.75
max,2.0,18915.0


# Data Preprocessing

In [4]:
df = df.drop(columns=['Flight Number', 'Refundable Ticket', 'Number of Stops', 'Flight Duration'])

In [5]:
df

Unnamed: 0,Airline Name,Ticket Class,Baggage Allowance,Season,Weather Condition,Flight Price
0,GoAir,Business,30kg,Spring,Stormy,12258
1,GoAir,Economy,15kg,Summer,Rainy,8564
2,AirAsia India,Economy,25kg,Spring,Clear,6730
3,AirAsia India,Economy,30kg,Autumn,Foggy,4528
4,Vistara,Economy,25kg,Spring,Foggy,4708
...,...,...,...,...,...,...
1995,AirAsia India,Economy,25kg,Autumn,Stormy,5336
1996,Vistara,Economy,20kg,Winter,Rainy,7191
1997,AirAsia India,Economy,20kg,Spring,Foggy,4889
1998,Akasa Air,Economy,30kg,Summer,Clear,4137


In [6]:
df = df.dropna()

categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

from sklearn.preprocessing import StandardScaler

num_cols = df_encoded.select_dtypes(include=[np.number]).columns.drop('Flight Price', errors='ignore')

scaler = StandardScaler()
if not num_cols.empty:
    df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

df_encoded.head()

Unnamed: 0,Flight Price,Airline Name_AirAsia India,Airline Name_Akasa Air,Airline Name_GoAir,Airline Name_IndiGo,Airline Name_SpiceJet,Airline Name_Vistara,Ticket Class_Economy,Baggage Allowance_20kg,Baggage Allowance_25kg,Baggage Allowance_30kg,Season_Spring,Season_Summer,Season_Winter,Weather Condition_Foggy,Weather Condition_Rainy,Weather Condition_Snowy,Weather Condition_Stormy,Weather Condition_Windy
0,12258,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False
1,8564,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False
2,6730,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False
3,4528,True,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False
4,4708,False,False,False,False,False,True,True,False,True,False,True,False,False,True,False,False,False,False


# Feature Engineering

In [7]:
if 'Date_of_Journey' in df_encoded.columns:
    df_encoded['Journey_Day'] = pd.to_datetime(df['Date_of_Journey']).dt.day
    df_encoded['Journey_Month'] = pd.to_datetime(df['Date_of_Journey']).dt.month
    df_encoded = df_encoded.drop(columns=['Date_of_Journey'])
df_encoded.head()

Unnamed: 0,Flight Price,Airline Name_AirAsia India,Airline Name_Akasa Air,Airline Name_GoAir,Airline Name_IndiGo,Airline Name_SpiceJet,Airline Name_Vistara,Ticket Class_Economy,Baggage Allowance_20kg,Baggage Allowance_25kg,Baggage Allowance_30kg,Season_Spring,Season_Summer,Season_Winter,Weather Condition_Foggy,Weather Condition_Rainy,Weather Condition_Snowy,Weather Condition_Stormy,Weather Condition_Windy
0,12258,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False
1,8564,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False
2,6730,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False
3,4528,True,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False
4,4708,False,False,False,False,False,True,True,False,True,False,True,False,False,True,False,False,False,False


# Split Data into Train and Test Sets

In [8]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop('Flight Price', axis=1)
y = df_encoded['Flight Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Multiple Regression Models

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

Linear Regression trained.
Random Forest trained.


# Evaluate Model Performance

In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

Linear Regression - RMSE: 2570.05, MAE: 2079.22, R²: 0.4493
Random Forest - RMSE: 3006.63, MAE: 2411.66, R²: 0.2463


# Save the Best Model

In [11]:
import joblib

best_model = models["Linear Regression"]
joblib.dump(best_model, 'best_price_model.joblib')
print("Best model saved as 'best_price_model.joblib'")

Best model saved as 'best_price_model.joblib'


# Prediction

In [12]:
predictions = best_model.predict(X_test)
print(predictions)

[10604.59203053 10903.09093321  6788.28550158  5542.97320078
 10321.47812995 11727.94911646  6466.00993888 11800.19578024
  6903.56270334 10225.7807144   5643.95661808 11495.12853113
  6346.67616233  5748.1967868  10909.65927964 10846.4427494
  5620.41115633 11111.15463338  5310.37827955 10969.86967008
 11563.94238351 11762.72488757  6356.58200414  6389.54824983
 10671.83955006  6841.35604924  6610.63735619 11393.39623933
  6776.34537537 11765.92835036  6244.27719887  6646.38052994
  6648.79093125 10544.62222208 10448.69545793 11526.61365621
  5780.33871822 11242.45988228 11225.9800079  10745.95853645
 10519.70179961  5559.32012454  5676.65484577  5975.74356145
  5375.63294261 10455.03682779 11459.02994183 11366.74202477
  5828.67562117  5746.59081423  5660.77047054  5843.55524846
 10328.13724996 11229.36996435  6389.38413948  6599.40883933
 10719.72462577  6779.26411791  6206.8937006   5965.83771965
 11425.77279379  5531.72382089  6945.82466117  5980.63070846
  5755.97713015  6906.481

In [13]:
from sklearn.metrics import accuracy_score
svc_accuracy=accuracy_score(y_test,svc_predictions)

NameError: name 'svc_predictions' is not defined