In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from category_encoders import BinaryEncoder

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score ,accuracy_score




In [2]:
def wrangle(link):
    df = pd.read_excel(link)
    df.dropna(inplace = True)    
    
    # get arrived time by hour
    df["Arrival_Time_hour"] = pd.to_datetime(df["Arrival_Time"].str.split(" ", expand = True)[0]).dt.time
    df["Dep_Time"] = pd.to_datetime(df["Dep_Time"]).dt.hour
    
#     get duration integer by hour from Duration
    df['hours'] = df["Duration"].str.split(" ",expand = True,)[0].str[:-1].astype(int)
    df["min"] = df["Duration"].str.split(" ",expand = True)[1].str[:-1]
    # fill NaN values with 0

    df['min'].fillna(0, inplace=True)
    df['min'] = df['min'].astype(int)
    # combine hours and minutes into a single column
    df['duration'] = df.apply(lambda x: x['hours'] + x['min'] / 60, axis=1)

    # drop original columns
    df.drop(['hours', 'min','Arrival_Time',"Duration"], axis=1, inplace=True)
    
    # edit in total stops non-stop to 0 stop
    index_non_stop  = df[df['Total_Stops'] == 'non-stop'].index
    df.loc[index_non_stop,"Total_Stops"] = "0 stops"
    
    # make total stops numbers
    df['Total_Stops'] = df['Total_Stops'].str.split(" ",expand = True)[0].astype(int)
    
    #drop Additional_Info becouse no info most the column
    df.drop(['Additional_Info'], axis=1, inplace=True)
    #drop Route not intersted
    df.drop(['Route'], axis=1, inplace=True)
    # handle date of journy i found the year  just 2019 and the months 3,5,6,4 and that is unusfull not good
    df.drop(['Date_of_Journey'], axis=1, inplace=True)
    
    # handle Dep_Time to make parts in day
    def get_part_of_day(hour):
        if 5 <= hour < 12:
            return 'morning'
        elif 12 <= hour < 17:
            return 'afternoon'
        elif 17 <= hour < 21:
            return 'evening'
        else:
            return 'night'

    # apply function to datetime column and create new column for part of day
    df['part_of_day'] = df['Dep_Time'].apply(lambda x: get_part_of_day(x))
    
    df.drop(['Dep_Time','Arrival_Time_hour'], axis=1, inplace=True)
    return df

In [3]:
df = wrangle("Data_Train.xlsx")

In [4]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,duration,part_of_day
0,IndiGo,Banglore,New Delhi,0,3897,2.833333,night
1,Air India,Kolkata,Banglore,2,7662,7.416667,morning
2,Jet Airways,Delhi,Cochin,2,13882,19.0,morning
3,IndiGo,Kolkata,Banglore,1,6218,5.416667,evening
4,IndiGo,Banglore,New Delhi,1,13302,4.75,afternoon


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10682 entries, 0 to 10682
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Airline      10682 non-null  object 
 1   Source       10682 non-null  object 
 2   Destination  10682 non-null  object 
 3   Total_Stops  10682 non-null  int32  
 4   Price        10682 non-null  int64  
 5   duration     10682 non-null  float64
 6   part_of_day  10682 non-null  object 
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 883.9+ KB


In [6]:
def get_best_regression_model(df):
   
    #encoding
    cat_df = df.select_dtypes("object") 
    
    ohe = OneHotEncoder()
    ohe.fit(cat_df)
    cat_df_trans = ohe.transform(cat_df)
        
    df.drop(columns =cat_df.columns, inplace = True)
    
    df = pd.concat([cat_df_trans,df],axis=1) 
     # split
    X = df.drop(columns = ['Price'])
    y = df["Price"]
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define a list of regression models to evaluate
    models = [
        LinearRegression(),
        DecisionTreeRegressor(),
        Ridge(),
        Lasso(),
        SVR(),
        RandomForestRegressor()
    ]
    
    # Train and evaluate each model
    best_model = None
    best_score = -float('inf')
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = r2_score(y_test, y_pred)
        print(f"{type(model).__name__} Accuracy: {score}")
        
        if score > best_score:
            best_score = score
            best_model = model
    
    print(f"\nBest model: {type(best_model).__name__} with accuracy {best_score}")
    return best_model

In [7]:
get_best_regression_model(df)

LinearRegression Accuracy: 0.5909845227454431
DecisionTreeRegressor Accuracy: 0.5798649236683693
Ridge Accuracy: 0.5903908741165337


  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  model = cd_fast.enet_coordinate_descent(


Lasso Accuracy: 0.5907440286353451
SVR Accuracy: 0.1561416646647359
RandomForestRegressor Accuracy: 0.586595784231161

Best model: LinearRegression with accuracy 0.5909845227454431


LinearRegression()