In [21]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [4]:
# Load the raw fare records from disk and preview the first five rows
df = pd.read_csv("price_data.csv")
df.head()

Unnamed: 0,baseFare,reservationCharge,superfastCharge,fuelAmount,totalConcession,tatkalFare,serviceTax,otherCharge,cateringCharge,dynamicFare,totalFare,availability,trainNumber,timeStamp,fromStnCode,toStnCode,classCode,distance,duration
0,1059,60,0,0.0,0,0,56.0,0,0,0,1175,"[{'date': '2-12-2023', 'status': 'AVAILABLE-00...",11464,2023-10-03 22:13:07.781307,JBP,SRID,1A,54,33.0
1,626,50,0,0.0,0,0,34.0,0,0,0,710,"[{'date': '2-12-2023', 'status': 'AVAILABLE-00...",11464,2023-10-03 22:13:07.781307,JBP,SRID,2A,54,33.0
2,441,40,0,0.0,0,0,24.0,0,0,0,505,"[{'date': '2-12-2023', 'status': 'AVAILABLE-01...",11464,2023-10-03 22:13:07.781307,JBP,SRID,3A,54,33.0
3,125,20,0,0.0,0,0,0.0,0,0,0,145,"[{'date': '2-12-2023', 'status': 'AVAILABLE-00...",11464,2023-10-03 22:13:07.781307,JBP,SRID,SL,54,33.0
4,1059,60,0,0.0,0,0,56.0,0,0,0,1175,"[{'date': '2-12-2023', 'status': 'AVAILABLE-00...",11464,2023-10-03 22:13:07.781307,JBP,KKB,1A,69,49.0


In [6]:
# A few of the charge columns were found to be redundant, so they are dropped.
# The frame is reassigned rather than mutated in place, and errors='ignore' is
# passed, so this cell can be re-run safely: the original in-place drop raised
# KeyError on a second execution because the columns were already gone.
df = df.drop(
    columns=['fuelAmount', 'totalConcession', 'tatkalFare', 'otherCharge'],
    errors='ignore',
)

In [7]:
# Count the rows for every (classCode, reservationCharge) pair to check how
# closely the reservation charge is tied to the class code
df_res = (
    df.groupby(['classCode', 'reservationCharge'])['totalFare']
    .count()
)
lst_res = list(df_res.index)
df_res

classCode  reservationCharge
1A         20                     388
           60                   36763
2A         50                   83630
           60                     389
2S         15                    9398
3A         40                   88049
           50                     389
CC         40                   10136
SL         20                   97113
           40                     388
Name: totalFare, dtype: int64

In [8]:
# Restrict to rows that carry a superfast charge, then count the rows for every
# (trainNumber, classCode, superfastCharge) combination
df_sup = (
    df.loc[df['superfastCharge'] > 0]
    .groupby(['trainNumber', 'classCode', 'superfastCharge'])['totalFare']
    .count()
)
lst_sup = list(df_sup.index)
df_sup

trainNumber  classCode  superfastCharge
1127         2A         45                  41
             3A         45                  41
             SL         30                  41
1128         2A         45                  99
             3A         45                  99
                                          ... 
22644        3A         45                 455
             SL         30                 455
22645        2A         45                 272
             3A         45                 272
             SL         30                 272
Name: totalFare, Length: 1021, dtype: int64

In [9]:
# Reduce the catering and dynamic-fare amounts to boolean indicator columns.
# The exact amounts are deliberately discarded to keep the model simple.
df['if_offering_catering'] = df['cateringCharge'].gt(0)
df['if_dynamic_fare'] = df['dynamicFare'].gt(0)

In [10]:
# The regression target is the total fare with the fixed reservation and
# superfast charges removed; those charges are meant to be added back after
# the model has been invoked
df['fare_to_predict'] = (
    df['totalFare']
    .sub(df['reservationCharge'])
    .sub(df['superfastCharge'])
)

In [11]:
# Keep only the columns used as model inputs; the target is taken separately
df_now = df.loc[
    :,
    ['classCode', 'if_offering_catering', 'if_dynamic_fare', 'distance', 'duration'],
]
y = df['fare_to_predict']

In [12]:
# One-hot encode the class code, append the indicator columns to the inputs,
# and remove the original string column
dummies = pd.get_dummies(df_now.classCode)
merged = pd.concat([df_now, dummies], axis=1)
X = merged.drop(columns='classCode')

In [13]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts; the
# original call drew a different split on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Row labels of the held-out samples
lst = X_test.index.tolist()

In [16]:
# Display the feature columns (and their order) of the training matrix
X_train.columns

Index(['if_offering_catering', 'if_dynamic_fare', 'distance', 'duration', '1A',
       '2A', '2S', '3A', 'CC', 'SL'],
      dtype='object')

In [22]:
# Shuffle the training rows before building the model.
# A locally seeded generator replaces the unseeded global np.random.shuffle so
# the resulting row order is the same on every run. X_train and y_train are
# re-indexed with the same permuted labels so they stay aligned.
idxs = np.random.default_rng(42).permutation(X_train.index.to_numpy()).tolist()
X_train = X_train.loc[idxs]
y_train = y_train.loc[idxs]

In [18]:
# Show the first training row as a sanity check of the feature layout
X_train.iloc[:1]

Unnamed: 0,if_offering_catering,if_dynamic_fare,distance,duration,1A,2A,2S,3A,CC,SL
222453,False,False,880,957.0,False,True,False,False,False,False


In [23]:
# Model specification: the estimator to tune and the hyper-parameter grid that
# the search samples from. random_state is fixed on the estimator so repeated
# fits with the same parameters build the same trees.
model_spec = {
    'model': GradientBoostingRegressor(random_state=42),
    'params': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
    },
}

In [27]:
# Train the model with a randomised hyper-parameter search: 5 parameter
# combinations are sampled from the grid and each is scored with 3-fold
# cross-validation. random_state pins which combinations are sampled, so the
# selected model no longer changes from run to run.
model = RandomizedSearchCV(
    model_spec['model'],
    model_spec['params'],
    n_iter=5,
    cv=3,
    return_train_score=False,
    random_state=42,
)
clf = model  # alias retained from the original chained assignment
model.fit(X_train, y_train)

In [30]:
# Generate predictions for the held-out test rows and show them.
# NOTE: accuracy_score and confusion_matrix are imported here but not used in
# this cell.
from sklearn.metrics import accuracy_score, confusion_matrix
y_pred = model.predict(X_test)
print("Predicted : ", y_pred)

Predicted :  [ 681.64738257  693.63321458  429.65826522 ...  294.5734711    87.83629212
 1342.7058913 ]


In [32]:
# Evaluate the predictions on the held-out set with the coefficient of
# determination (R^2). R^2 is a unitless score (1.0 is a perfect fit), not an
# accuracy and not a percentage. The original message appended '%' to the raw
# fraction, which read as "0.98%" for a score of 0.98.
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
print("The R^2 score of our model on the test set is {:.4f}".format(score))

The accuracy of our model is 0.9775302651738682%


In [34]:
# Save the fitted search object (RandomizedSearchCV wrapping the tuned
# GradientBoostingRegressor) to disk.
import pickle
filename = 'GradientBoostingRegressor.pkl'
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open() handle to pickle.dump and never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
 

In [64]:
# Show the first training target value
y_train.iloc[:1]

81328    4505
Name: fare_to_predict, dtype: int64

In [63]:
# Predict the fare for one hand-built use case.
#
# The example is built as a one-row DataFrame keyed by feature name and then
# ordered by X_train.columns. The original passed a bare NumPy array, which
# coerced the booleans to integers, dropped the feature names the model was
# fitted with, and silently depended on the values being typed in exactly the
# training column order.
example = {
    'if_offering_catering': False,
    'if_dynamic_fare': False,
    'distance': 1755,
    'duration': 1620,
    '1A': True,
    '2A': False,
    '2S': False,
    '3A': False,
    'CC': False,
    'SL': False,
}

# Selecting by the training columns raises KeyError if a feature is missing
input_features = pd.DataFrame([example])[list(X_train.columns)]

# The model was trained on fare_to_predict, i.e. the total fare minus the
# reservation and superfast charges, so those fixed charges are not included
# in this result.
result = model.predict(input_features)

# Print or use the result as needed
print(result)


[4521.6552083]


