<a href="https://colab.research.google.com/github/Eyad-dev/UK_Train_Rides/blob/main/(ML)_UK_Train_Rides.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing libraries for predictions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import joblib

In [None]:
# Load the raw railway-ticket CSV (path is relative to the notebook's working
# directory) into the DataFrame every later cell reads from.
df = pd.read_csv(filepath_or_buffer="Depi project railway (1).csv")

In [None]:
# Bare expression: renders the DataFrame as this cell's output so the loaded
# columns and a few rows can be inspected.
df

Unnamed: 0,Transaction ID,Date of Purchase,Time of Purchase,Purchase Type,Payment Method,Railcard,Ticket Class,Price,Departure Station,Arrival Destination,...,Delay in Minutes,Refund Amount,Revenue,Days Between Purchase and Journey,Is Delayed,Month Name,Day Name,Is Weekend,Day Journey Period,Day Purchase Period
0,da8a6ba8-b3dc-4677-b176,08-12-23,12:41:11 PM,Online,Contactless,Adult,Standard,43,London Paddington,Liverpool Lime Street,...,0,0,43,24,No,January,Monday,No,Morning,Evening
1,b0cdd1b0-f214-4197-be53,16-12-23,11:23:01 AM,Station,Credit Card,Adult,Standard,23,London Kings Cross,York,...,5,0,23,16,Yes,January,Monday,No,Morning,Morning
2,f3ba7a96-f713-40d9-9629,19-12-23,7:51:27 PM,Online,Credit Card,No Railcard,Standard,3,Liverpool Lime Street,Manchester Piccadilly,...,0,0,3,14,No,January,Tuesday,No,Night,Night
3,b2471f11-4fe7-4c87-8ab4,20-12-23,11:00:36 PM,Station,Credit Card,No Railcard,Standard,13,London Paddington,Reading,...,0,0,13,12,No,January,Monday,No,Night,Night
4,2be00b45-0762-485e-a7a3,27-12-23,6:22:56 PM,Online,Contactless,No Railcard,Standard,76,Liverpool Lime Street,London Euston,...,0,0,76,5,No,January,Monday,No,Evening,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31648,1304623d-b8b7-4999-8e9c,30-04-24,6:42:58 PM,Online,Credit Card,No Railcard,Standard,4,Manchester Piccadilly,Liverpool Lime Street,...,0,0,4,0,No,April,Tuesday,No,Night,Night
31649,7da22246-f480-417c-bc2f,30-04-24,6:46:10 PM,Online,Contactless,No Railcard,Standard,10,London Euston,Birmingham New Street,...,0,0,10,0,No,April,Tuesday,No,Night,Night
31650,add9debf-46c1-4c75-b52d,30-04-24,6:56:41 PM,Station,Credit Card,No Railcard,Standard,4,Manchester Piccadilly,Liverpool Lime Street,...,0,0,4,0,No,April,Tuesday,No,Night,Night
31651,b92b047c-21fd-4859-966a,30-04-24,7:51:47 PM,Station,Credit Card,No Railcard,Standard,10,London Euston,Birmingham New Street,...,0,0,10,0,No,April,Tuesday,No,Night,Night


In [None]:
# Print each column's name, non-null count and dtype to check for missing
# values and column types before selecting features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Transaction ID                     31653 non-null  object
 1   Date of Purchase                   31653 non-null  object
 2   Time of Purchase                   31653 non-null  object
 3   Purchase Type                      31653 non-null  object
 4   Payment Method                     31653 non-null  object
 5   Railcard                           31653 non-null  object
 6   Ticket Class                       31653 non-null  object
 7   Price                              31653 non-null  int64 
 8   Departure Station                  31653 non-null  object
 9   Arrival Destination                31653 non-null  object
 10  Date of Journey                    31653 non-null  object
 11  Departure Time                     31653 non-null  object
 12  Arri

In [None]:
# Predictor columns fed to the models, one per line for easy editing.
features = [
    "Departure Station",
    "Arrival Destination",
    "Day Journey Period",
    "Day Name",
    "Is Weekend",
    "Ticket Class",
    "Payment Method",
    "Days Between Purchase and Journey",
    "Price",
]

# Name of the label column the models learn to predict.
target_col = "Is Delayed"

In [None]:
# Feature matrix: an independent copy of the predictor columns, so later edits
# to x do not touch df.
x = df.loc[:, features].copy()

# Target vector: the label column holds the strings "Yes"/"No"; encode them as
# 1/0 so the classifiers receive a numeric label. Any value outside this
# mapping would become NaN.
y = df[target_col].map({"Yes": 1, "No": 0})

In [None]:
# The station columns can hold many distinct values. Keep only the 20 most
# frequent stations in each column and bucket every other station as "Other",
# which limits the number of one-hot columns the pipeline has to build.
#
# The counts and the mask are taken from the untouched df column rather than
# from x itself: x[col] is overwritten below, so reading from it would make a
# second run of this cell re-rank the already-bucketed values (including
# "Other") and change the result. Reading from df keeps the cell idempotent.
for col in ["Departure Station", "Arrival Destination"]:
    raw_station = df[col]
    top_vals = raw_station.value_counts().nlargest(20).index
    x[col] = raw_station.where(raw_station.isin(top_vals), other="Other")

In [None]:
# Split the predictors by type so each group gets the right preprocessing.
# Every entry of `features` must appear in exactly one of these lists: a
# ColumnTransformer drops any column it is not told about (remainder="drop" is
# its default), which is how "Day Name" was previously left out of the model.
categorical_cols = [
    "Departure Station",
    "Arrival Destination",
    "Day Journey Period",
    "Day Name",
    "Is Weekend",
    "Ticket Class",
    "Payment Method",
]
numeric_cols = ["Days Between Purchase and Journey", "Price"]

In [None]:
# Column-wise preprocessing shared by the model pipelines:
#   "cat" - one-hot encode the categorical columns as a dense array; categories
#           not seen during fit are ignored instead of raising.
#   "num" - standardise the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_cols,
        ),
        ("num", StandardScaler(), numeric_cols),
    ]
)

In [None]:
# Draw a reproducible (seed 42) random 20% subset of the rows, then build the
# matching feature frame and 0/1-encoded target exactly as done for x and y.
df_sample = df.sample(random_state=42, frac=0.2)
x_sample = df_sample.loc[:, features].copy()
y_sample = df_sample[target_col].map({"Yes": 1, "No": 0})

In [None]:
# Bucket rare stations in the sample: keep the sample's own 20 most frequent
# stations per column and replace everything else with "Other".
#
# The mask must be built from the sample's column. Building it from x[col]
# (the full, already-bucketed frame) compared the wrong values against the
# sample's top-20 list. Counts and mask are read from df_sample, which this
# cell never modifies, so re-running it gives the same result.
for col in ["Departure Station", "Arrival Destination"]:
    raw_station = df_sample[col]
    top_vals = raw_station.value_counts().nlargest(20).index
    x_sample[col] = raw_station.where(raw_station.isin(top_vals), other="Other")

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded (42) and
# stratified on y so both parts keep the same delayed / not-delayed ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# KNN pipeline: shared preprocessing followed by a 5-neighbour classifier.
pipe_knn = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("clf", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [None]:
# Fit the KNN pipeline on the training split and predict the held-out split.
pipe_knn.fit(x_train, y_train)
y_pred = pipe_knn.predict(x_test)

# x_train / x_test come from train_test_split(x, y, test_size=0.2), i.e. from
# the full dataset and not from df_sample, so the header names the 20%
# hold-out test split that these metrics are actually computed on.
print("=== K-Nearest Neighbors (20% hold-out test split) ===")
# zero_division=0 reports 0 (without a warning) for any metric whose
# denominator is zero.
print(classification_report(y_test, y_pred, zero_division=0))

=== K-Nearest Neighbors (on 20% sample) ===
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5873
           1       0.82      0.59      0.69       458

    accuracy                           0.96      6331
   macro avg       0.90      0.79      0.83      6331
weighted avg       0.96      0.96      0.96      6331



In [None]:
# Show the first six feature rows as plain text (row index hidden), preceded
# by a blank line and a caption.
print(
    "\nSample of raw features (first 6 rows):",
    x.head(6).to_string(index=False),
    sep="\n",
)


Sample of raw features (first 6 rows):
    Departure Station   Arrival Destination Day Journey Period Day Name Is Weekend Ticket Class Payment Method  Days Between Purchase and Journey  Price
    London Paddington Liverpool Lime Street            Morning   Monday         No     Standard    Contactless                                 24     43
   London Kings Cross                  York            Morning   Monday         No     Standard    Credit Card                                 16     23
Liverpool Lime Street Manchester Piccadilly              Night  Tuesday         No     Standard    Credit Card                                 14      3
    London Paddington               Reading              Night   Monday         No     Standard    Credit Card                                 12     13
Liverpool Lime Street         London Euston            Evening   Monday         No     Standard    Contactless                                  5     76
   London Kings Cross                  Yor

In [None]:
# Reuse a previously saved model if model.pkl exists; otherwise start from a
# fresh, unfitted random forest.
# NOTE: joblib.load unpickles the file, so only load a model.pkl you trust.
try:
    model = joblib.load("model.pkl")
except FileNotFoundError:
    # No saved model yet.
    model = RandomForestClassifier()
    print("Created new Model")
else:
    print("Loaded")

Created new Model


In [None]:
# Random-forest pipeline using the same preprocessing step as pipe_knn.
pipe_rf = Pipeline(steps=[
    ("pre", preprocessor),
    # Seeded so that Restart & Run All rebuilds the same forest; 42 matches the
    # seed already used for sampling and for the train/test split.
    ("clf", RandomForestClassifier(random_state=42))
])

# Train on the training split; as the last expression, the fitted pipeline is
# also rendered as this cell's output.
pipe_rf.fit(x_train, y_train)

In [None]:
# Persist the fitted random-forest pipeline (preprocessing + classifier).
# pipe_rf is the object that was actually trained; `model` from the
# load-or-create cell is never fitted in the cells shown here, so saving it
# would write an estimator that cannot predict.
joblib.dump(pipe_rf, "model.pkl")

['model.pkl']

In [None]:
# Read the saved file back and display it, as a check that model.pkl was
# written and can be deserialised.
# NOTE: joblib.load unpickles the file, so only load a model.pkl you trust.
joblib.load("model.pkl")