In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import networkx as nx

In [None]:
# Location of the raw delay dataset (Colab upload path).
DATA_PATH = "/content/train delay data.csv"

# Read the CSV once and preview the first rows to check the columns.
df = pd.read_csv(DATA_PATH)
display(df.head())

Unnamed: 0,Distance Between Stations (km),Weather Conditions,Time of Day,Train Type,Historical Delay (min),Route Congestion,Unnamed: 6
0,235,Rainy,Morning,Express,35,High,
1,260,Foggy,Afternoon,Superfast,45,Low,
2,240,Clear,Evening,Local,5,Medium,
3,265,Rainy,Night,Express,50,High,
4,245,Foggy,Morning,Superfast,60,Low,


In [None]:
# Remove the trailing 'Unnamed: 6' column (blank in the preview rows) and the
# 'Time of Day' column, which is not used as a feature. errors='ignore' makes
# the cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Unnamed: 6', 'Time of Day'], errors='ignore')

# Coerce the delay to numeric (unparseable entries become NaN) and treat missing
# values as "no delay". The result is assigned back to the column: calling
# fillna(..., inplace=True) on df[col] operates on an intermediate object and
# does not reliably update df (pandas warns it stops working in 3.0).
df['Historical Delay (min)'] = pd.to_numeric(
    df['Historical Delay (min)'], errors='coerce'
).fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Historical Delay (min)'].fillna(0, inplace=True)


In [None]:
# Columns that are one-hot encoded before reaching the classifier.
categorical_features = [
    'Weather Conditions',
    'Train Type',
    'Route Congestion',
]

# Columns that are passed to the classifier as raw numbers.
numeric_features = [
    'Distance Between Stations (km)',
]

In [None]:
# A train is labelled delayed (1) when its historical delay is strictly greater
# than this many minutes, otherwise on time (0).
DELAY_THRESHOLD_MIN = 5

# Vectorized comparison instead of a per-row Python lambda; NaN compares False
# and therefore maps to 0, exactly as the row-wise version did.
df['Delayed'] = (df['Historical Delay (min)'] > DELAY_THRESHOLD_MIN).astype(int)

In [None]:
# Target: the binary 'Delayed' label.
target_column = 'Delayed'
y = df[target_column]

# Features: everything except the label and the raw delay the label was derived
# from (keeping it would leak the answer into the model).
X = df.drop(columns=[target_column, 'Historical Delay (min)'])

In [None]:
# Fill gaps in the feature frame: categorical columns get the placeholder
# category "Unknown", numeric columns get their own median.
# NOTE(review): the medians are computed on all rows, before the train/test
# split, so test rows influence the fill value -- consider imputing inside the
# pipeline instead.
fill_values = {col: "Unknown" for col in categorical_features}
fill_values.update({col: X[col].median() for col in numeric_features})

for col, value in fill_values.items():
    X[col] = X[col].fillna(value)

In [None]:
# Per-column preprocessing. Categorical columns are one-hot encoded; with
# handle_unknown='ignore' a category not seen during fit does not raise at
# predict time. Numeric columns go through unchanged.
categorical_step = ('Categorial', OneHotEncoder(handle_unknown='ignore'), categorical_features)
numeric_step = ('Numeric', 'passthrough', numeric_features)

preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [None]:
# Random forest with a fixed seed so repeated runs give the same trees.
classifier = RandomForestClassifier(random_state=42)

# End-to-end estimator: preprocessing followed by the classifier, so raw
# feature frames can be passed straight to fit/predict.
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed. stratify=y keeps
# the delayed/on-time ratio the same in both splits; the classes are imbalanced,
# so an unstratified split can skew the minority-class share of the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split only.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and report overall accuracy plus per-class
# precision/recall/F1.
y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.9357638888888888

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.77      0.80        94
           1       0.96      0.97      0.96       482

    accuracy                           0.94       576
   macro avg       0.89      0.87      0.88       576
weighted avg       0.93      0.94      0.93       576



In [None]:
def predict_delay(distance, weather, train_type, congestion, estimator=None):
    """Predict whether a single journey will be delayed.

    Args:
        distance: Value for the 'Distance Between Stations (km)' feature.
        weather: Value for the 'Weather Conditions' feature.
        train_type: Value for the 'Train Type' feature.
        congestion: Value for the 'Route Congestion' feature.
        estimator: Optional object with a ``predict(DataFrame)`` method.
            Defaults to the notebook's fitted ``model`` pipeline.

    Returns:
        A human-readable message: delayed when the predicted label equals 1,
        on time otherwise.
    """
    # Resolve the global lazily so a custom estimator can be used without `model`.
    clf = model if estimator is None else estimator

    # One-row frame whose column names match the training features.
    input_data = pd.DataFrame([{
        'Distance Between Stations (km)': distance,
        'Weather Conditions': weather,
        'Train Type': train_type,
        'Route Congestion': congestion
    }])

    prediction = clf.predict(input_data)[0]
    return "🚂 Train will be DELAYED" if prediction == 1 else "✅ Train will be ON TIME"

In [None]:
# Two sample journeys: (distance km, weather, train type, congestion).
sample_journeys = [
    (100, "Clear", "Express", "Medium"),
    (50, "Rainy", "Superfast", "Medium"),
]
for journey in sample_journeys:
    print(predict_delay(*journey))

✅ Train will be ON TIME
🚂 Train will be DELAYED


In [None]:
# Railway network setup: one (origin, destination, weight) triple per directed
# leg. The list order is kept as-is because it determines adjacency order in
# the graph.
# NOTE(review): the weights look like distances in km, yet
# suggest_route_or_halt compares their sum with a scheduled time -- confirm the
# intended unit.
edges = [
    ("Delhi", "Kanpur", 300),
    ("Delhi", "Agra", 180),
    ("Agra", "Kanpur", 150),
    ("Kanpur", "Allahabad", 200),
    ("Agra", "Allahabad", 280),
    ("Delhi", "Jaipur", 270),
    ("Jaipur", "Agra", 210),
    ("Allahabad", "Varanasi", 120),
    ("Kanpur", "Varanasi", 180),
]

# Directed graph: a leg can only be travelled origin -> destination.
G = nx.DiGraph()
G.add_weighted_edges_from(edges, weight="weight")

In [None]:
#Route or Halt Suggestion
def suggest_route_or_halt(source, destination, scheduled_time, delay_prediction, max_allowed_delay=30):
    """Recommend a route, or a halt, for a journey on the global graph ``G``.

    Args:
        source: Departure node in ``G``.
        destination: Arrival node in ``G``.
        scheduled_time: Planned journey duration, compared directly against
            summed edge weights (so it must use the same unit as the weights).
        delay_prediction: Truthy when the train is predicted to be delayed.
        max_allowed_delay: Tolerated overrun beyond ``scheduled_time``.

    Returns:
        A dict whose ``"status"`` is one of:

        * ``"No path available"`` -- either station is missing from ``G`` or
          the destination is unreachable from the source.
        * ``"On time"`` -- no delay predicted; includes the fastest ``"path"``
          and its ``"travel_time"``.
        * ``"Alternate route suggested"`` -- delay predicted and the fastest
          route fits within ``scheduled_time + max_allowed_delay``.
        * ``"Halt required"`` -- delay predicted and even the fastest route
          exceeds the budget; ``"halt_time"`` is the excess over the budget.
    """
    try:
        best_time = nx.dijkstra_path_length(G, source, destination, weight="weight")
        best_path = nx.dijkstra_path(G, source, destination, weight="weight")
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # NodeNotFound covers a source that is not in the graph; an unknown or
        # unreachable destination surfaces as NetworkXNoPath.
        return {"status": "No path available"}

    if not delay_prediction:
        return {"status": "On time", "path": best_path, "travel_time": best_time}

    # With non-negative weights the Dijkstra result is the minimum over every
    # simple path, so some route fits the budget exactly when the fastest one
    # does. Returning it avoids both enumerating all simple paths and handing
    # back a slower route just because it was enumerated first.
    if best_time <= scheduled_time + max_allowed_delay:
        return {"status": "Alternate route suggested", "path": best_path, "travel_time": best_time}

    # No route fits: report how far the fastest one overshoots the budget.
    delay_time = best_time - scheduled_time
    halt_time = max(0, delay_time - max_allowed_delay)
    return {"status": "Halt required", "path": best_path, "halt_time": halt_time, "travel_time": best_time}

In [None]:
def predict_train_status(train_details, source, destination, scheduled_time):
    """Combine the delay classifier with the routing suggestion for one train.

    Args:
        train_details: dict holding at least every column of ``X``; extra keys
            are dropped before prediction.
        source: Departure station passed on to ``suggest_route_or_halt``.
        destination: Arrival station passed on to ``suggest_route_or_halt``.
        scheduled_time: Planned journey time (minutes).

    Returns:
        dict with ``"delay_prediction"`` ("Delayed" or "On Time") and
        ``"suggestion"`` (the dict returned by ``suggest_route_or_halt``).
    """
    # Single-row frame restricted to, and ordered like, the training features.
    features = pd.DataFrame([train_details])[X.columns]

    # The model emits 0 (on time) or 1 (delayed) for the single row.
    is_delayed = bool(model.predict(features)[0])
    label = "Delayed" if is_delayed else "On Time"

    return {
        "delay_prediction": label,
        "suggestion": suggest_route_or_halt(source, destination, scheduled_time, is_delayed),
    }

In [None]:
# Sample journeys for an end-to-end check. predict_train_status keeps only the
# columns present in X, so "Route" and "Historical Delay (min)" do not reach the
# model; "Route" is used here solely to pick the stations.
test_cases = [
    # Clear, low congestion, no history of delay → should be On Time
    {
        "Train Type": "Local",
        "Route": "Delhi-Agra",
        "Historical Delay (min)": 0,
        "Weather Conditions": "Clear",
        "Route Congestion": "Low",
        "Distance Between Stations (km)": 200
    },
    # Bad weather + high congestion → likely Delayed
    {
        "Train Type": "Express",
        "Route": "Delhi-Varanasi",
        "Historical Delay (min)": 20,
        "Weather Conditions": "Stormy",
        "Route Congestion": "High",
        "Distance Between Stations (km)": 800
    },
    # Long distance but good conditions → balanced
    {
        "Train Type": "Superfast",
        "Route": "Delhi-Mumbai",
        "Historical Delay (min)": 10,
        "Weather Conditions": "Clear",
        "Route Congestion": "Medium",
        "Distance Between Stations (km)": 1400
    },
    # Express in fog with high congestion and a large historical delay
    {
        "Train Type": "Express",
        "Route": "Delhi-Kanpur",
        "Historical Delay (min)": 50,
        "Weather Conditions": "Foggy",
        "Route Congestion": "High",
        "Distance Between Stations (km)": 500
    }
]

for case in test_cases:
    # Read both endpoints from the "Source-Destination" label rather than
    # assuming every route starts in Delhi.
    stations = case["Route"].split("-")
    source, destination = stations[0], stations[1]
    result = predict_train_status(case, source, destination, scheduled_time=600)
    print(case["Route"], "→", result)


Delhi-Agra → {'delay_prediction': 'On Time', 'suggestion': {'status': 'On time', 'path': ['Delhi', 'Agra'], 'travel_time': 180}}
Delhi-Varanasi → {'delay_prediction': 'Delayed', 'suggestion': {'status': 'Alternate route suggested', 'path': ['Delhi', 'Kanpur', 'Allahabad', 'Varanasi'], 'travel_time': 620}}
Delhi-Mumbai → {'delay_prediction': 'Delayed', 'suggestion': {'status': 'No path available'}}
Delhi-Kanpur → {'delay_prediction': 'Delayed', 'suggestion': {'status': 'Alternate route suggested', 'path': ['Delhi', 'Kanpur'], 'travel_time': 300}}
