In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [193]:

df = pd.read_csv("train.csv")
df

Unnamed: 0,train_id,latitude,longitude,timestamp,speed,track_id,signal_status,direction,distance_to_next_train,time_to_next_signal,weather_condition,signal_visible,brake_applied,anomaly
0,100000,18.717071,74.181285,2025-01-01 00:00:00,20,A,GREEN,W,0.41,128,rain,0,0,signal_violation
1,100001,18.664633,74.148304,2025-01-01 00:01:00,21,B,RED,S,1.62,43,rain,0,1,overspeed
2,100002,18.796684,74.251216,2025-01-01 00:02:00,93,B,GREEN,W,1.10,71,clear,0,1,none
3,100003,18.713585,74.230131,2025-01-01 00:03:00,61,B,GREEN,N,2.34,96,fog,0,0,none
4,100004,18.541285,74.041889,2025-01-01 00:04:00,30,A,GREEN,E,4.85,161,clear,0,0,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,100995,18.884646,74.021315,2025-01-01 00:35:00,92,C,RED,N,0.25,138,storm,0,1,signal_violation
996,100996,18.823811,74.268507,2025-01-01 00:36:00,87,B,GREEN,N,3.40,132,rain,1,1,none
997,100997,18.575424,73.877044,2025-01-01 00:37:00,86,C,YELLOW,S,4.32,45,rain,1,1,none
998,100998,18.929422,73.964736,2025-01-01 00:38:00,21,A,GREEN,S,2.18,88,storm,0,1,none


In [194]:
X = df.drop("anomaly", axis=1)
y = df["anomaly"]

In [195]:
X = X.drop(columns=["train_id", "timestamp", "latitude", "longitude"])

In [196]:
categorical = ["signal_status", "direction", "track_id", "weather_condition"]
numerical = [col for col in X.columns if col not in categorical]

In [197]:
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(), categorical)
], remainder="passthrough")


In [198]:
X

Unnamed: 0,speed,track_id,signal_status,direction,distance_to_next_train,time_to_next_signal,weather_condition,signal_visible,brake_applied
0,20,A,GREEN,W,0.41,128,rain,0,0
1,21,B,RED,S,1.62,43,rain,0,1
2,93,B,GREEN,W,1.10,71,clear,0,1
3,61,B,GREEN,N,2.34,96,fog,0,0
4,30,A,GREEN,E,4.85,161,clear,0,0
...,...,...,...,...,...,...,...,...,...
995,92,C,RED,N,0.25,138,storm,0,1
996,87,B,GREEN,N,3.40,132,rain,1,1
997,86,C,YELLOW,S,4.32,45,rain,1,1
998,21,A,GREEN,S,2.18,88,storm,0,1


In [199]:
label_map = {
    "none": 0,
    "signal_violation": 1,
    "track_conflict": 2,
    "overspeed": 3 
}
y = df["anomaly"].map(label_map) 

In [200]:
y

0      1
1      3
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    2
Name: anomaly, Length: 1000, dtype: int64

In [201]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((800, 9), (200, 9), (800,), (200,))

In [202]:
X_train_encoded= preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [203]:
from scipy.sparse import issparse
if issparse(X_train_encoded):
    X_train_encoded = X_train_encoded.toarray()
    X_test_encoded = X_test_encoded.toarray()

In [204]:
print(y.value_counts())
print(y.value_counts(normalize=True))  # percentage

anomaly
0    920
1     40
2     30
3     10
Name: count, dtype: int64
anomaly
0    0.92
1    0.04
2    0.03
3    0.01
Name: proportion, dtype: float64


In [205]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Apply SMOTE on training set only
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

In [206]:
models={
    "Decision Tree":DecisionTreeClassifier(class_weight="balanced"),
    "Random Forest":RandomForestClassifier(class_weight="balanced"),
    "Adaboost" :AdaBoostClassifier(),
  ##"Gradient Boost":GradientBoostingClassifier(),
    "XGB":XGBClassifier()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train_resampled, y_train_resampled)
    y_train_pred=model.predict(X_train_encoded)
    y_test_pred=model.predict(X_test_encoded)
    ## training test
    model_train_accuracy=accuracy_score(y_train, y_train_pred)
    model_train_f1=f1_score(y_train, y_train_pred,average="weighted")
    model_train_precision=precision_score(y_train, y_train_pred,average="weighted")
    model_train_recall=recall_score(y_train, y_train_pred,average="weighted")

    ##test performance
    model_test_accuracy=accuracy_score(y_test, y_test_pred)
    model_test_f1=f1_score(y_test, y_test_pred,average="weighted")
    model_test_precision=precision_score(y_test, y_test_pred,average="weighted")
    model_test_recall=recall_score(y_test, y_test_pred,average="weighted")
    
    print(list(models.keys())[i])
    print ("Model performance for training set data")
    print ("Accuracy: ", model_train_accuracy)
    print ("F1 Score: ", model_train_f1)
    print ("Precision: ", model_train_precision)
    print ("Recall: ", model_train_recall)
   
    print("---------------------------------------")
    print ("Model performance for test set data")
    print ("Accuracy: ", model_test_accuracy)
    print ("F1 Score: ", model_test_f1)
    print ("Precision: ", model_test_precision)
    print ("Recall: ", model_test_recall)
  
    print("---------------------------------------")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision Tree
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8938190213403938
Precision:  0.8912130541871922
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.91
F1 Score:  0.881413612565445
Precision:  0.8545685279187818
Recall:  0.91
---------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8938190213403938
Precision:  0.8912130541871922
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.91
F1 Score:  0.881413612565445
Precision:  0.8545685279187818
Recall:  0.91
---------------------------------------
Adaboost
Model performance for training set data
Accuracy:  0.37375
F1 Score:  0.49714206143199546
Precision:  0.8331559429003786
Recall:  0.37375
---------------------------------------
Model performance for test set data
Accuracy:  0.39
F1 Score:  0.5345173745173745
Precision:  0.8734236226276026
Recall:  0.39
---------------------------------------
XGB
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8955738865507793
Precision:  0.8900514907627582
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.895
F1 Score:  0.8737467018469657
Precision:  0.853479381443299

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [207]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_features': ['sqrt', 'log2']
}


In [208]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='f1_weighted'  # to handle imbalanced multi-class
)


In [209]:
# Correct this:
grid_search.fit(X_train_resampled, y_train_resampled)

# Already trained, now evaluate
best_model = grid_search.best_estimator_

y_test_pred = best_model.predict(X_test_encoded)

print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test F1 Score:", f1_score(y_test, y_test_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_test_pred, zero_division=0))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Test Accuracy: 0.91
Test F1 Score: 0.881413612565445
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95       185
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4

    accuracy                           0.91       200
   macro avg       0.23      0.25      0.24       200
weighted avg       0.85      0.91      0.88       200



In [None]:
import pandas as pd

rare_input = pd.DataFrame([{
    "speed": 180,                       # Excessively high speed
    "signal_distance": 0.5,            # Extremely close signal
    "train_length": 3000,              # Very long train
    "train_speed_limit": 100,          # Below current speed
    "distance_to_next_train": 0.05,     # Dangerously close
    "brake_applied": 0,                # No braking
    "time_to_next_signal": 1,          # Very little time
    "signal_visible": 0,               # Signal not visible
    "signal_status": "RED",            # Worst-case signal
    "direction": "N",             
    "track_id": "A",                  # Rare/unusual track ID
    "weather_condition": "storm"      # Worst-case weather
}])


In [213]:
rare_encoded = preprocessor.transform(rare_input)
if issparse(rare_encoded):
    rare_encoded = rare_encoded.toarray()

predicted_class = best_model.predict(rare_encoded)
print("Predicted Anomaly Class:", predicted_class)

Predicted Anomaly Class: [0]
