In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


In [11]:

df = pd.read_csv("train.csv")
df

Unnamed: 0,train_id,timestamp,latitude,longitude,speed,signal_distance,train_length,train_speed_limit,distance_to_next_train,brake_applied,time_to_next_signal,signal_visible,signal_status,direction,track_id,weather_condition,anomaly
0,0,2023-01-01 00:00:00,21.872701,75.925665,66.830261,2.704525,404,80,0.751705,0,2,1,RED,N,A,rain,none
1,1,2023-01-01 01:00:00,24.753572,77.709505,67.596795,3.482149,106,60,1.641089,0,7,1,GREEN,E,C,fog,none
2,2,2023-01-01 02:00:00,23.659970,79.364729,76.602817,3.510730,465,60,0.014284,1,2,1,GREEN,E,B,fog,track_conflict
3,3,2023-01-01 03:00:00,22.993292,78.661124,85.510483,0.858470,1757,120,0.913461,1,1,0,YELLOW,S,C,fog,none
4,4,2023-01-01 04:00:00,20.780093,79.032806,93.703769,2.500564,622,120,0.658461,0,7,0,GREEN,E,C,rain,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2023-02-11 11:00:00,20.457910,78.284776,71.528390,3.832637,1251,100,0.905509,1,8,0,YELLOW,E,A,fog,none
996,996,2023-02-11 12:00:00,24.586568,79.783073,95.320749,3.510537,631,60,1.671080,1,3,1,YELLOW,W,A,fog,overspeed
997,997,2023-02-11 13:00:00,20.684093,75.344790,111.326642,1.656761,1534,120,1.923226,1,7,0,RED,S,A,rain,none
998,998,2023-02-11 14:00:00,24.751187,75.285274,56.210250,3.378315,939,60,1.550164,0,2,0,YELLOW,E,C,rain,none


In [3]:
X = df.drop("anomaly", axis=1)
y = df["anomaly"]

In [4]:
X = X.drop(columns=["train_id", "timestamp", "latitude", "longitude"])

In [5]:
categorical = ["signal_status", "direction", "track_id", "weather_condition"]
numerical = [col for col in X.columns if col not in categorical]

In [6]:
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(), categorical)
], remainder="passthrough")


In [7]:
X

Unnamed: 0,speed,signal_distance,train_length,train_speed_limit,distance_to_next_train,brake_applied,time_to_next_signal,signal_visible,signal_status,direction,track_id,weather_condition
0,68.299473,2.556419,973,40,0.298496,0,1.538540,1,YELLOW,N,A,storm
1,78.658785,2.377995,1346,60,0.492183,0,7.120804,0,YELLOW,N,D,rain
2,53.781117,0.845134,471,40,1.308826,0,6.706579,0,YELLOW,W,A,clear
3,45.431326,4.978277,869,100,0.714574,0,4.975193,0,GREEN,E,B,clear
4,64.744021,4.759222,509,80,1.496192,1,4.643260,1,GREEN,S,A,clear
...,...,...,...,...,...,...,...,...,...,...,...,...
995,81.628222,0.758102,317,40,1.615466,1,2.415200,0,RED,S,A,rain
996,42.197708,0.617569,802,40,1.707755,0,1.328263,1,YELLOW,S,B,clear
997,56.516340,1.847549,335,60,0.585314,1,5.061723,1,RED,S,B,storm
998,66.544351,0.091636,1663,60,0.373674,0,4.428866,0,YELLOW,N,C,clear


In [8]:
label_map = {
    "none": 0,
    "signal_violation": 1,
    "track_conflict": 2,
    "overspeed": 3 
}
y = df["anomaly"].map(label_map) 

In [9]:
y

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: anomaly, Length: 1000, dtype: int64

In [10]:
from sklearn.utils import resample
import numpy as np
import pandas as pd

# Convert training data to DataFrame for processing
train_df = pd.DataFrame(X_train_encoded)
train_df['label'] = y_train.values

# Split into normal and anomaly
train_normal = train_df[train_df['label'] == 0]
train_anomaly = train_df[train_df['label'] != 0]

# Upsample anomaly to match normal
train_anomaly_upsampled = resample(train_anomaly,
                                   replace=True,
                                   n_samples=len(train_normal),
                                   random_state=42)

# Combine and shuffle
train_balanced = pd.concat([train_normal, train_anomaly_upsampled])
train_balanced = train_balanced.sample(frac=1, random_state=42)

# Extract X and y
X_train_resampled = train_balanced.drop("label", axis=1).values
y_train_resampled = train_balanced["label"].values

# ==============================
# Upsampling TEST set as well (optional & not recommended)
# ==============================

# Convert test data to DataFrame
test_df = pd.DataFrame(X_test_encoded)
test_df['label'] = y_test.values

# Split into normal and anomaly
test_normal = test_df[test_df['label'] == 0]
test_anomaly = test_df[test_df['label'] != 0]

# Upsample anomaly to match normal
test_anomaly_upsampled = resample(test_anomaly,
                                  replace=True,
                                  n_samples=len(test_normal),
                                  random_state=42)

# Combine and shuffle
test_balanced = pd.concat([test_normal, test_anomaly_upsampled])
test_balanced = test_balanced.sample(frac=1, random_state=42)

# Extract X and y
X_test_resampled = test_balanced.drop("label", axis=1).values
y_test_resampled = test_balanced["label"].values

# ==============================
# Check distributions (optional)
# ==============================
print("Training set class balance:\n", pd.Series(y_train_resampled).value_counts(normalize=True))
print("Test set class balance:\n", pd.Series(y_test_resampled).value_counts(normalize=True))


NameError: name 'X_train_encoded' is not defined

In [None]:
X_train_encoded= preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [None]:
from scipy.sparse import issparse
if issparse(X_train_encoded):
    X_train_encoded = X_train_encoded.toarray()
    X_test_encoded = X_test_encoded.toarray()

In [None]:
print(y.value_counts())
print(y.value_counts(normalize=True))  # percentage

anomaly
0    920
1     40
2     30
3     10
Name: count, dtype: int64
anomaly
0    0.92
1    0.04
2    0.03
3    0.01
Name: proportion, dtype: float64


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Apply SMOTE on training set only
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

In [None]:
models={
    "Decision Tree":DecisionTreeClassifier(class_weight="balanced"),
    "Random Forest":RandomForestClassifier(class_weight="balanced"),
    "Adaboost" :AdaBoostClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "XGB":XGBClassifier()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train_resampled, y_train_resampled)
    y_train_pred=model.predict(X_train_encoded)
    y_test_pred=model.predict(X_test_encoded)
    ## training test
    model_train_accuracy=accuracy_score(y_train, y_train_pred)
    model_train_f1=f1_score(y_train, y_train_pred,average="weighted")
    model_train_precision=precision_score(y_train, y_train_pred,average="weighted")
    model_train_recall=recall_score(y_train, y_train_pred,average="weighted")

    ##test performance
    model_test_accuracy=accuracy_score(y_test, y_test_pred)
    model_test_f1=f1_score(y_test, y_test_pred,average="weighted")
    model_test_precision=precision_score(y_test, y_test_pred,average="weighted")
    model_test_recall=recall_score(y_test, y_test_pred,average="weighted")
    
    print(list(models.keys())[i])
    print ("Model performance for training set data")
    print ("Accuracy: ", model_train_accuracy)
    print ("F1 Score: ", model_train_f1)
    print ("Precision: ", model_train_precision)
    print ("Recall: ", model_train_recall)
   
    print("---------------------------------------")
    print ("Model performance for test set data")
    print ("Accuracy: ", model_test_accuracy)
    print ("F1 Score: ", model_test_f1)
    print ("Precision: ", model_test_precision)
    print ("Recall: ", model_test_recall)
  
    print("---------------------------------------")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision Tree
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8938190213403938
Precision:  0.8912130541871922
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.91
F1 Score:  0.881413612565445
Precision:  0.8545685279187818
Recall:  0.91
---------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8938190213403938
Precision:  0.8912130541871922
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.91
F1 Score:  0.881413612565445
Precision:  0.8545685279187818
Recall:  0.91
---------------------------------------
Adaboost
Model performance for training set data
Accuracy:  0.37375
F1 Score:  0.49714206143199546
Precision:  0.8331559429003786
Recall:  0.37375
---------------------------------------
Model performance for test set data
Accuracy:  0.39
F1 Score:  0.5345173745173745
Precision:  0.8734236226276026
Recall:  0.39
---------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gradient Boost
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8938190213403938
Precision:  0.8912130541871922
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.91
F1 Score:  0.881413612565445
Precision:  0.8545685279187818
Recall:  0.91
---------------------------------------
XGB
Model performance for training set data
Accuracy:  0.9175
F1 Score:  0.8955738865507793
Precision:  0.8900514907627582
Recall:  0.9175
---------------------------------------
Model performance for test set data
Accuracy:  0.895
F1 Score:  0.8737467018469657
Precision:  0.853479381443299
Recall:  0.895
---------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],  # instead of 5 values
    'max_depth': [5, 10],        # instead of many
}


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gb = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='f1_weighted'  # to handle imbalanced multi-class
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train the grid search model on resampled (balanced) data
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best estimator
best_model = grid_search.best_estimator_

# Predict on the actual test set (original, not resampled)
y_test_pred = best_model.predict(X_test_encoded)

# Evaluate
print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test F1 Score:", f1_score(y_test, y_test_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_test_pred, zero_division=0))


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters: {'max_depth': 5, 'n_estimators': 200}
Test Accuracy: 0.895
Test F1 Score: 0.8737467018469657
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94       185
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4

    accuracy                           0.90       200
   macro avg       0.23      0.24      0.24       200
weighted avg       0.85      0.90      0.87       200



In [None]:
import pandas as pd

rare_input = pd.DataFrame([{
    "speed": 180,                       # Excessively high speed
    "signal_distance": 0.5,            # Extremely close signal
    "train_length": 3000,              # Very long train
    "train_speed_limit": 100,          # Below current speed
    "distance_to_next_train": 0.05,     # Dangerously close
    "brake_applied": 0,                # No braking
    "time_to_next_signal": 1,          # Very little time
    "signal_visible": 0,               # Signal not visible
    "signal_status": "RED",            # Worst-case signal
    "direction": "N",             
    "track_id": "A",                  # Rare/unusual track ID
    "weather_condition": "rain"      # Worst-case weather
}])


In [None]:
rare_encoded = preprocessor.transform(rare_input)
if issparse(rare_encoded):
    rare_encoded = rare_encoded.toarray()

predicted_class = best_model.predict(rare_encoded)

print("Predicted Anomaly Class:", predicted_class)

Predicted Anomaly Class: [0]
