<a href="https://colab.research.google.com/github/C-Kabilan/Train-Delay-Prediction/blob/main/Train_delay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load dataset
file_path = "/content/train delay data.csv"
df = pd.read_csv(file_path)

# Display dataset info
print("Dataset Columns:", df.columns)  # Check the actual column names
print("\nDataset Head:\n", df.head())

# Identify the actual target column name: the first column whose name
# contains "delay" (case-insensitive)
delay_column = next((col for col in df.columns if 'delay' in col.lower()), None)

if delay_column is None:
    raise ValueError("Delay column not found in the dataset. Please check the column names.")

print(f"Using '{delay_column}' as the target column.")

# Handle missing values: forward fill, then back fill so that gaps in the
# first rows (which have no earlier value to copy) do not survive as NaN
df = df.ffill().bfill()

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)

# Apply Label Encoding to categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for future inverse transformation

# Select relevant features (everything except the target and identifier columns)
excluded_columns = {delay_column.lower(), 'id', 'train_number'}
features = [col for col in df.columns if col.lower() not in excluded_columns]

# Convert data into sequences for LSTM
sequence_length = 10  # Use past 10 timesteps for prediction

# Each window starts at row i and predicts the delay at row i + sequence_length
n_windows = len(df) - sequence_length
if n_windows < 2:
    raise ValueError(
        f"Need at least {sequence_length + 2} rows to build a train and a test "
        f"window, got {len(df)}."
    )

# Split the window start positions chronologically (no shuffling).
# Consecutive windows overlap in all but one row, so a shuffled split would
# put near-copies of the training windows into the test set.
window_starts = np.arange(n_windows)
train_starts, test_starts = train_test_split(window_starts, test_size=0.2, shuffle=False)

# Feature scaling: fit the scaler only on the rows read by training windows,
# so the min/max of the test period do not leak into the training inputs
scaler = MinMaxScaler()
train_rows = train_starts[-1] + sequence_length
scaler.fit(df[features].iloc[:train_rows])
df_scaled = scaler.transform(df[features])

X = np.array([df_scaled[start:start + sequence_length] for start in window_starts])
y = df[delay_column].to_numpy()[sequence_length:]

# Split data into train and test sets
X_train, X_test = X[train_starts], X[test_starts]
y_train, y_test = y[train_starts], y[test_starts]

# Build LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, X.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model (the test split is also used to monitor validation loss)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Plot training history
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.show()

# Predictions
y_pred = model.predict(X_test)

# Evaluate Model with Corrected MAE Calculation
# Flatten the (n, 1) predictions to (n,) so they line up element-wise with y_test
mae = tf.keras.metrics.MeanAbsoluteError()
mae.update_state(y_test, y_pred.ravel())
print("Mean Absolute Error (MAE):", mae.result().numpy())


FileNotFoundError: [Errno 2] No such file or directory: '/content/train delay data.csv'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
file_path = "/content/train delay data.csv"
df = pd.read_csv(file_path)

# Display dataset info
print("Dataset Columns:", df.columns)  # Check the actual column names
print("\nDataset Head:\n", df.head())

# Identify the actual target column name: the first column whose name
# contains "delay" (case-insensitive)
delay_column = next((col for col in df.columns if 'delay' in col.lower()), None)

if delay_column is None:
    raise ValueError("Delay column not found in the dataset. Please check the column names.")

print(f"Using '{delay_column}' as the target column.")

# Handle missing values: forward fill, then back fill so that gaps in the
# first rows (which have no earlier value to copy) do not survive as NaN
df = df.ffill().bfill()

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)

# Apply Label Encoding to categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for future inverse transformation

# Select relevant features (everything except the target and identifier columns)
excluded_columns = {delay_column.lower(), 'id', 'train_number'}
features = [col for col in df.columns if col.lower() not in excluded_columns]

# Convert data into sequences for LSTM
sequence_length = 10  # Use past 10 timesteps for prediction

# Each window starts at row i and predicts the delay at row i + sequence_length
n_windows = len(df) - sequence_length
if n_windows < 2:
    raise ValueError(
        f"Need at least {sequence_length + 2} rows to build a train and a test "
        f"window, got {len(df)}."
    )

# Split the window start positions chronologically (no shuffling).
# Consecutive windows overlap in all but one row, so a shuffled split would
# put near-copies of the training windows into the test set.
window_starts = np.arange(n_windows)
train_starts, test_starts = train_test_split(window_starts, test_size=0.2, shuffle=False)

# Feature scaling: fit the scaler only on the rows read by training windows,
# so the min/max of the test period do not leak into the training inputs
scaler = MinMaxScaler()
train_rows = train_starts[-1] + sequence_length
scaler.fit(df[features].iloc[:train_rows])
df_scaled = scaler.transform(df[features])

X = np.array([df_scaled[start:start + sequence_length] for start in window_starts])
y = df[delay_column].to_numpy()[sequence_length:]

# Split data into train and test sets
X_train, X_test = X[train_starts], X[test_starts]
y_train, y_test = y[train_starts], y[test_starts]

# Build LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, X.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model (the test split is also used to monitor validation loss)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Plot training history (Loss Curve)
plt.figure(figsize=(10,5))
plt.plot(history.history['loss'], label='Train Loss', linewidth=2)
plt.plot(history.history['val_loss'], label='Val Loss', linewidth=2)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid()
plt.show()

# Predictions
y_pred = model.predict(X_test)

# Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Plot Actual vs Predicted Delays (first 100 test samples, in time order)
plt.figure(figsize=(10,5))
plt.plot(y_test[:100], label='Actual', linewidth=2, marker='o')
plt.plot(y_pred[:100], label='Predicted', linewidth=2, marker='x')
plt.xlabel('Sample Index')
plt.ylabel('Delay Time')
plt.title('Actual vs. Predicted Delays')
plt.legend()
plt.grid()
plt.show()

# Plot Error Distribution
# Flatten the (n, 1) predictions so the subtraction is element-wise
errors = y_test - y_pred.flatten()
plt.figure(figsize=(10,5))
sns.histplot(errors, bins=25, kde=True)
plt.xlabel('Prediction Error')
plt.title('Error Distribution')
plt.grid()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/content/train delay data.csv'