<a href="https://colab.research.google.com/github/Anshumaan4197/datasets/blob/main/Copy_of_train_delay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Load the dataset
train = pd.read_csv('/content/drive/MyDrive/Apr22_Jun22_GQD.csv')

# Target column and the raw input columns, defined once so the row filter and
# the feature selection below cannot drift apart.
target = 'NEXT_DELAY'
raw_features = ['TRAIN_NUMBER', 'ID_TRAIN_DEF', 'TRAIN_TYPE', 'DIVISION_CODE', 'SERIAL_NUMBER', 'STATION_CODE',
                'NEXT_STATION', 'MONTH', 'WEEKDAY', 'DELAY_TIME', 'ALWNC', 'DISTANCE']

# Drop rows with missing values in any input column or in the target.
# The target is included because `clip` keeps NaN as NaN and the regressors
# fitted later reject NaN targets, so such rows can never be used for training.
train.dropna(subset=raw_features + [target], inplace=True)
train.reset_index(drop=True, inplace=True)

# Create a new column for delay time minus allowance
train['DELAY_MINUS_ALWNC'] = train['DELAY_TIME'] - train['ALWNC']

# Select relevant columns and target
features = raw_features + ['DELAY_MINUS_ALWNC']
X = train[features]
# Set negative values in the NEXT_DELAY column to 0
y = train[target].clip(lower=0)

# Dummy encoding for categorical features
X = pd.get_dummies(X, columns=['TRAIN_TYPE', 'DIVISION_CODE', 'STATION_CODE', 'NEXT_STATION'])
# Train-test split on the unscaled features. This happens BEFORE scaling so
# that the scaler's mean/variance are estimated from training rows only; the
# same test_size and random_state over the same number of rows yield the same
# row partition as splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

# Preprocessing: StandardScaler for numeric features.
# Fit on the training partition only, then reuse those statistics for the test
# partition, so no information about the test rows leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-only statistics; retained in case later
# cells reference `X_scaled`.
X_scaled = scaler.transform(X)
# Three base regressors that are later combined into an ensemble.
random_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    random_state=48,
)
gradient_boosting = GradientBoostingRegressor(random_state=48)
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.2,
    max_depth=3,
    random_state=48,
)

# Train each regressor on the same training partition, in a fixed order.
for regressor in (random_forest, gradient_boosting, xgb_model):
    regressor.fit(X_train, y_train)

# Per-model predictions on the held-out partition.
y_pred_rf, y_pred_gb, y_pred_xgb = (
    random_forest.predict(X_test),
    gradient_boosting.predict(X_test),
    xgb_model.predict(X_test),
)
# Ensemble predictions using simple averaging
ensemble_pred = (y_pred_rf + y_pred_gb + y_pred_xgb) / 3
# Apply the non-negativity constraint to the ensemble predictions.
# The target was clipped at 0 before training, so a negative prediction can
# never be correct. Floor the predictions themselves rather than zeroing the
# entries where `y_test == 0`: overwriting predictions based on test labels
# leaks the ground truth into the evaluation and inflates every metric below.
ensemble_pred = ensemble_pred.clip(min=0)
# Evaluate ensemble performance
ensemble_r2 = r2_score(y_test, ensemble_pred)
ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
print("Ensemble R2 score (test set):", ensemble_r2)
print("Ensemble Mean Squared Error (test set):", ensemble_mse)
print("Ensemble Mean Absolute Error (test set):", ensemble_mae)