# 1. Problem Information
- **Name:** [**Train delay prediction**](https://platform.olimpiada-ai.ro/en/problems/52)
- **Date:** 12/02/2026
- **Type:** Regression

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# 3. Data preparation

In [2]:
def ProcessData(df):
    df['departure_time'] = df['departure_time'].apply(lambda x : int(x.split(':')[0]))
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
ProcessData(train)
ProcessData(test)
print(train.shape)
train.head(5)

(4000, 12)


Unnamed: 0,SampleID,departure_time,distance_km,avg_speed_kmh,num_stops,weather,weekday,special_events,num_cars,ticket_price,comfort_class,delay_minutes
0,4227,2,788.12,103.94,2,sunny,Fri,0,3,61.463731,intermediate,11
1,4676,5,408.42,96.6,7,sunny,Sat,0,13,125.058439,premium,17
2,800,18,440.24,92.54,1,sunny,Sun,0,12,178.797255,standard,0
3,3671,6,345.01,104.57,5,sunny,Sat,0,11,137.304807,standard,5
4,4193,22,729.77,82.12,7,sunny,Wed,0,6,193.314124,premium,27


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SampleID        4000 non-null   int64  
 1   departure_time  4000 non-null   int64  
 2   distance_km     4000 non-null   float64
 3   avg_speed_kmh   4000 non-null   float64
 4   num_stops       4000 non-null   int64  
 5   weather         4000 non-null   object 
 6   weekday         4000 non-null   object 
 7   special_events  4000 non-null   int64  
 8   num_cars        4000 non-null   int64  
 9   ticket_price    4000 non-null   float64
 10  comfort_class   4000 non-null   object 
 11  delay_minutes   4000 non-null   int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 375.1+ KB


# 4. Models

In [4]:
transformer = make_column_transformer((OneHotEncoder(),['weather','weekday']),(OrdinalEncoder(categories=[['standard','intermediate','premium']]),['comfort_class']),remainder=StandardScaler())
pipeline = make_pipeline(transformer,LinearRegression())
pipeline

0,1,2
,steps,"[('columntransformer', ...), ('linearregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...), ('ordinalencoder', ...)]"
,remainder,StandardScaler()
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['standard', 'intermediate', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [5]:
X = train.iloc[:,1:-1]
Y = train['delay_minutes']
scores = cross_val_score(pipeline,X,Y,cv=5,scoring='neg_mean_squared_error')
print(scores.mean() *-1)

23.311331538595713


In [6]:
pipeline.fit(X,Y)
predictions = pipeline.predict(test.iloc[:,1:])

# 5. Submission

In [7]:
submission = pd.DataFrame({
    "SampleID": test['SampleID'],
    "delay_minutes": predictions,
})

submission.head()

Unnamed: 0,SampleID,delay_minutes
0,1501,28.136765
1,2586,11.14187
2,2653,-1.172785
3,1055,13.477946
4,705,5.331682


In [8]:
submission.to_csv("submission.csv", index=False)