In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings (the situation read_csv reports with a DtypeWarning).
data = pd.read_csv("merged_output.csv", low_memory=False)

# Clean the 'EuroCost' column: remove commas from string values, then convert
# to numeric. Anything that still cannot be parsed becomes NaN (errors='coerce').
data['EuroCost'] = pd.to_numeric(
    data['EuroCost'].replace({',': ''}, regex=True),
    errors='coerce',
)

# Drop rows where 'EuroCost' is still NaN after conversion, since rows without
# a target value cannot be used for training or evaluation.
data_clean = data.dropna(subset=['EuroCost'])

# Column groups used by the preprocessing steps
numerical_features = ['ReportYear', 'ReportMonth']
categorical_features = ['OIPID', 'SupplierID', 'NRENID', 'Country', 'ServiceType', 'ConsumptionType']

# Model inputs: numerical columns first, then the categorical ones
features = numerical_features + categorical_features
target = 'EuroCost'

# Feature matrix (X) and target vector (y) taken from the cleaned data
X = data_clean.loc[:, features]
y = data_clean.loc[:, target]

# Numerical columns: fill missing values with the column mean
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing: route each column group to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

# Full prediction pipeline: preprocessing followed by a random forest
# (fixed random_state so repeated runs build the same forest)
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training split, then predict the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_test)

# Root mean squared error: square root of the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the RMSE
print("Root Mean Squared Error:", rmse)

  data = pd.read_csv("merged_output.csv")


Root Mean Squared Error: 23375.818357787346
