In [None]:
# Mount Google Drive inside the Colab runtime; the data and results paths used
# by this notebook live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install catboost

In [6]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Folder on the mounted Drive that holds the input CSV.
file_path = '/content/drive/MyDrive/Master Paper/Data/'
df = pd.read_csv(file_path + 'eu_total_final.csv')

# Single mass feature: rows whose 'transporto_priemones_tipas' is 'K6' take
# 'maksimali_mase_kg'; every other row takes 'nuosava_mase_kg'.
is_k6 = df['transporto_priemones_tipas'].eq('K6')
df['mase'] = np.where(is_k6, df['maksimali_mase_kg'], df['nuosava_mase_kg'])

# Count each 'degalai' value within every 'transporto_priemones_tipas' group.
counts = (
    df.groupby('transporto_priemones_tipas')['degalai']
    .value_counts()
)

# Flatten the grouped counts into ordinary columns; tallies go under 'count'.
counts_df = counts.reset_index(name='count')
print(counts_df)


In [None]:
# Tally how often each 'degalai' value occurs per 'transporto_priemones_tipas'.
# NOTE(review): this appears to repeat the summary printed by the data-loading
# cell — confirm whether both are needed.
fuel_by_type = df.groupby('transporto_priemones_tipas')['degalai']
counts = fuel_by_type.value_counts()

# Turn the indexed counts into a plain table with a 'count' column.
counts_df = counts.reset_index(name='count')

print(counts_df)


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer

# Feature engineering for K6 and K2: rows typed 'K6' use 'maksimali_mase_kg',
# all other rows use 'nuosava_mase_kg'.
# The column is written to `df`, the frame that the feature selection below
# reads from, so this cell also works on a freshly restarted kernel.
df['mase'] = np.where(
    df['transporto_priemones_tipas'] == 'K6',
    df['maksimali_mase_kg'],
    df['nuosava_mase_kg']
)

# Load and preprocess data
features = ['variklio_turis_cm3', 'galia_kw', 'degalai', 'transporto_priemones_tipas', 'mase']
target = 'kuro_sunaudojimas_l100km_org'

df_filtered = df[features + [target]].copy()

# Handle missing values
# NOTE(review): the imputer is fitted on the whole dataset before the
# train/test split, so test rows influence the fill values. Fitting it inside
# the model pipeline would avoid that — confirm whether this is acceptable.
imputer = SimpleImputer(strategy='most_frequent')  # For both numerical and categorical
df_filtered[features] = imputer.fit_transform(df_filtered[features])

# The imputer returns one array for the mixed numeric/text columns, which
# typically leaves the numeric features stored with object dtype. Cast them
# back so downstream code sees real numbers.
df_filtered = df_filtered.astype(
    {'variklio_turis_cm3': float, 'galia_kw': float, 'mase': float}
)

# NOTE(review): missing targets are replaced by the overall target mean, so
# those rows are trained and scored against a synthetic label — confirm intended.
df_filtered[target] = df_filtered[target].fillna(df_filtered[target].mean())

# Split data
# Take an explicit copy so adding the helper column below modifies X only and
# does not trigger pandas' chained-assignment warning.
X = df_filtered[features].copy()
y = df_filtered[target]
# Combine stratification columns into a single column
X['stratify_group'] = X['transporto_priemones_tipas'].astype(str) + "_" + X['degalai'].astype(str)

# Perform stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=['stratify_group']),  # Drop the stratification column from features
    y,
    test_size=0.2,
    random_state=42,
    stratify=X['stratify_group']
)

# Column groups fed to the preprocessing step.
numeric_features = ['variklio_turis_cm3', 'galia_kw', 'mase']
categorical_features = ['degalai', 'transporto_priemones_tipas']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Output folder for metrics and figures; created if it does not exist yet.
result_path = '/content/drive/MyDrive/Master Paper/Results/'
os.makedirs(result_path, exist_ok=True)

# Function for evaluation metrics
def evaluate_model(model, X_test, y_test, name):
    """Score a fitted model on held-out data, print the scores and return them.

    Args:
        model: Object exposing ``predict``.
        X_test: Input passed to ``model.predict``.
        y_test: True target values compared against the predictions.
        name: Label used as the prefix of the printed line.

    Returns:
        Tuple ``(mae, rmse, r2)``.
    """
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print(f"{name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return mae, rmse, r2

# Hyperparameter tuning using GridSearchCV
def tune_model(model, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Candidates are scored with negative mean absolute error using ``cv=5``,
    running on all available cores. The winning parameter set is printed.

    Args:
        model: Estimator or pipeline to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        X_train: Training inputs.
        y_train: Training targets.

    Returns:
        The search's ``best_estimator_``.
    """
    search_options = dict(
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    searcher = GridSearchCV(model, param_grid, **search_options)
    searcher.fit(X_train, y_train)

    print(f"Best Parameters: {searcher.best_params_}")
    return searcher.best_estimator_



def save_model_results(model, model_name, X_test_transformed, y_test, X_test):
    """
    Evaluate the model, save evaluation metrics, and create prediction graphs.

    Parameters
    ----------
    model : object
        Fitted estimator (or pipeline) exposing ``predict``.
    model_name : str
        Label written to the 'Model' column, passed to ``evaluate_model`` and
        used as the sub-folder name under ``result_path``.
    X_test_transformed : array-like or DataFrame
        Input handed to ``model.predict``.
    y_test : array-like
        True target values, in the same row order as ``X_test_transformed``.
    X_test : pandas.DataFrame
        Frame with 'transporto_priemones_tipas' and 'degalai' columns, in the
        same row order; used only to group the plotted points.

    Side effects
    ------------
    Appends one row to 'model_evaluation_metrics.csv' inside ``result_path``
    (calling again appends another row) and writes one PNG per transport type
    into ``result_path/<model_name>/``.
    """
    # Predictions are kept for plotting; evaluate_model runs its own predict
    # call so that it can print and return the metrics.
    y_pred = np.asarray(model.predict(X_test_transformed))
    y_true = np.asarray(y_test)

    # Evaluate the model
    mae, rmse, r2 = evaluate_model(model, X_test_transformed, y_test, model_name)

    # Save evaluation metrics to CSV
    metrics = {
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R²': r2
    }
    metrics_df = pd.DataFrame([metrics])
    metrics_csv_path = os.path.join(result_path, 'model_evaluation_metrics.csv')

    # Always append; the header is written only when the file is created.
    write_header = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header, index=False)

    # Save prediction graphs for each transport type
    model_folder = os.path.join(result_path, model_name)
    os.makedirs(model_folder, exist_ok=True)

    # English legend labels. 'Benzinas' is assumed to be the dataset's spelling
    # for petrol — TODO confirm. Any value not listed here is shown verbatim
    # instead of being mislabelled as "Petrol".
    fuel_labels = {"Dyzelinas": "Diesel", "Benzinas": "Petrol"}

    transport_column = X_test['transporto_priemones_tipas']
    fuel_column = X_test['degalai']

    for transport_type in transport_column.unique():
        # Positional boolean masks keep predictions, targets and features
        # aligned by row order, independent of the frame's index labels.
        in_transport = (transport_column == transport_type).to_numpy()

        fig, ax = plt.subplots(figsize=(8, 6))
        for fuel_type in fuel_column[in_transport].unique():
            in_group = in_transport & (fuel_column == fuel_type).to_numpy()

            # Plot predicted vs. actual
            ax.scatter(
                y_pred[in_group],
                y_true[in_group],
                label=fuel_labels.get(fuel_type, str(fuel_type)),
                alpha=0.5
            )

        ax.set_xlabel("Predicted Fuel Consumption (L/100km)")
        ax.set_ylabel("Actual Fuel Consumption (L/100km)")
        # Fixed axis window; points outside 3-15 are not visible in the figure.
        ax.set_xlim(3, 15)
        ax.set_ylim(3, 15)
        ax.legend(title="Fuel Type")
        ax.grid(True)

        # Save the graph and release the figure
        plt_path = os.path.join(model_folder, f"{transport_type}_predicted_vs_actual.png")
        fig.savefig(plt_path)
        plt.close(fig)

# Linear Regression (no hyperparameters to tune)
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])
lr_pipeline.fit(X_train, y_train)  # Train the model
save_model_results(lr_pipeline, "Linear Regression", X_test, y_test, X_test)

# Regularisation grids are built with np.linspace rather than a float-step
# np.arange: linspace fixes the number of points and hits both endpoints
# exactly, so the grid cannot gain or lose a value through rounding.

# Ridge Regression: alpha = 0.05, 0.10, ..., 1.00
ridge_param_grid = {'model__alpha': np.linspace(0.05, 1.0, 20)}
ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', Ridge())])
ridge_best = tune_model(ridge_pipeline, ridge_param_grid, X_train, y_train)
save_model_results(ridge_best, "Ridge Regression", X_test, y_test, X_test)

# Lasso Regression: same alpha grid as Ridge
lasso_param_grid = {'model__alpha': np.linspace(0.05, 1.0, 20)}
lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', Lasso())])
lasso_best = tune_model(lasso_pipeline, lasso_param_grid, X_train, y_train)
save_model_results(lasso_best, "Lasso Regression", X_test, y_test, X_test)

# ElasticNet Regression: l1_ratio = 0.1, 0.2, ..., 1.0 (must not exceed 1)
elasticnet_param_grid = {
    'model__alpha': np.linspace(0.05, 1.0, 20),
    'model__l1_ratio': np.linspace(0.1, 1.0, 10)
}
elasticnet_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', ElasticNet(max_iter=1000))])
elasticnet_best = tune_model(elasticnet_pipeline, elasticnet_param_grid, X_train, y_train)
save_model_results(elasticnet_best, "ElasticNet Regression", X_test, y_test, X_test)

# Gradient Boosting
gbm_param_grid = {
    'model__n_estimators': [25, 100, 200],
    'model__learning_rate': [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2],
    'model__max_depth': [2, 3, 4]
}
# Seeded like the other tree-based models in this notebook so that repeated
# runs give the same fitted model and metrics.
gbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])
gbm_best = tune_model(gbm_pipeline, gbm_param_grid, X_train, y_train)  # Tune and train
save_model_results(gbm_best, "Gradient Boosting", X_test, y_test, X_test)  # Save results

# XGBoost manual grid search.
# Candidates are ranked by 5-fold cross-validated MAE on the TRAINING data, the
# same criterion tune_model uses for the other models. The test set is touched
# only once, by the final evaluation, so the reported test metrics are not
# biased by hyperparameter selection.
best_params = None
best_score = float('inf')

# One record per candidate; "MAE" holds the mean cross-validated MAE.
xgb_evaluation_metrics = []

for n_estimators in [25, 100, 200, 400]:
    for learning_rate in [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2]:
        for max_depth in [2, 3, 4]:
            # Preprocessing sits inside the pipeline so it is re-fitted on each
            # training fold rather than on the full training set.
            candidate = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('model', xgb.XGBRegressor(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    random_state=42
                ))
            ])
            fold_scores = cross_val_score(
                candidate,
                X_train,
                y_train,
                scoring='neg_mean_absolute_error',
                cv=5,
                n_jobs=-1
            )
            score = -fold_scores.mean()  # scorer returns negated MAE
            xgb_evaluation_metrics.append({
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "max_depth": max_depth,
                "MAE": score
            })
            if score < best_score:
                best_score = score
                best_params = {
                    "n_estimators": n_estimators,
                    "learning_rate": learning_rate,
                    "max_depth": max_depth,
                }

print("Best XGBoost Params:", best_params)

# Fit the preprocessing on the full training set and transform both splits
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Train the Best XGBoost Model
best_xgb_model = xgb.XGBRegressor(
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    random_state=42
)
best_xgb_model.fit(X_train_transformed, y_train)
save_model_results(best_xgb_model, "XGBoost", X_test_transformed, y_test, X_test)


# LightGBM
lgb_param_grid = {
    'model__n_estimators': [25, 100, 200, 400, 1000],
    'model__learning_rate': [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1],
    'model__max_depth': [2, 3, 4, 5]
}
# Seeded for consistency with the other models that fix random_state=42.
lgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(random_state=42))
])
lgb_best = tune_model(lgb_pipeline, lgb_param_grid, X_train, y_train)
save_model_results(lgb_best, "LightGBM", X_test, y_test, X_test)

# CatBoost: grid over iteration count, tree depth and learning rate.
catboost_param_grid = {
    'model__iterations': [50, 100, 200, 400, 1000],
    'model__depth': [2, 3, 4, 6],
    'model__learning_rate': [0.05, 0.1, 0.2, 0.5, 1],
}

# Seeded regressor with training output suppressed.
catboost_model = CatBoostRegressor(random_state=42, silent=True)
catboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', catboost_model),
])

# Tune on the training split, then evaluate and save on the test split.
catboost_best = tune_model(catboost_pipeline, catboost_param_grid, X_train, y_train)
save_model_results(catboost_best, "CatBoost", X_test, y_test, X_test)

# Support Vector Regressor with an RBF kernel: grid over C and epsilon.
svr_param_grid = dict(
    model__C=[10, 100, 200],
    model__epsilon=[0.1, 0.2, 0.5],
)

svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR(kernel='rbf')),
])

# Tune on the training split, then evaluate and save on the test split.
svr_best = tune_model(svr_pipeline, svr_param_grid, X_train, y_train)
save_model_results(svr_best, "SVR", X_test, y_test, X_test)

# Random Forest search space:
#   n_estimators      - number of trees in the forest
#   max_depth         - maximum tree depth (None leaves it unlimited)
#   min_samples_split - minimum samples needed to split an internal node
#   min_samples_leaf  - minimum samples required at a leaf node
rf_param_grid = {
    'model__n_estimators': [200, 400],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 10],
}

# Seeded forest behind the shared preprocessing step.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# Tune on the training split, then evaluate and save the best model.
rf_best = tune_model(rf_pipeline, rf_param_grid, X_train, y_train)
save_model_results(rf_best, "Random Forest", X_test, y_test, X_test)

