In [2]:
if "google.colab" in str(get_ipython()):
    !pip install mlflow

import getpass
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


# Set up MLflow tracking via DagsHub.
#
# SECURITY: the access token must never be stored in the notebook. It is taken
# from the MLFLOW_TRACKING_PASSWORD environment variable; when that variable is
# not set, the user is prompted for it (input is not echoed).
# NOTE(review): a token was previously committed here in plain text — it should
# be revoked on DagsHub and replaced with a fresh one.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', '2Duffman')
os.environ.setdefault('MLFLOW_TRACKING_PROJECTNAME', 'Symbolic-Regression')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass.getpass('DagsHub access token: ')

# Tracking URI has the form https://dagshub.com/<username>/<project>.mlflow
mlflow.set_tracking_uri(
    'https://dagshub.com/'
    + os.environ['MLFLOW_TRACKING_USERNAME']
    + '/'
    + os.environ['MLFLOW_TRACKING_PROJECTNAME']
    + '.mlflow'
)

In [3]:
# Load the dataset. The CSV location depends on whether the notebook is
# running on Google Colab (Drive path) or elsewhere (working directory).
running_on_colab = "google.colab" in str(get_ipython())
csv_path = 'drive/MyDrive/distance_8.csv' if running_on_colab else "distance_8.csv"
data = pd.read_csv(csv_path)

# Name of the column the model is trained to predict
target = "saldo_final_target"

# Function to check if time is present in the string
def check_time(s):
    """Return `s` with a midnight time appended when it has no time part.

    A string is considered to carry a time component if it contains a ':'.
    Such strings are returned unchanged; any other string gets ' 00:00:00'
    appended.
    """
    has_time = s.find(':') != -1
    return s if has_time else s + ' 00:00:00'

# Build the time-derived features in one chain:
#   1. make every 'delivery_start' entry carry a time part and parse it,
#   2. derive the year, the day of the year and the minutes since midnight,
#   3. encode day and time cyclically with sine/cosine,
#   4. drop the helper columns ('day', 'time'), the raw timestamp and
#      'floor_day_target'.
data = (
    data
    .assign(
        delivery_start=lambda df: pd.to_datetime(
            df['delivery_start'].apply(check_time),
            format='%Y-%m-%d %H:%M:%S',
        )
    )
    .assign(
        year=lambda df: df['delivery_start'].dt.year,
        day=lambda df: df['delivery_start'].dt.dayofyear,
        time=lambda df: df['delivery_start'].dt.hour * 60 + df['delivery_start'].dt.minute,
    )
    .assign(
        day_sin=lambda df: np.sin(2 * np.pi * df['day'] / 365),
        day_cos=lambda df: np.cos(2 * np.pi * df['day'] / 365),
        time_sin=lambda df: np.sin(2 * np.pi * df['time'] / 1440),
        time_cos=lambda df: np.cos(2 * np.pi * df['time'] / 1440),
    )
    .drop(columns=['day', 'time', 'delivery_start', 'floor_day_target'])
)

# Features are every remaining column except the target
X = data.drop(columns=target)
Y = data[target]

# Hold out the last 10% of the rows as the test set (shuffle=False keeps the
# original row order, so no rows are mixed between the two splits)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, shuffle=False
)

In [4]:
# Separate standardisers for the features and for the target
scaler = StandardScaler()
scaler_target = StandardScaler()

# StandardScaler expects 2-D input, so the target is reshaped to one column
Y_train_column = Y_train.values.reshape(-1, 1)
Y_test_column = Y_test.values.reshape(-1, 1)

# Features: fit on the training split only, then reuse the fitted scaler on
# the test split
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Target: same procedure, then collapse the single column back to 1-D
Y_train_scaled = scaler_target.fit_transform(Y_train_column).flatten()
Y_test_scaled = scaler_target.transform(Y_test_column).flatten()

In [7]:
from sklearn.model_selection import ParameterGrid
import time

# Define the parameter grid (11 values of C x 11 values of epsilon)
param_grid = {
    'C': [0.6,0.62,0.64,0.66,0.68,0.7,0.72,0.74,0.76,0.78,0.8],
    'epsilon': [1.4,1.45,1.5,1.55,1.6,1.65,1.7,1.75,1.8,2,4],
    'kernel': ['rbf'],
    'tol': [0.9],
    'max_iter': [2000]
}

# One record (parameters + score) is collected per parameter combination
results = []

# Expand the grid into the list of concrete parameter combinations
param_grid_instance = ParameterGrid(param_grid)
print('Checking {} parameter combinations.'.format(len(param_grid_instance)))

# Evaluate every combination on the held-out test split
for params in param_grid_instance:
    # Create and fit the model; perf_counter is a monotonic clock, so the
    # measured interval is not affected by system clock adjustments
    start_time = time.perf_counter()
    model = SVR(**params)
    model.fit(X_train_scaled, Y_train_scaled)
    fitting_time = time.perf_counter() - start_time

    # Predict in the scaled space
    Y_pred_scaled = model.predict(X_test_scaled)

    # Transform the predictions back to the original target space and flatten
    # them to 1-D so they have the same shape as Y_test
    Y_pred = scaler_target.inverse_transform(Y_pred_scaled.reshape(-1, 1)).ravel()

    # RMSE in original units. Computed as sqrt(MSE) because the
    # `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    score = np.sqrt(mean_squared_error(Y_test, Y_pred))

    # Log this combination as its own MLflow run
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_param('scaled', True)
        mlflow.log_metric('rmse', score)
        mlflow.log_metric('fitting_time', fitting_time)

    # Record the result for this combination
    results.append({**params, 'score': score})

    # Rebuild the table and rewrite the CSV after every combination, so the
    # file always contains all results obtained so far
    results_df = pd.DataFrame(results)
    results_df.to_csv('grid_search_results.csv', index=False)

Checking 2 parameter combinations.


