Run this cell once when initializing the notebook: it installs the required packages and configures MLflow tracking.

In [2]:
!pip install mlflow --quiet
!pip install feyn

import mlflow
import os
from getpass import getpass

# Export the DAGsHub user name, access token and project name as environment
# variables. The token is prompted for interactively so it never appears in the
# notebook source.
os.environ['MLFLOW_TRACKING_USERNAME'] = '2Duffman'
os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your DAGsHub access token: ')
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = 'Symbolic-Regression'

# Point MLflow at https://dagshub.com/<username>/<projectname>.mlflow
mlflow.set_tracking_uri(
    'https://dagshub.com/{}/{}.mlflow'.format(
        os.environ['MLFLOW_TRACKING_USERNAME'],
        os.environ['MLFLOW_TRACKING_PROJECTNAME'],
    )
)




In [3]:
#Import the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np

This cell loads the data, appends the missing `00:00:00` time to `delivery_start` values that contain only a date, and then converts the column into a `year` column plus four cyclic features: `day_sin`, `day_cos`, `time_sin`, and `time_cos`.

In [4]:

# Name of the target column
target = "saldo_final_target"

# Load the dataset from the CSV file
data = pd.read_csv("distance_8.csv")

def check_time(s):
    """Make sure a timestamp string carries a time-of-day part.

    Parameters
    ----------
    s : str
        Timestamp text, e.g. '2020-07-31' or '2020-07-31 01:15:00'.

    Returns
    -------
    str
        `s` unchanged when it already contains a ':'; otherwise `s` with
        ' 00:00:00' appended.
    """
    has_time = s.find(':') != -1
    return s if has_time else s + ' 00:00:00'

# Give every 'delivery_start' entry a time component, then parse the column
# into datetimes using one fixed format
start = pd.to_datetime(
    data['delivery_start'].apply(check_time),
    format='%Y-%m-%d %H:%M:%S',
)
data['delivery_start'] = start

# Calendar year of the delivery
data['year'] = start.dt.year

# Day of the year and minutes since midnight (temporary helper columns)
data['day'] = start.dt.dayofyear
data['time'] = start.dt.hour * 60 + start.dt.minute

# Map both helpers onto the unit circle: period 365 for the day of the year,
# period 1440 (minutes per day) for the time of day.
# NOTE(review): with a fixed 365-day period, day 366 of a leap year gets the
# same encoding as day 1 -- confirm this is acceptable.
day_angle = 2 * np.pi * data['day'] / 365
time_angle = 2 * np.pi * data['time'] / 1440
data['day_sin'] = np.sin(day_angle)
data['day_cos'] = np.cos(day_angle)
data['time_sin'] = np.sin(time_angle)
data['time_cos'] = np.cos(time_angle)

# Remove the helper columns, the raw timestamp and 'floor_day_target'
data = data.drop(columns=['day', 'time', 'delivery_start', 'floor_day_target'])
data.head()

Unnamed: 0,da_price_target,holiday_or_weekend_target,max_daily_temperature_target,max_daily_temperature_week_target,max_daily_temperature_hist_target,max_daily_temperature_hist_week_target,max_daily_temperature_hist_smooth_target,min_daily_temperature_target,min_daily_temperature_week_target,min_daily_temperature_hist_target,...,mvv_duration_lead2,mvv_least_quantity_lead2,mvv_duration_lead1,mvv_least_quantity_lead1,traffic_light_intensity_lead1,year,day_sin,day_cos,time_sin,time_cos
0,1987,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.442289,0.896873
1,3020,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.5,0.866025
2,3017,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.55557,0.83147
3,2908,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.608761,0.793353
4,2390,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.659346,0.75184


In [17]:
# Separate the features and the target variable
X = data.drop(target, axis=1)
y = data[target]

# Perform PCA on the features; n_components='mle' asks scikit-learn to
# estimate the number of components to keep.
# NOTE(review): the PCA is fitted on every row, including rows that are later
# held out as the test set, so test information leaks into the projection.
n_components = 'mle'
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Combine the PCA-transformed features and the target variable back into a
# DataFrame. The component frame reuses X's index because pd.concat(axis=1)
# aligns rows by index label: with a fresh RangeIndex, a non-default index on
# `data` would misalign the rows and introduce NaNs.
data_pca = pd.concat([pd.DataFrame(data=X_pca, index=X.index), y], axis=1)
data_pca.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,113,114,115,116,117,118,119,120,121,saldo_final_target
0,3486326.0,-7186012.0,-147283.596621,1865485.0,-432081.0,-342799.878659,154871.23537,-182653.801273,983521.3,82632.029323,...,0.004816,-0.002424,-0.01338,-0.02466,0.007765,0.004544,0.001822,-0.002732,0.000366,-285675
1,4491714.0,-6797842.0,-112784.343729,1329884.0,-678628.9,-50799.243445,162126.91069,-186962.049484,810972.3,266867.63983,...,-0.000572,-0.001315,0.004324,-0.000907,-0.000476,-0.000358,-0.000618,-0.000131,1.2e-05,-371020
2,4708494.0,-6713760.0,-105915.219489,1170155.0,-719524.9,72384.419048,203495.433119,-241869.317212,1168662.0,213769.703643,...,-0.000343,-0.000489,0.004458,-0.000913,-0.000577,-0.000436,-0.000672,-0.000109,2.2e-05,-423057
3,5320637.0,-6477740.0,-86412.752391,959570.2,-1085037.0,233875.814855,196543.665461,-213622.019401,890092.8,388195.043558,...,0.000161,-0.000593,0.004372,-0.000765,-0.000592,-0.000403,-0.000744,-7.4e-05,1.8e-05,-127568
4,5511956.0,-6407266.0,-71835.177867,734556.3,-1156317.0,285642.110545,124767.909484,-138603.474949,852538.7,-222147.072131,...,0.000356,-0.00152,0.004248,-0.000793,-0.000618,-0.000484,-0.000749,-5.9e-05,1.7e-05,-130131


In [18]:
from sklearn.linear_model import LinearRegression
# Split the data into train and test sets. shuffle=False keeps the original
# row order, so the last 20% of the rows form the test set.
train, test = train_test_split(data_pca, test_size=0.2, shuffle=False)
X_train, y_train = train.drop(target, axis=1), train[target]
X_test, y_test = test.drop(target, axis=1), test[target]

# Create a linear regression model
model_name = "Linear Regressor on PCA data"
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate on the test set: root-mean-squared error and R^2.
# The square root is taken explicitly because mean_squared_error's `squared`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_squared = model.score(X_test, y_test)

# Log the PCA setting, both metrics and the fitted model to MLflow
with mlflow.start_run():
    mlflow.log_param('PCA_components', n_components)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r_squared', r_squared)
    mlflow.sklearn.log_model(model, model_name)
print(f'RMSE of Linear Regressor: {rmse}')




RMSE of Linear Regressor: 452170.376889089
