This only needs to be run once, when initializing the notebook.

In [None]:
# Colab-specific helper module; this import only works inside a Google Colab runtime.
from google.colab import files
# Ask the user to upload files into the Colab session and keep the returned result in `uploaded`.
# NOTE(review): presumably this is how distance_8.csv (read by a later cell) gets into the session — confirm.
uploaded = files.upload()

In [None]:
!pip install mlflow --quiet
!pip install feyn

import mlflow
import os
from getpass import getpass

os.environ['MLFLOW_TRACKING_USERNAME'] = '2Duffman'
os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your DAGsHub access token: ')
os.environ['MLFLOW_TRACKING_PROJECTNAME'] = 'Symbolic-Regression'

mlflow.set_tracking_uri(f'https://dagshub.com/' + os.environ['MLFLOW_TRACKING_USERNAME']
                        + '/' + os.environ['MLFLOW_TRACKING_PROJECTNAME'] + '.mlflow')


Enter your DAGsHub access token: ··········


In [None]:
#Import the required libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

This cell loads the data, appends the missing `00:00:00` time to date-only values in the `delivery_start` column, and then converts that column into a `year` feature plus four cyclic features: `day_sin`, `day_cos`, `time_sin`, and `time_cos`.

In [None]:

# Read the data
# The path is relative, so distance_8.csv must be in the notebook's current working directory.
data = pd.read_csv("distance_8.csv")

# Define the target column
# Name of the column that later cells separate from the features and use as the prediction target.
target = "saldo_final_target"

# Make sure a timestamp string carries a time-of-day part
def check_time(s):
    """Return ``s`` with ``' 00:00:00'`` appended when it contains no colon.

    A string that already contains a ``:`` anywhere is returned unchanged.
    """
    has_time = s.find(':') != -1
    return s if has_time else s + ' 00:00:00'

# Pad date-only strings with a midnight time, then parse the column as datetimes
delivery_ts = pd.to_datetime(
    data['delivery_start'].apply(check_time),
    format='%Y-%m-%d %H:%M:%S',
)
data['delivery_start'] = delivery_ts

# Calendar year of the delivery
data['year'] = delivery_ts.dt.year

# Helper columns: day of the year and minutes since midnight (both dropped again below)
data['day'] = delivery_ts.dt.dayofyear
data['time'] = delivery_ts.dt.hour * 60 + delivery_ts.dt.minute

# Map both helpers onto a circle so the end of a period sits next to its start.
# NOTE(review): with a fixed period of 365, day 366 of a leap year gets the same angle as day 1.
day_angle = 2 * np.pi * data['day'] / 365
time_angle = 2 * np.pi * data['time'] / 1440
data['day_sin'] = np.sin(day_angle)
data['day_cos'] = np.cos(day_angle)
data['time_sin'] = np.sin(time_angle)
data['time_cos'] = np.cos(time_angle)

# Remove the helper columns, the raw timestamp and 'floor_day_target'
data = data.drop(columns=['day', 'time', 'delivery_start', 'floor_day_target'])
data.head()

Unnamed: 0,da_price_target,holiday_or_weekend_target,max_daily_temperature_target,max_daily_temperature_week_target,max_daily_temperature_hist_target,max_daily_temperature_hist_week_target,max_daily_temperature_hist_smooth_target,min_daily_temperature_target,min_daily_temperature_week_target,min_daily_temperature_hist_target,...,mvv_duration_lead2,mvv_least_quantity_lead2,mvv_duration_lead1,mvv_least_quantity_lead1,traffic_light_intensity_lead1,year,day_sin,day_cos,time_sin,time_cos
0,1987,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.442289,0.896873
1,3020,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.5,0.866025
2,3017,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.55557,0.83147
3,2908,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.608761,0.793353
4,2390,0.5,24.51,20.85,22.09,21.66,20.98,23.55,19.98,18.9,...,0.0,0,0.0,0,0.0,2020,-0.516062,-0.856551,0.659346,0.75184


In [None]:
# Separate the features and the target variable
X = data.drop(target, axis=1)
y = data[target]

# Perform PCA on the features, keeping the first two principal components.
# NOTE(review): the features are not standardised before PCA, so columns with the
# largest variance will dominate the components — confirm this is intended.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Combine the PCA-transformed features and the target variable back into a DataFrame.
# fit_transform returns a plain array, so the new frame is given X's index explicitly;
# otherwise concat(axis=1) would align a fresh 0..n-1 index against y's index and
# produce misaligned rows / NaNs whenever `data` does not have a default index.
# NOTE(review): the training cell below splits `data`, not `data_pca` — confirm whether
# the PCA features were meant to be used there.
data_pca = pd.concat([pd.DataFrame(data=X_pca, index=X.index), y], axis=1)
data_pca.head()


Unnamed: 0,0,1,saldo_final_target
0,3486326.0,-7186012.0,-285675
1,4491714.0,-6797842.0,-371020
2,4708494.0,-6713760.0,-423057
3,5320637.0,-6477740.0,-127568
4,5511956.0,-6407266.0,-130131


In [None]:
# Split the data into train and test sets.
# shuffle=False keeps the row order, so the last 20% of the rows form the test set.
train, test = train_test_split(data, test_size=0.2, shuffle=False)

X_train, y_train = train.drop(target, axis=1), train[target]
X_test, y_test = test.drop(target, axis=1), test[target]

# The target holds continuous numeric values (e.g. -285675), so this is a regression
# problem. A classifier would treat every distinct target value as its own class, which
# is both wrong and extremely slow to fit. SGD is also sensitive to feature scale, so the
# features are standardised first. random_state makes the run reproducible.
model = make_pipeline(
    StandardScaler(),
    SGDRegressor(loss='squared_error', max_iter=1000, tol=1e-3, random_state=42),
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate with regression metrics. accuracy_score is not defined for continuous
# predictions, so R^2 is reported instead. The RMSE is computed as sqrt(MSE) rather than
# via the `squared=False` argument, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'RMSE of SGD Regressor: {rmse}')
print(f'R^2 of SGD Regressor: {r2}')

KeyboardInterrupt: 