In [1]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("bhargavdvs2001/Group12_Prog_DB_project")

# Upload file
s3.upload_file(
    Bucket="Group12_Prog_DB_project",  # name of the repo
    Filename="C:/UB/EAS503-2/twitchdata-update.csv",  # local path of file to upload
    Key="remote.csv",  # remote path where to upload the file
)


In [2]:
import os
import sqlite3
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
import seaborn as sns
import tk
import PyQt5
import matplotlib
matplotlib.use('TkAgg')  # or 'Qt5Agg', 'WebAgg'
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
encoder = OneHotEncoder(handle_unknown = "ignore")
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lazypredict.Supervised import LazyRegressor
from lightgbm import LGBMRegressor 
import joblib
import mlflow
import dagshub



In [3]:
# Connect to SQLite database
conn = sqlite3.connect('twitch_data.db')
cursor = conn.cursor()

# Create tables
cursor.execute('''
CREATE TABLE IF NOT EXISTS Channels (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    Channel TEXT NOT NULL,
    Language TEXT,
    Partnered BOOLEAN,
    Mature BOOLEAN
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS ChannelStats (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    channel_id INTEGER,
    WatchTimeMinutes INTEGER,
    StreamTimeMinutes INTEGER,
    PeakViewers INTEGER,
    AverageViewers INTEGER,
    Followers INTEGER,
    FollowersGained INTEGER,
    ViewsGained INTEGER,
    FOREIGN KEY (channel_id) REFERENCES Channels(id)
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS FollowersParam (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    Channel TEXT NOT NULL,
    Language TEXT,
    WatchTimeMinutes INTEGER,
    StreamTimeMinutes INTEGER,
    ViewsGained INTEGER,
    Mature BOOLEAN
)
''')

conn.commit()


In [4]:
# Read data from CSV and insert into tables
with open('C:/UB/EAS503-2/twitchdata-update.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Insert into Channels table
        cursor.execute('''
        INSERT INTO Channels (Channel, Language, Partnered, Mature)
        VALUES (?, ?, ?, ?)
        ''', (row['Channel'], row['Language'], row['Partnered'], row['Mature']))
        
        # Get the last inserted channel_id
        channel_id = cursor.lastrowid
        
        # Insert into ChannelStats table
        cursor.execute('''
        INSERT INTO ChannelStats (channel_id, WatchTimeMinutes, StreamTimeMinutes, PeakViewers, AverageViewers, Followers, FollowersGained, ViewsGained)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (channel_id, row['Watch time(Minutes)'], row['Stream time(minutes)'], row['Peak viewers'], row['Average viewers'], row['Followers'], row['Followers Gained'], row['Views gained']))
        
        #Insert into FollowersParam
        cursor.execute('''
        INSERT INTO FollowersParam (Channel, Language, WatchTimeMinutes, StreamTimeMinutes, ViewsGained, Mature)
        VALUES (?, ?, ?, ?, ?, ?)
        ''', (row['Channel'], row['Language'], row['Watch time(Minutes)'], row['Stream time(minutes)'], row['Views gained'], row['Mature']))
conn.commit()
conn.close()

In [5]:

# Connect to SQLite database
conn = sqlite3.connect('twitch_data.db')

# SQL query to join tables
query = '''
SELECT
    Channels.Channel,
    Channels.Language,
    Channels.Partnered,
    Channels.Mature,
    ChannelStats.WatchTimeMinutes,
    ChannelStats.StreamTimeMinutes,
    ChannelStats.PeakViewers,
    ChannelStats.AverageViewers,
    ChannelStats.Followers,
    ChannelStats.FollowersGained,
    ChannelStats.ViewsGained
FROM
    Channels
JOIN
    ChannelStats
ON
    Channels.id = ChannelStats.channel_id
'''

# Load data into Pandas DataFrame
df = pd.read_sql_query(query, conn)
conn.close()

# Display first few rows of the DataFrame
df.head()


Unnamed: 0,Channel,Language,Partnered,Mature,WatchTimeMinutes,StreamTimeMinutes,PeakViewers,AverageViewers,Followers,FollowersGained,ViewsGained
0,xQcOW,English,True,False,6196161750,215250,222720,27716,3246298,1734810,93036735
1,summit1g,English,True,False,6091677300,211845,310998,25610,5310163,1370184,89705964
2,Gaules,Portuguese,True,True,5644590915,515280,387315,10976,1767635,1023779,102611607
3,ESL_CSGO,English,True,False,3970318140,517740,300575,7714,3944850,703986,106546942
4,Tfue,English,True,False,3671000070,123660,285644,29602,8938903,2068424,78998587


In [6]:
# Check for class imbalance
print(df['Partnered'].value_counts())

# Perform train/test split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Partnered'], random_state=42)

# Verify stratification
print(train_df['Partnered'].value_counts(normalize=True))
print(test_df['Partnered'].value_counts(normalize=True))
print(train_df)

Partnered
True     15650
False      352
Name: count, dtype: int64
Partnered
True    0.98
False   0.02
Name: proportion, dtype: float64
Partnered
True    0.98
False   0.02
Name: proportion, dtype: float64
             Channel    Language Partnered Mature  WatchTimeMinutes  \
4102          Baiano  Portuguese      True  False         859718520   
11865        Arcadum     English      True  False         142213980   
15041     dota2mc_ru     Russian      True  False        1464683175   
12560         qSnake     Russian      True  False         212541900   
1710         venruki     English      True  False         170464680   
...              ...         ...       ...    ...               ...   
4039        nl_Kripp     English      True  False        1470897720   
14133  allkeyshop_tv     English      True  False         663185955   
1295        Homyatol     Italian      True   True         378734925   
315      Distortion2     English      True   True         354144315   
14791   강퀴 (kan

In [7]:

# Generate profile report
profile = ProfileReport(train_df, title="Training Data Report", explorative=True)
profile.to_notebook_iframe()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Categorize data
categorical_features = df.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Identify null values
null_values = train_df.isnull().sum()

# Check data types
data_types = train_df.dtypes

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)
print("Null Values:", null_values)
print("Data Types:", data_types)


Categorical Features: ['Channel', 'Language', 'Partnered', 'Mature']
Numerical Features: ['WatchTimeMinutes', 'StreamTimeMinutes', 'PeakViewers', 'AverageViewers', 'Followers', 'FollowersGained', 'ViewsGained']
Null Values: Channel              0
Language             0
Partnered            0
Mature               0
WatchTimeMinutes     0
StreamTimeMinutes    0
PeakViewers          0
AverageViewers       0
Followers            0
FollowersGained      0
ViewsGained          0
dtype: int64
Data Types: Channel              object
Language             object
Partnered            object
Mature               object
WatchTimeMinutes      int64
StreamTimeMinutes     int64
PeakViewers           int64
AverageViewers        int64
Followers             int64
FollowersGained       int64
ViewsGained           int64
dtype: object


In [9]:

# Correlation heatmap for numerical features
plt.figure(figsize=(10, 8))
correlation_matrix = train_df[numerical_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.show()


In [10]:

train_df['Partnered'] = train_df['Partnered'].astype('category')
train_df['Mature'] = train_df['Mature'].astype('category')

# Violin plots for categorical features
for feature in ['Language', 'Partnered', 'Mature']:
    plt.figure(figsize=(10, 6))
    sns.violinplot(x=train_df[feature], y=train_df['AverageViewers'])
    plt.title(f'Violin Plot of {feature} vs AverageViewers')
    plt.show()


In [11]:
for feature in train_df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(train_df[feature], kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()


In [19]:
dagshub.init("Group12_Prog_DB_project", "bhargavdvs2001", mlflow=True)
mlflow.start_run()


<ActiveRun: >

In [20]:
# Set environment variables for Dagshub authentication
os.environ['MLFLOW_TRACKING_USERNAME'] = 'bhargavdvs2001'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '15a3abd42c2e09e5316f74aa6358e392e4a25e34'

# Set the MLflow tracking URI to point to Dagshub
mlflow.set_tracking_uri("https://dagshub.com/bhargavdvs2001/Group12_Prog_DB_project.mlflow")

# Set your experiment name
experiment_name = "twitch_regression_experiment"
mlflow.set_experiment(experiment_name)

class DataPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])
        self.categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', self.numeric_transformer, numerical_features),
                ('cat', self.categorical_transformer, categorical_features)
            ]
        )
        
    def fit(self, X, y=None):
        self.preprocessor.fit(X)
        return self
    
    def transform(self, X):
        return self.preprocessor.transform(X)


'''X_train = preprocessor.fit_transform(train_df)
y_train = train_df['AverageViewers']
X_test = test_df.drop(columns=['AverageViewers'])
y_test = test_df['AverageViewers']'''

categorical_features = ['Channel', 'Language', 'Partnered', 'Mature']
numerical_features = ['WatchTimeMinutes', 'StreamTimeMinutes', 'PeakViewers', 'Followers', 'FollowersGained', 'ViewsGained']

# Convert 'Partnered' and 'Mature' to categorical
df['Partnered'] = df['Partnered'].astype('category')
df['Mature'] = df['Mature'].astype('category')

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Partnered'], random_state=42)
X_train = train_df.drop(columns=['AverageViewers'])
y_train = train_df['AverageViewers']
X_test = test_df.drop(columns=['AverageViewers'])
y_test = test_df['AverageViewers']

# Initialize the data preprocessor
preprocessor = DataPreprocessor()



In [33]:
# Define models and parameter grids for hyperparameter tuning
models_params = {
    'LinearRegression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'regressor__alpha': [0.1, 1.0, 10.0]}),
    'Lasso': (Lasso(), {'regressor__alpha': [0.1, 1.0, 10.0]}),
    'RandomForestRegressor': (RandomForestRegressor(), {'regressor__n_estimators': [50, 100, 200], 'regressor__max_depth': [10, 20, 30]}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(), {'regressor__n_estimators': [50, 100, 200], 'regressor__learning_rate': [0.01, 0.1, 0.2]}),
    'SVR': (SVR(), {'regressor__C': [0.1, 1.0, 10.0], 'regressor__gamma': [0.01, 0.1, 1.0]})
}

# Train and log baseline models
for model_name, (model, params) in models_params.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # Perform GridSearchCV if parameters are defined
    if params:
        grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        best_model = pipeline.fit(X_train, y_train)
        best_params = {}
    
    # Predictions and metrics
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    # Log metrics to MLflow
    with mlflow.start_run():
        mlflow.log_param("model", model_name)
        mlflow.log_params(best_params)
        mlflow.log_metric("mean_absolute_error", mae)
        mlflow.log_metric("mean_squared_error", mse)


2024/05/17 07:32:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '296aaa5b75604cc0aa2dde28899b466b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/17 07:37:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f12f191c5b194a1cad41e3206ed86c56', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/17 07:43:17 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/05/17 07:43:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0b401d6cb6ac4cae8cfd309331f36a53', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/17 07:50:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/05/17 07:51:04 INF

In [34]:
mlflow.autolog()

2024/05/17 08:48:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/05/17 08:48:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/05/17 08:48:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2024/05/17 08:48:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [36]:
# Set up the tracking URI
mlflow.set_tracking_uri("https://dagshub.com/bhargavdvs2001/Group12_Prog_DB_project.mlflow")

# Fetch the experiment
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("twitch_regression_experiment")
experiment_id = experiment.experiment_id

# Fetch all runs in the experiment
runs = client.search_runs(experiment_ids=[experiment_id])

In [37]:
# Feature engineering with PCA
for n_components in [2, 5, 10]:
    pca = PCA(n_components=n_components, svd_solver='arpack')
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', pca),
        ('regressor', LinearRegression())
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    with mlflow.start_run():
        mlflow.log_param("model", f"LinearRegression with PCA (n_components={n_components})")
        mlflow.log_metric("mean_absolute_error", mae)
        mlflow.log_metric("mean_squared_error", mse)

2024/05/17 08:48:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '49f153010a374f5eb11f0cc71c35f7fa', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/17 09:00:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a064362a39e74bb192d18f28cb9f0e18', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/05/17 09:05:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd7b22b0e17f24acda0ca4e2e72a8ffc7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [38]:

# Fetch the experiment
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("twitch_regression_experiment")
experiment_id = experiment.experiment_id

# Fetch all runs in the experiment
runs = client.search_runs(experiment_ids=[experiment_id])

# Extract metrics from runs
metrics = []
for run in runs:
    # Using get method with defaults to avoid KeyError
    model_name = run.data.params.get("model", "Unknown Model")
    mae = run.data.metrics.get("mean_absolute_error", float('inf'))  # Use inf to indicate missing data
    mse = run.data.metrics.get("mean_squared_error", float('inf'))
    
    run_data = {
        "model": model_name,
        "mean_absolute_error": mae,
        "mean_squared_error": mse
    }
    metrics.append(run_data)

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics)

if not metrics_df.empty:
    # Plot MAE comparison
    plt.figure(figsize=(12, 6))
    sns.barplot(data=metrics_df, x="model", y="mean_absolute_error")
    plt.title("Mean Absolute Error Comparison")
    plt.xticks(rotation=45)
    plt.show()

    # Plot MSE comparison
    plt.figure(figsize=(12, 6))
    sns.barplot(data=metrics_df, x="model", y="mean_squared_error")
    plt.title("Mean Squared Error Comparison")
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No data available for plotting.")


In [40]:
# Check for the best model based on MAE
best_model_mae = metrics_df.loc[metrics_df['mean_absolute_error'].idxmin()]
print("Best model based on MAE:\n", best_model_mae)

# Check for the best model based on MSE
best_model_mse = metrics_df.loc[metrics_df['mean_squared_error'].idxmin()]
print("Best model based on MSE:\n", best_model_mse)


Best model based on MAE:
 model                  DecisionTreeRegressor
mean_absolute_error                     0.00
mean_squared_error                      0.00
Name: 50, dtype: object
Best model based on MSE:
 model                  DecisionTreeRegressor
mean_absolute_error                     0.00
mean_squared_error                      0.00
Name: 50, dtype: object


In [41]:
# Set up the tracking URI
mlflow.set_tracking_uri("https://dagshub.com/bhargavdvs2001/Group12_Prog_DB_project.mlflow")

# Fetch the experiment
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("twitch_regression_experiment")
experiment_id = experiment.experiment_id

# Fetch all runs in the experiment
runs = client.search_runs(experiment_ids=[experiment_id])

In [42]:
# Fetch all runs in the experiment
runs = client.search_runs(experiment_ids=[experiment_id])

# Extract metrics from runs
metrics = []
for run in runs:
    # Using get method with defaults to avoid KeyError
    model_name = run.data.params.get("model", "Unknown Model")
    mae = run.data.metrics.get("mean_absolute_error", float('inf'))  # Use inf to indicate missing data
    mse = run.data.metrics.get("mean_squared_error", float('inf'))
    
    run_data = {
        "run_id": run.info.run_id,
        "model": model_name,
        "mean_absolute_error": mae,
        "mean_squared_error": mse
    }
    metrics.append(run_data)

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics)

# Ensure DataFrame is not empty before proceeding
if metrics_df.empty:
    print("No data available for evaluation.")
else:
    print("Fetched metrics:\n", metrics_df)

    # Identify the best model based on MAE
    best_model_mae = metrics_df.loc[metrics_df['mean_absolute_error'].idxmin()]
    best_run_id_mae = best_model_mae['run_id']
    print("Best model based on MAE:\n", best_model_mae)

    # Identify the best model based on MSE
    best_model_mse = metrics_df.loc[metrics_df['mean_squared_error'].idxmin()]
    best_run_id_mse = best_model_mse['run_id']
    print("Best model based on MSE:\n", best_model_mse)

    # Function to log the best model with retry logic for artifact download
    def log_best_model_with_retry(best_run_id, model_name, params, metrics):
        # Load the model from the best run with retry logic
        model_uri = f"runs:/{best_run_id}/model"
        retries = 5
        for i in range(retries):
            try:
                best_model = mlflow.sklearn.load_model(model_uri)
                break
            except RequestException as e:
                print(f"Attempt {i + 1} to load model from run ID {best_run_id} failed: {e}")
                if i < retries - 1:
                    time.sleep(2 ** i)  # Exponential backoff
                else:
                    print(f"Failed to load model after {retries} attempts.")
                    return


Fetched metrics:
                                run_id  \
0    7f0a66cbf90c4ca0bc0209188210e6ff   
1    d7b22b0e17f24acda0ca4e2e72a8ffc7   
2    30830011da714f91ad17591249157f07   
3    a064362a39e74bb192d18f28cb9f0e18   
4    5dd16a82f8154a1893b14daf2275436f   
..                                ...   
335  ce31af43d4f043e2b7e97f3d0fa57d94   
336  4c6c721df4c74f35ba96f936ce8aaf6e   
337  eeba28bf5ee44c25bc1752972904c3d3   
338  45ebb78e83684af9bf5c5421bdfd2679   
339  d6afc55b350342eb9a6561bca387692f   

                                           model  mean_absolute_error  \
0    LinearRegression with PCA (n_components=10)              2308.83   
1                                  Unknown Model                  inf   
2     LinearRegression with PCA (n_components=5)              2334.09   
3                                  Unknown Model                  inf   
4     LinearRegression with PCA (n_components=2)              2455.71   
..                                           ...   

In [43]:
mlflow.autolog()


2024/05/17 09:17:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/05/17 09:17:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/05/17 09:17:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2024/05/17 09:17:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [44]:

# Set up the tracking URI
mlflow.set_tracking_uri("https://dagshub.com/bhargavdvs2001/Group12_Prog_DB_project.mlflow")

# Fetch the experiment
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("twitch_regression_experiment")
experiment_id = experiment.experiment_id

# Fetch all runs in the experiment
runs = client.search_runs(experiment_ids=[experiment_id])

# Extract metrics from runs
metrics = []
for run in runs:
    # Using get method with defaults to avoid KeyError
    model_name = run.data.params.get("model", "Unknown Model")
    mae = run.data.metrics.get("mean_absolute_error", float('inf'))  # Use inf to indicate missing data
    mse = run.data.metrics.get("mean_squared_error", float('inf'))
    
    run_data = {
        "run_id": run.info.run_id,
        "model": model_name,
        "mean_absolute_error": mae,
        "mean_squared_error": mse
    }
    metrics.append(run_data)

# Convert to DataFrame
metrics_df = pd.DataFrame(metrics)

# Ensure DataFrame is not empty before proceeding
if metrics_df.empty:
    print("No data available for evaluation.")
else:
    print("Fetched metrics:\n", metrics_df)

    # Identify the best model based on MAE
    best_model_mae = metrics_df.loc[metrics_df['mean_absolute_error'].idxmin()]
    best_run_id_mae = best_model_mae['run_id']
    print("Best model based on MAE:\n", best_model_mae)

    # Identify the best model based on MSE
    best_model_mse = metrics_df.loc[metrics_df['mean_squared_error'].idxmin()]
    best_run_id_mse = best_model_mse['run_id']
    print("Best model based on MSE:\n", best_model_mse)


Fetched metrics:
                                run_id  \
0    7f0a66cbf90c4ca0bc0209188210e6ff   
1    d7b22b0e17f24acda0ca4e2e72a8ffc7   
2    30830011da714f91ad17591249157f07   
3    a064362a39e74bb192d18f28cb9f0e18   
4    5dd16a82f8154a1893b14daf2275436f   
..                                ...   
335  ce31af43d4f043e2b7e97f3d0fa57d94   
336  4c6c721df4c74f35ba96f936ce8aaf6e   
337  eeba28bf5ee44c25bc1752972904c3d3   
338  45ebb78e83684af9bf5c5421bdfd2679   
339  d6afc55b350342eb9a6561bca387692f   

                                           model  mean_absolute_error  \
0    LinearRegression with PCA (n_components=10)              2308.83   
1                                  Unknown Model                  inf   
2     LinearRegression with PCA (n_components=5)              2334.09   
3                                  Unknown Model                  inf   
4     LinearRegression with PCA (n_components=2)              2455.71   
..                                           ...   