In [None]:
# Run this cell to import the modules you require
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the weather observations from the CSV file in the working directory.
weather = pd.read_csv("london_weather.csv")

# Build a per-column overview: column name, dtype and number of non-missing values.
column_dtypes = weather.dtypes
non_null_counts = weather.notna().sum()
summary = pd.DataFrame(
    {
        "Column": weather.columns,
        "DataType": column_dtypes,
        "NonNullCount": non_null_counts,
    }
)
print(summary)

In [None]:
# Parse `date` with the %Y%m%d format; values that fail to parse become NaT
# because errors="coerce" is requested.
parsed_dates = pd.to_datetime(weather["date"], format="%Y%m%d", errors="coerce")
weather["date"] = parsed_dates
print(weather.dtypes)

In [None]:
# Split the parsed date into separate day / month / year columns, then discard
# the original `date` column.
date_accessor = weather["date"].dt
for date_part in ("day", "month", "year"):
    weather[date_part] = getattr(date_accessor, date_part)
weather = weather.drop(columns=["date"])

In [None]:
# Line plot of mean temperature against calendar month.
sns.lineplot(data=weather, x="month", y="mean_temp")

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `weather`.
correlation_matrix = weather.corr()
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
# Rows without a target value cannot be used for fitting or scoring, so drop them.
weather = weather.dropna(subset=['mean_temp'])

# Feature columns fed to the models; missing feature values are left in place
# here (only the target is required to be non-null at this point).
feature_columns = [
    "day",
    "month",
    "cloud_cover",
    "precipitation",
    "global_radiation",
    "pressure",
    "snow_depth",
]
X = weather[feature_columns]
y = weather['mean_temp']

# No further filtering on y.notna() is needed: the dropna above already
# guarantees that every remaining row has a non-null `mean_temp`.
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two-step preprocessing: mean imputation followed by standard scaling.
imputation_step = ('imputer', SimpleImputer(strategy='mean'))
scaling_step = ('scaler', StandardScaler())
preprocessing_pipeline = Pipeline([imputation_step, scaling_step])

# Fit on the training split only, then apply the fitted pipeline to the test split.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)
print(X_train_preprocessed.shape, X_test_preprocessed.shape)

In [None]:
# Hyperparameter grids used by the decision-tree and random-forest cells.
tree_depths = [2, 5, 10, 20]
forest_depths = list(tree_depths)  # same depth grid, kept as an independent list
n_estimators_list = [50, 100]

# Select the MLflow experiment that the following runs are logged under.
mlflow.set_experiment("London_Mean_Temperature_Prediction")

In [None]:
# Train a linear regression inside its own MLflow run and log the model,
# its type and its test RMSE.
with mlflow.start_run(run_name="Linear_Regression"):
    lr = LinearRegression().fit(X_train_preprocessed, y_train)
    y_pred = lr.predict(X_test_preprocessed)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    mlflow.sklearn.log_model(lr, name="model")
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)
    print(f"Linear Regression RMSE: {rmse}")

In [None]:
# Train one decision tree per depth in `tree_depths`, each in its own MLflow
# run, logging the fitted model, its hyperparameters and its test RMSE.
# The list is iterated directly: the positional index was never used.
for depth in tree_depths:
    run_name = f"DecisionTree_depth_{depth}"
    with mlflow.start_run(run_name=run_name):
        # random_state is fixed so repeated executions build the same tree.
        dt = DecisionTreeRegressor(max_depth=depth, random_state=42)
        dt.fit(X_train_preprocessed, y_train)
        y_pred = dt.predict(X_test_preprocessed)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        mlflow.sklearn.log_model(dt, name="model")
        mlflow.log_param("model_type", "DecisionTreeRegressor")
        mlflow.log_param("max_depth", depth)
        mlflow.log_metric("rmse", rmse)
        print(f"Decision Tree (depth={depth}) RMSE: {rmse}")

In [None]:
# Train a random forest for every (max_depth, n_estimators) combination, each
# in its own MLflow run, logging the fitted model, its hyperparameters and its
# test RMSE. The depth list is iterated directly: the positional index was
# never used.
for depth in forest_depths:
    for n_estimators in n_estimators_list:
        run_name = f"RandomForest_depth_{depth}_estimators_{n_estimators}"
        with mlflow.start_run(run_name=run_name):
            # random_state is fixed so repeated executions build the same forest.
            rf = RandomForestRegressor(max_depth=depth, n_estimators=n_estimators, random_state=42)
            rf.fit(X_train_preprocessed, y_train)
            y_pred = rf.predict(X_test_preprocessed)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            mlflow.sklearn.log_model(rf, name="model")
            mlflow.log_param("model_type", "RandomForestRegressor")
            mlflow.log_param("max_depth", depth)
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_metric("rmse", rmse)
            print(f"Random Forest (depth={depth}, estimators={n_estimators}) RMSE: {rmse}")

In [None]:
# Look the experiment up by the name passed to mlflow.set_experiment earlier
# instead of hard-coding an ID: experiment IDs are assigned by the tracking
# store, so a literal ID only matches the store it was copied from.
experiment = mlflow.get_experiment_by_name("London_Mean_Temperature_Prediction")
if experiment is None:
    raise RuntimeError(
        "MLflow experiment 'London_Mean_Temperature_Prediction' was not found; "
        "run the training cells first."
    )
experiment_id = experiment.experiment_id

# Fetch every run recorded under that experiment as a DataFrame.
experiment_results = mlflow.search_runs(experiment_ids=[experiment_id])
print(experiment_results.head())

In [None]:
# Order the runs by test RMSE, ascending (lowest error first), and show only
# the run name and score. The earlier bare `.head()` expression was dropped:
# it was not the cell's last statement, so its result was never displayed.
best_runs = experiment_results.sort_values("metrics.rmse")
best_runs[['tags.mlflow.runName', 'metrics.rmse']]