In [None]:

except Exception as e:
    print(f"An error occurred: {e}")


stationary_bikes = pd.read_csv("local-file-name")

from sklearn.model_selection import train_test_split

# picks what columns to keep and target variable

X, y = (
    stationary_bikes[
        [
            "IsOpen",
            "Long",
            "Lat",
            "Year",
            "Month",
            "Day",
            "Hour",
            "Temperature",
            "Humidity",
            "Wind_Speed",
            "Precipitation",
            "Visibility",
            "Snowfall",
            "IsWeekend",
        ]
    ],
    stationary_bikes["TotalAvailableBikes"],
)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

val = pd.concat([pd.Series(y_val, index=X_val.index, name='TotalAvailableBikes', dtype=int), X_val], axis=1)
train = pd.concat([pd.Series(y_train, index=X_train.index, name='TotalAvailableBikes', dtype=int), X_train], axis=1)
test = pd.concat([pd.Series(y_test, index=X_test.index, name='TotalAvailableBikes', dtype=int), X_test], axis=1)

# saves and uploads train, test & val to s3 bucket

train.to_csv('train.csv', index=False, header=True)
test.to_csv('test.csv', index=False, header=True)
val.to_csv('val.csv', index=False, header=True)

import sagemaker, boto3, os

bucket = "demo-sagemaker-bike-availability-prediction"

boto3.Session().resource('s3').Bucket(bucket).Object('data/train.csv').upload_file('train.csv')

boto3.Session().resource('s3').Bucket(bucket).Object('data/test.csv').upload_file('test.csv')

boto3.Session().resource('s3').Bucket(bucket).Object('data/val.csv').upload_file('validation.csv')

import sagemaker

region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

%%writefile rf_training_job.py

import argparse
import os
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import joblib

if __name__ == '__main__':
    
    # Argument parsing for SageMaker
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    args, _ = parser.parse_known_args()
    
    # loads data from s3 bucket
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'))
    
    # split data into x and y 
    X_train, y_train = train_data.drop('TotalAvailableBikes', axis=1), train_data['TotalAvailableBikes']
    
    # removes all rows with nan values

    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    
    # saves model
    joblib.dump(model, 'decision_tree_model.pkl')

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

role = get_execution_role()

script_path = 'rf_training_job.py'  

sklearn_estimator = SKLearn(
    entry_point=script_path,
    role=role,
    instance_type="ml.m5.2xlarge",
    framework_version="0.23-1"
)

train_input = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket}/data/train.csv", content_type="csv")
val_input = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket}/data/val.csv", content_type="csv")

sklearn_estimator.fit({'train': train_input, 'validation': val_input})

import io

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Existing metric
print(f"Decision Tree MSE: {mean_squared_error(y_test, y_pred)}")

# Additional metrics
print(f"Decision Tree MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"Decision Tree R2 Score: {r2_score(y_test, y_pred)}")

# Save metrics to S3
metrics_data = {
    "MSE": [mean_squared_error(y_test, y_pred)],
    "MAE": [mean_absolute_error(y_test, y_pred)],
    "R2": [r2_score(y_test, y_pred)]
}

# Convert dictionary to DataFrame
metrics_df = pd.DataFrame(metrics_data)  # Note the corrected class name

# Convert your DataFrame to CSV in-memory
csv_buffer = io.StringIO()
metrics_df.to_csv(csv_buffer, index=False)

# Upload the in-memory CSV directly to the S3 bucket's specific folder
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'data/prediction/decision-tree-metrics/metrics.csv').put(Body=csv_buffer.getvalue())

# saves model to s3 folder "decision-tree-model"

import pickle

# Step 1: Save the trained model to a pickle file
with open('decision_tree_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Step 2: Upload the pickle file to S3
s3 = boto3.client('s3')
s3.upload_file('decision_tree_model.pkl', 'demo-sagemaker-bike-availability-prediction', 'data/decision-tree-model/decision_tree_model.pkl')


In [None]:
# Train the model with RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"RF MSE: {mean_squared_error(y_test, y_pred)}")
print(f"RF MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"RF R2 Score: {r2_score(y_test, y_pred)}")

# Save metrics to S3
metrics_data = {
    "MSE": [mean_squared_error(y_test, y_pred)],
    "MAE": [mean_absolute_error(y_test, y_pred)],
    "R2": [r2_score(y_test, y_pred)]
}
metrics_df = pd.DataFrame(metrics_data)
csv_buffer = io.StringIO()
metrics_df.to_csv(csv_buffer, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'data/prediction/rf-metrics/metrics.csv').put(Body=csv_buffer.getvalue())

# Save the model to S3
model_filename = 'rf_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
s3.upload_file(model_filename, bucket, 'data/rf-model/rf_model.pkl')


In [None]:
    # Evaluate
    y_pred = model.predict(X_test)
    metrics = {
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred)
    }
    print(metrics)
    
    # Convert metrics to DataFrame and save to CSV
    metrics_df = pd.DataFrame([metrics])
    metrics_csv_path = os.path.join(args.model_dir, 'metrics.csv')
    metrics_df.to_csv(metrics_csv_path, index=False)
    
    # Initialize the boto3 S3 resource
    s3 = boto3.resource('s3')

    # Upload metrics to S3
    try:
        s3.Object('demo-sagemaker-bike-availability-prediction', 'prediction/rf-metrics/metrics.csv').upload_file(metrics_csv_path)
    except Exception as e:
        print(f"Error uploading metrics to S3: {e}")

    # Save model locally
    local_model_path = os.path.join(args.model_dir, "rf_model.pkl")
    joblib.dump(model, local_model_path)

    # Compress the model into a .tar.gz file (for deployment)
    with tarfile.open(os.path.join(args.model_dir, 'rf_model.tar.gz'), 'w:gz') as tar:
        tar.add(local_model_path, arcname='rf_model.pkl')

    # Upload compressed model to S3
    compressed_model_path = os.path.join(args.model_dir, 'rf_model.tar.gz')
    try:
        s3.Object('demo-sagemaker-bike-availability-prediction', 'rf-model/rf_model.tar.gz').upload_file(compressed_model_path)
    except Exception as e:
        print(f"Error uploading to S3: {e}")

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# import random forest
from sklearn.ensemble import RandomForestRegressor

In [33]:
stationary_bikes = pd.read_csv("../../data/data_warehouse/StationaryStations.csv")
stationary_bikes.head()

Unnamed: 0,AvailableBikes,StationId,Distance,stationId,IsOpen,BikeIds,LastUpdate,Long,Lat,Name,...,Minute_x,timestamp,Temperature,Humidity,Wind_Speed,Precipitation,Visibility,Snowfall,TotalAvailableBikes,IsWeekend
0,1,270950259,1567,Norra Allégatan,True,['710976'],/Date(1694427009437+0200)/,11.953708,57.700368,Norra Allégatan,...,10,2023-09-11 10:00:00,19.7,74,11.9,0.0,24140.0,0.0,1,0
1,1,270950259,1567,Norra Allégatan,True,['710976'],/Date(1694427609379+0200)/,11.953708,57.700368,Norra Allégatan,...,20,2023-09-11 10:00:00,19.7,74,11.9,0.0,24140.0,0.0,1,0
2,1,270950259,1567,Norra Allégatan,True,['710976'],/Date(1694428209242+0200)/,11.953708,57.700368,Norra Allégatan,...,30,2023-09-11 10:00:00,19.7,74,11.9,0.0,24140.0,0.0,1,0
3,1,270950259,1567,Norra Allégatan,True,['710976'],/Date(1694428809480+0200)/,11.953708,57.700368,Norra Allégatan,...,40,2023-09-11 10:00:00,19.7,74,11.9,0.0,24140.0,0.0,1,0
4,1,270950259,1567,Norra Allégatan,True,['710976'],/Date(1694429409702+0200)/,11.953708,57.700368,Norra Allégatan,...,50,2023-09-11 10:00:00,19.7,74,11.9,0.0,24140.0,0.0,1,0


In [34]:
stationary_bikes.columns

Index(['AvailableBikes', 'StationId', 'Distance', 'stationId', 'IsOpen',
       'BikeIds', 'LastUpdate', 'Long', 'Lat', 'Name', 'Year', 'Month', 'Day',
       'Hour', 'Minute_x', 'timestamp', 'Temperature', 'Humidity',
       'Wind_Speed', 'Precipitation', 'Visibility', 'Snowfall',
       'TotalAvailableBikes', 'IsWeekend'],
      dtype='object')

In [35]:
X, y = (
    stationary_bikes[
        [
            "IsOpen",
            "Long",
            "Lat",
            "Year",
            "Month",
            "Day",
            "Hour",
            "Minute_x",
            "Temperature",
            "Humidity",
            "Wind_Speed",
            "Precipitation",
            "Visibility",
            "Snowfall",
            "IsWeekend",
        ]
    ],
    stationary_bikes["TotalAvailableBikes"],
)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((766243, 15), (191561, 15))

In [37]:
model_tree_reg = DecisionTreeRegressor()

In [38]:
model_tree_reg.fit(X_train, y_train)

In [39]:
# fig, ax = plt.figure(figsize=(16,8), dpi=100), plt.axes()
# tree.plot_tree(model_tree_reg, filled=True, ax=ax, feature_names=list(X.columns), impurity=False, rounded=True);

In [40]:
# print(tree.export_text(model_tree_reg, feature_names = list(X_train.columns)))

In [41]:
y_pred = model_tree_reg.predict(X_test)

In [42]:
mean_absolute_error(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))

(0.359290252191208, 1.0990105275060467)

In [43]:
print(f"Decision Tree MSE: {mean_squared_error(y_test, y_pred)}")

Decision Tree MSE: 1.207824139569119


In [44]:
pairs = zip(y_test, y_pred)
formatted_pairs = ["Test: {:.2f}, Pred: {:.2f}".format(test, pred) for test, pred in pairs]
long_string = "; ".join(formatted_pairs)

# To print the long string
print(long_string)

Test: 7.00, Pred: 7.00; Test: 2.00, Pred: 2.00; Test: 10.00, Pred: 10.00; Test: 11.00, Pred: 11.00; Test: 10.00, Pred: 10.00; Test: 32.00, Pred: 31.00; Test: 8.00, Pred: 8.00; Test: 14.00, Pred: 14.00; Test: 12.00, Pred: 12.00; Test: 17.00, Pred: 17.00; Test: 7.00, Pred: 8.00; Test: 8.00, Pred: 8.00; Test: 6.00, Pred: 6.00; Test: 5.00, Pred: 5.00; Test: 2.00, Pred: 2.00; Test: 5.00, Pred: 5.00; Test: 5.00, Pred: 5.00; Test: 14.00, Pred: 14.00; Test: 7.00, Pred: 7.00; Test: 9.00, Pred: 9.00; Test: 6.00, Pred: 6.00; Test: 8.00, Pred: 8.00; Test: 15.00, Pred: 15.00; Test: 12.00, Pred: 12.00; Test: 1.00, Pred: 1.00; Test: 12.00, Pred: 12.00; Test: 15.00, Pred: 15.00; Test: 3.00, Pred: 4.00; Test: 15.00, Pred: 15.00; Test: 11.00, Pred: 11.00; Test: 8.00, Pred: 8.00; Test: 18.00, Pred: 18.00; Test: 10.00, Pred: 10.00; Test: 4.00, Pred: 3.00; Test: 4.00, Pred: 4.00; Test: 5.00, Pred: 5.00; Test: 4.00, Pred: 5.00; Test: 10.00, Pred: 8.00; Test: 11.00, Pred: 10.00; Test: 7.00, Pred: 7.00; Test:

In [45]:
print(y_test)

291247     7
515894     2
891078    10
167120    11
830511    10
          ..
346423     8
204709    23
850394    21
572656    14
460903     3
Name: TotalAvailableBikes, Length: 191561, dtype: int64


In [46]:
print(y_pred)

[ 7.  2. 10. ... 22. 14.  3.]


In [47]:
# Assuming y_test and y_pred are your test and prediction numpy arrays or lists
pairs = zip(y_test, y_pred)
formatted_pairs = ["Test: {:.2f}, Pred: {:.2f}".format(test, pred) for test, pred in pairs]
long_string = "; ".join(formatted_pairs)

# To print the long string
print(long_string)

Test: 7.00, Pred: 7.00; Test: 2.00, Pred: 2.00; Test: 10.00, Pred: 10.00; Test: 11.00, Pred: 11.00; Test: 10.00, Pred: 10.00; Test: 32.00, Pred: 31.00; Test: 8.00, Pred: 8.00; Test: 14.00, Pred: 14.00; Test: 12.00, Pred: 12.00; Test: 17.00, Pred: 17.00; Test: 7.00, Pred: 8.00; Test: 8.00, Pred: 8.00; Test: 6.00, Pred: 6.00; Test: 5.00, Pred: 5.00; Test: 2.00, Pred: 2.00; Test: 5.00, Pred: 5.00; Test: 5.00, Pred: 5.00; Test: 14.00, Pred: 14.00; Test: 7.00, Pred: 7.00; Test: 9.00, Pred: 9.00; Test: 6.00, Pred: 6.00; Test: 8.00, Pred: 8.00; Test: 15.00, Pred: 15.00; Test: 12.00, Pred: 12.00; Test: 1.00, Pred: 1.00; Test: 12.00, Pred: 12.00; Test: 15.00, Pred: 15.00; Test: 3.00, Pred: 4.00; Test: 15.00, Pred: 15.00; Test: 11.00, Pred: 11.00; Test: 8.00, Pred: 8.00; Test: 18.00, Pred: 18.00; Test: 10.00, Pred: 10.00; Test: 4.00, Pred: 3.00; Test: 4.00, Pred: 4.00; Test: 5.00, Pred: 5.00; Test: 4.00, Pred: 5.00; Test: 10.00, Pred: 8.00; Test: 11.00, Pred: 10.00; Test: 7.00, Pred: 7.00; Test:

#### Random forest


In [48]:
from sklearn.ensemble import RandomForestRegressor

In [49]:
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
random_forest.fit(X_train, y_train)

In [50]:
y_pred2 = random_forest.predict(X_test)

In [51]:
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2}")

R^2: 0.9737857011366374


In [52]:
r2 = r2_score(y_test, y_pred2)
print(f"R^2: {r2}")

R^2: 0.9827435102189792


In [53]:
mae = mean_absolute_error(y_test, y_pred2)

In [54]:
mae

0.39404290497875305

In [55]:
X_test.columns

Index(['IsOpen', 'Long', 'Lat', 'Year', 'Month', 'Day', 'Hour', 'Minute_x',
       'Temperature', 'Humidity', 'Wind_Speed', 'Precipitation', 'Visibility',
       'Snowfall', 'IsWeekend'],
      dtype='object')

In [56]:
# decision_tree_mae = mean_absolute_error(y_test, y_pred)
# random_forest_mae = mean_absolute_error(y_test, y_pred2)
# print(f"Decision Tree MSE: {decision_tree_mae}")
# print(f"Random Forest MSE: {random_forest_mae}")

In [57]:
# import os

# # Make the directory
# directory_path = "../../data/metrics"
# if not os.path.exists(directory_path):
#     os.makedirs(directory_path)

In [58]:
# df = pd.DataFrame(
#     {"Model": ["Decision Tree", "Random Forest"], "MAE": [decision_tree_mae, random_forest_mae]}
# )

# df.to_csv(f"{directory_path}/decision_tree_metrics.csv", index=False)

In [59]:
# asd

In [60]:
# plt.figure(figsize=(10, 6))


# plt.scatter(y_test, y_test, color='red', alpha=0.5, label='Actual Values')

# plt.scatter(y_test, y_pred2, color='blue', alpha=0.5, label='Predicted Values')

# plt.xlabel('Actual Values')
# plt.ylabel('Predicted/Actual Values')
# plt.title('Random Forest: Actual vs Predicted')
# plt.legend()


# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='green')

# plt.savefig(f'{directory_path}/random_forest_actual_vs_predicted_colors.png')

In [61]:
# plt.figure(figsize=(10, 6))

# # Plot actual values in red
# plt.scatter(y_test, y_test, color='red', alpha=0.5, label='Actual Values')

# # Plot predicted values in blue
# plt.scatter(y_test, y_pred, color='blue', alpha=0.5, label='Predicted Values')

# plt.xlabel('Actual Values')
# plt.ylabel('Predicted/Actual Values')
# plt.title('Decision Tree: Actual vs Predicted')
# plt.legend()

# # Diagonal line
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='green')

# # Save the plot
# plt.savefig(f'{directory_path}/decision_tree_actual_vs_predicted_colors.png')

In [86]:
molndalsbro = stationary_bikes[stationary_bikes["Name"] == "Skanstorget"]
molndalsbro.head()

Unnamed: 0,AvailableBikes,StationId,Distance,stationId,IsOpen,BikeIds,LastUpdate,Long,Lat,Name,...,Minute_x,timestamp,Temperature,Humidity,Wind_Speed,Precipitation,Visibility,Snowfall,TotalAvailableBikes,IsWeekend
884542,6,30921902,1730,Skanstorget,True,"['711713', '710945', '710934', '710800', '7107...",/Date(1691570361789+0200)/,11.958664,57.695921,Skanstorget,...,39,2023-08-09 08:00:00,12.5,88,24.8,1.3,16960.0,0.0,6,0
884543,6,30921902,1730,Skanstorget,True,"['711713', '710945', '710934', '710800', '7107...",/Date(1691570408591+0200)/,11.958664,57.695921,Skanstorget,...,40,2023-08-09 08:00:00,12.5,88,24.8,1.3,16960.0,0.0,6,0
884544,6,30921902,1730,Skanstorget,True,"['711713', '710945', '710934', '710800', '7107...",/Date(1691570500601+0200)/,11.958664,57.695921,Skanstorget,...,41,2023-08-09 08:00:00,12.5,88,24.8,1.3,16960.0,0.0,6,0
884545,6,30921902,1730,Skanstorget,True,"['711713', '710945', '710934', '710800', '7107...",/Date(1691571009358+0200)/,11.958664,57.695921,Skanstorget,...,50,2023-08-09 08:00:00,12.5,88,24.8,1.3,16960.0,0.0,6,0
884546,6,30921902,1730,Skanstorget,True,"['711713', '710945', '710934', '710800', '7107...",/Date(1691571609440+0200)/,11.958664,57.695921,Skanstorget,...,0,2023-08-09 09:00:00,12.7,84,20.9,1.5,13600.0,0.0,6,0


In [63]:
def convert_to_gusts_kmh(wind_speed_ms):
    wind_speed_kmh = wind_speed_ms * 3.6  # 1 m/s = 3.6 km/h
    return wind_speed_kmh

In [64]:
wind_speed_ms = 3
wind_speed_kmh = convert_to_gusts_kmh(wind_speed_ms)
print(f"The wind speed in gusts km/h is: {wind_speed_kmh}")

The wind speed in gusts km/h is: 10.8


In [65]:
def create_dataframe(
    IsOpen,
    Long,
    Lat,
    Year,
    Month,
    Day,
    Hour,
    Minute_x,
    Temperature,
    Humidity,
    Wind_Speed,
    Precipitation,
    Visibility,
    Snowfall,
    IsWeekend,
):
    df = pd.DataFrame(
        {
            "IsOpen": [IsOpen],
            "Long": [Long],
            "Lat": [Lat],
            "Year": [Year],
            "Month": [Month],
            "Day": [Day],
            "Hour": [Hour],
            "Minute_x": [Minute_x],
            "Temperature": [Temperature],
            "Humidity": [Humidity],
            "Wind_Speed": [Wind_Speed],
            "Precipitation": [Precipitation],
            "Visibility": [Visibility],
            "Snowfall": [Snowfall],
            "IsWeekend": [IsWeekend],
        }
    )
    return df

In [66]:
import requests
from datetime import datetime


def fetch_and_convert_weather(Long, Lat, Precipitation, Humidity, Visibility):
    response = requests.get(
        f"https://api.open-meteo.com/v1/forecast?latitude={Lat}&longitude={Long}&current_weather=true"
    )
    data = response.json()
    current_weather = data["current_weather"]
    print(current_weather)

    dt_obj = datetime.now()
    Year, Month, Day, Hour, Minute_x = (
        dt_obj.year,
        dt_obj.month,
        dt_obj.day,
        dt_obj.hour,
        dt_obj.minute,
    )

    windspeed_kmh = current_weather["windspeed"]

    Temperature = current_weather.get("temperature", 0)
    Humidity = Humidity
    Precipitation = Precipitation
    Visibility = Visibility
    Snowfall = 0
    IsWeekend = 0
    IsOpen = True

    df = create_dataframe(
        IsOpen,
        Long,
        Lat,
        Year,
        Month,
        Day,
        Hour,
        Minute_x,
        Temperature,
        Humidity,
        windspeed_kmh,
        Precipitation,
        Visibility,
        Snowfall,
        IsWeekend,
    )

    return df

In [90]:

aa = fetch_and_convert_weather(Long=11.958664, Lat=57.695921, Precipitation=0.4, Visibility=15000, Humidity=94)
a
,a

{'time': '2023-10-06T09:15', 'temperature': 14.8, 'windspeed': 32.4, 'winddirection': 226, 'is_day': 1, 'weathercode': 61}


Unnamed: 0,IsOpen,Long,Lat,Year,Month,Day,Hour,Minute_x,Temperature,Humidity,Wind_Speed,Precipitation,Visibility,Snowfall,IsWeekend
0,True,11.958664,57.695921,2023,10,6,11,28,14.8,94,32.4,0.4,15000,0,0


In [91]:
test_prediction = model_tree_reg.predict(aa)

In [92]:
test_prediction

array([8.])

In [71]:
asd

NameError: name 'asd' is not defined

#### Using grid search for tuning parameters and choosing the best


In [None]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"],
}

In [None]:
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3)

In [None]:
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV


n_estimators = [int(x) for x in np.linspace(start=50, stop=200, num=20)]
max_features = ["auto", "sqrt"]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [50,
                  57,
                  65,
                  73,
                  81,
                  89,
                  97,
                  105,
                  113,
                  121,
                  128,
                  136,
                  144,
                  152,
                  160,
                  168,
                  176,
                  184,
                  192,
                  200]}


In [None]:
rf = RandomForestRegressor()

In [None]:
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
rf_random.best_params_

{'n_estimators': 81,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [None]:
best_rf = RandomForestRegressor(
    n_estimators=81,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    max_depth=None,
    bootstrap=False,
)

In [None]:
best_rf.fit(X_train, y_train)

In [None]:
score = best_rf.score(X_test, y_test)
print(f"Test Score: {score}")

Test Score: 0.9275737853927002


In [None]:
y_pred_best_fit = best_rf.predict(X_test)

In [None]:
r2 = r2_score(y_test, y_pred_best_fit)
print(f"R^2: {r2}")

R^2: 0.9275737853927002
