In [None]:
# In this Jupyter Notebook we show how to use the Boto3 library to:
#       - Make a connection to a MinIO client and bucket.
#       - Download a specific file.
#       - Use the now local file in a "random forrest example".
#       - Create a pickle out of the model.
#       - Upload the pickle back to the bucket.

In [4]:
# #!/usr/bin/env/python
import os
import boto3
from botocore.client import Config
import pandas as pd

s3 = boto3.resource('s3',
                    endpoint_url='https://s3.deltares.nl',
                    aws_access_key_id='<<ACCESS KEY ID>>',
                    aws_secret_access_key='<<SECRET ACCESS KEY>>',
                    config=Config(signature_version='s3v4'),
                    region_name='eu-west-1')

bucket_name = 'minio-training'
file_path = 'timeseries_backup.json'

s3.Bucket(bucket_name).download_file(file_path, 'timeseries_backup.json')

In [2]:
# Load dataset into Pandas DataFrame
dataset = pd.read_csv(file_path)

# Explore dataset
print(dataset.head())  # View first few rows of data
print(dataset.describe())  # Statistical summary of the dataset

   Student_id  Age      Grade Employed  marks
0           1   19  1st Class      yes     29
1           2   20  2nd Class       no     41
2           3   18  1st Class       no     57
3           4   21  2nd Class       no     29
4           5   19  1st Class       no     57
       Student_id         Age       marks
count  232.000000  232.000000  232.000000
mean   116.500000   19.896552   58.689655
std     67.116814    1.030944   23.200580
min      1.000000   18.000000   20.000000
25%     58.750000   19.000000   37.000000
50%    116.500000   20.000000   59.500000
75%    174.250000   21.000000   77.000000
max    232.000000   22.000000   98.000000


In [3]:
# Perform data preprocessing (e.g., handle missing values, encode categorical variables)
# Preprocessing steps here...

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

dataset = dataset.drop('Employed', axis=1)

# Prepare data for modeling
X = dataset.drop('Grade', axis=1)  # Features (excluding target column)
y = dataset['Age']  # Target variable

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict using the trained model
predictions = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, predictions)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 1.0


In [4]:
import joblib

# Save the trained model using joblib
model_file_path = 'model.pkl'  # Replace with your desired file path and name

joblib.dump(model, model_file_path)

['model.pkl']

In [5]:
# Upload the model file to MinIO bucket
s3.Bucket(bucket_name).upload_file(model_file_path, model_file_path)