# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import random
import os
import time
from sklearn.model_selection import train_test_split
import boto3
import re
import sagemaker
from sagemaker.estimator import Estimator
from datetime import datetime
from time import gmtime, strftime
from boto3.dynamodb.conditions import Key
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# 1.0 Collect

In [2]:
session = boto3.Session()
dynamodb = session.resource('dynamodb')
table = dynamodb.Table('WWUserData')
response = table.scan()
df_loaded = pd.DataFrame(response['Items'])

In [3]:
df_loaded = df_loaded.rename(columns = {'latitude': 'LATITUDE'
                                        , 'longitude': 'LONGITUDE'
                                        , 'timestamp': 'TIMESTAMP'
                                        , 'user_answer': 'USER_ANSWER'
                                        , 'if_wrong_answer': 'IF_WRONG_ANSWER'
                                        , 'answer': 'PREDICT_WEATHER_CATEGORY'})

# 2.0 Data Cleaning

In [4]:
df_cleaned = df_loaded.copy()

## 2.1 Dtypes

In [5]:
df_cleaned = df_cleaned[~(df_loaded['USER_ANSWER'].isna())]
df_cleaned = df_cleaned[['LONGITUDE', 'LONGITUDE', 'TIMESTAMP', 'USER_ANSWER', 'IF_WRONG_ANSWER', 'PREDICT_WEATHER_CATEGORY']]
df_cleaned['IF_WRONG_ANSWER'] = df_cleaned['IF_WRONG_ANSWER'].str.upper()
df_cleaned['TIMESTAMP'] = df_cleaned['TIMESTAMP'].astype(int)
df_cleaned

Unnamed: 0,LONGITUDE,LONGITUDE.1,TIMESTAMP,USER_ANSWER,IF_WRONG_ANSWER,PREDICT_WEATHER_CATEGORY
1,-54.31367509090908,-54.31367509090908,1725703494835,False,CLOUDY,RAINY
2,-53.9190463,-53.9190463,1721578100537,False,SUNNY,SNOW
4,-54.3137456,-54.3137456,1724521237884,True,-,SUNNY
5,-54.3059728,-54.3059728,1721500718235,False,SUNNY,RAINY
8,-54.3059728,-54.3059728,1721500637576,False,SUNNY,RAINY
9,-38.3239011,-38.3239011,1724523494957,False,SUNNY,RAINY
10,-54.31370207716452,-54.31370207716452,1724105663069,True,-,CLOUDY
12,-54.31312966635555,-54.31312966635555,1724583076733,False,SUNNY,SNOW
14,-38.50500673600253,-38.50500673600253,1724522652150,False,SUNNY,RAINY
16,-51.1570263,-51.1570263,1724525675072,False,CLOUDY,RAINY


## 2.2 Columns

In [6]:
df_cleaned['PREDICT_WEATHER_CATEGORY'] = df_cleaned.apply(lambda x : x['IF_WRONG_ANSWER'] if x['IF_WRONG_ANSWER'] !=  '-' else x['PREDICT_WEATHER_CATEGORY'], axis = 1)
df_cleaned = df_cleaned.drop(columns = ['USER_ANSWER', 'IF_WRONG_ANSWER'])
df_cleaned

Unnamed: 0,LONGITUDE,LONGITUDE.1,TIMESTAMP,PREDICT_WEATHER_CATEGORY
1,-54.31367509090908,-54.31367509090908,1725703494835,CLOUDY
2,-53.9190463,-53.9190463,1721578100537,SUNNY
4,-54.3137456,-54.3137456,1724521237884,SUNNY
5,-54.3059728,-54.3059728,1721500718235,SUNNY
8,-54.3059728,-54.3059728,1721500637576,SUNNY
9,-38.3239011,-38.3239011,1724523494957,SUNNY
10,-54.31370207716452,-54.31370207716452,1724105663069,CLOUDY
12,-54.31312966635555,-54.31312966635555,1724583076733,SUNNY
14,-38.50500673600253,-38.50500673600253,1724522652150,SUNNY
16,-51.1570263,-51.1570263,1724525675072,CLOUDY


## 2.3 Transformations

In [7]:
df_transformed = df_cleaned.copy()

In [8]:
index_to_class = {0: 'CLOUDY', 1: 'FOG', 2: 'RAINY', 3: 'SNOW', 4: 'SUNNY'}
CLASS_INDEX = {'CLOUDY': 0, 'FOG': 1, 'RAINY': 2, 'SNOW': 3, 'SUNNY': 4}

df_transformed['PREDICT_WEATHER_CATEGORY'] = df_transformed['PREDICT_WEATHER_CATEGORY'].map(CLASS_INDEX)
df_transformed

Unnamed: 0,LONGITUDE,LONGITUDE.1,TIMESTAMP,PREDICT_WEATHER_CATEGORY
1,-54.31367509090908,-54.31367509090908,1725703494835,0
2,-53.9190463,-53.9190463,1721578100537,4
4,-54.3137456,-54.3137456,1724521237884,4
5,-54.3059728,-54.3059728,1721500718235,4
8,-54.3059728,-54.3059728,1721500637576,4
9,-38.3239011,-38.3239011,1724523494957,4
10,-54.31370207716452,-54.31370207716452,1724105663069,0
12,-54.31312966635555,-54.31312966635555,1724583076733,4
14,-38.50500673600253,-38.50500673600253,1724522652150,4
16,-51.1570263,-51.1570263,1724525675072,0


# 3. 0 Feature Engineering

## 3.1 Time

In [9]:
def transform_dataset(df_transformed):
    """
    Transforms the original DataFrame by converting UNIX timestamps to datetime and extracting additional time-based features.

    Parameters:
    df_transformed (pd.DataFrame): The original DataFrame containing a 'TIMESTAMP' column with UNIX timestamps.

    Returns:
    pd.DataFrame: The transformed DataFrame with additional columns for date and time features.
    """

    def convert_unix_timestamp(timestamp):
        """
        Converts a UNIX timestamp to a datetime object.

        Parameters:
        timestamp (int): The UNIX timestamp to convert.

        Returns:
        datetime: The corresponding datetime object.
        """
        return datetime.fromtimestamp(timestamp/1000)

    # Convert the UNIX timestamps to datetime objects
    df_transformed['DATE'] = df_transformed['TIMESTAMP'].apply(convert_unix_timestamp)

    # Extract additional time-based features from the 'DATE' column
    df_transformed['DAY'] = df_transformed['DATE'].dt.day
    df_transformed['MONTH'] = df_transformed['DATE'].dt.month
    df_transformed['YEAR'] = df_transformed['DATE'].dt.year
    df_transformed['WEEK_OF_YEAR'] = df_transformed['DATE'].dt.isocalendar().week
    df_transformed['HOUR'] = df_transformed['DATE'].dt.hour
    
    # Excluding unecessary columns
    df_transformed = df_transformed.drop(columns = ['DATE', 'TIMESTAMP'])

    return df_transformed

dataset_fe = transform_dataset(df_transformed)
dataset_fe

Unnamed: 0,LONGITUDE,LONGITUDE.1,PREDICT_WEATHER_CATEGORY,DAY,MONTH,YEAR,WEEK_OF_YEAR,HOUR
1,-54.31367509090908,-54.31367509090908,0,7,9,2024,36,10
2,-53.9190463,-53.9190463,4,21,7,2024,29,16
4,-54.3137456,-54.3137456,4,24,8,2024,34,17
5,-54.3059728,-54.3059728,4,20,7,2024,29,18
8,-54.3059728,-54.3059728,4,20,7,2024,29,18
9,-38.3239011,-38.3239011,4,24,8,2024,34,18
10,-54.31370207716452,-54.31370207716452,0,19,8,2024,34,22
12,-54.31312966635555,-54.31312966635555,4,25,8,2024,34,10
14,-38.50500673600253,-38.50500673600253,4,24,8,2024,34,18
16,-51.1570263,-51.1570263,0,24,8,2024,34,18


# 4.0 Feature Split

In [10]:
# Assuming you already have a dataset where 'target' is your target variable
# and 'features' are your input features.

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset_fe.drop(columns=['PREDICT_WEATHER_CATEGORY']), 
                                                    dataset_fe['PREDICT_WEATHER_CATEGORY'], 
                                                    test_size=0.2, 
                                                    random_state=42)

# Concatenar X_train e y_train em um único DataFrame
train_data = pd.concat([y_train, X_train], axis=1)

# Concatenar X_test e y_test em um único DataFrame
test_data = pd.concat([y_test, X_test], axis=1)

# 5.0 Model Training

In [11]:
model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],}

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# Treinar o modelo com Grid Search
grid_search.fit(X_train, y_train)

# Exibir os melhores hiperparâmetros encontrados
print("Melhores hiperparâmetros encontrados: ", grid_search.best_params_)

# Fazer previsões com o melhor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Avaliar o modelo
accuracy = accuracy_score(y_test, y_pred)
print("Acurácia do modelo: ", accuracy)
print("Relatório de Classificação:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Melhores hiperparâmetros encontrados:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Acurácia do modelo:  0.75
Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           4       0.75      1.00      0.86         3

    accuracy                           0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 6.0 Model Save

In [12]:
bucket_name = 'weatherwizard-ml-model'
prefix = 'general-weatherwizard-ml-model/'
s3 = boto3.client('s3')

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
folders = [content['Prefix'].split('/')[-2] for content in response.get('CommonPrefixes', [])]

max_version = 0
for folder in folders:
    match = re.match(r'v(\d+)', folder)
    if match:
        version_number = int(match.group(1))
        if version_number > max_version:
            max_version = version_number

new_version = max_version + 1
new_folder_name = f'v{new_version}/'

s3.put_object(Bucket=bucket_name, Key=prefix + new_folder_name)

print(f'New folder: {new_folder_name}')

New folder: v2/


In [13]:
model_file_path = f'model_v{new_version}.joblib'
joblib.dump(best_model, model_file_path)
s3.upload_file(model_file_path, bucket_name, f'{prefix}{new_folder_name}{model_file_path}')
os.remove(model_file_path)

print(f'Model saved: s3://{bucket_name}/{prefix}{new_folder_name}{model_file_path}')

Model saved: s3://weatherwizard-ml-model/general-weatherwizard-ml-model/v2/model_v2.joblib


# 7.0 Load Model

In [14]:
bucket_name = 'weatherwizard-ml-model'
prefix = 'general-weatherwizard-ml-model/'
modelo_file_name = f'model_v{new_version}.joblib'
modelo_file_path = f'{prefix}v{new_version}/{modelo_file_name}'

local_model_path = f'/tmp/{modelo_file_name}'
s3.download_file(bucket_name, modelo_file_path, local_model_path)

model = joblib.load(local_model_path)

os.remove(local_model_path)
X_test['PREDICTION'] = model.predict(X_test)
X_test['PREDICTION'] = X_test['PREDICTION'].map(index_to_class)

X_test


Unnamed: 0,LONGITUDE,LONGITUDE.1,DAY,MONTH,YEAR,WEEK_OF_YEAR,HOUR,PREDICTION
1,-54.31367509090908,-54.31367509090908,7,9,2024,36,10,SUNNY
2,-53.9190463,-53.9190463,21,7,2024,29,16,SUNNY
14,-38.50500673600253,-38.50500673600253,24,8,2024,34,18,SUNNY
9,-38.3239011,-38.3239011,24,8,2024,34,18,SUNNY
