# Notebook de Teste

In [1]:
import pandas as pd
import os

In [2]:
DATA_DIR = "./Data/classificacao-datasets"
POLLUTION_DATASET_FILE_NAME = "updated_pollution_dataset.csv"

file_path = os.path.join(DATA_DIR, POLLUTION_DATASET_FILE_NAME)

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
else: 
     print("Arquivo não encontrado")

In [3]:
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,Good
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,Good


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature,5000.0,30.02902,6.720661,13.4,25.1,29.0,34.0,58.6
Humidity,5000.0,70.05612,15.863577,36.0,58.3,69.8,80.3,128.1
PM2.5,5000.0,20.14214,24.554546,0.0,4.6,12.0,26.1,295.0
PM10,5000.0,30.21836,27.349199,-0.2,12.3,21.7,38.1,315.8
NO2,5000.0,26.4121,8.895356,7.4,20.1,25.3,31.9,64.9
SO2,5000.0,10.01482,6.750303,-6.2,5.1,8.0,13.725,44.9
CO,5000.0,1.500354,0.546027,0.65,1.03,1.41,1.84,3.72
Proximity_to_Industrial_Areas,5000.0,8.4254,3.610944,2.5,5.4,7.9,11.1,25.8
Population_Density,5000.0,497.4238,152.754084,188.0,381.0,494.0,600.0,957.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   Air Quality                    5000 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB


## MLFLOW

In [6]:
import mlflow
import mlflow.sklearn

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [8]:
import dagshub
dagshub.init(repo_owner='AurelioGuilherme', repo_name='AmbienteDeDesenvolvimento', mlflow=True)

In [9]:
mlflow.set_experiment("pollution_dataset_experiment")

2024/12/23 03:40:40 INFO mlflow.tracking.fluent: Experiment with name 'pollution_dataset_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/06b563545b744518b4537255a760af09', creation_time=1734936040619, experiment_id='0', last_update_time=1734936040619, lifecycle_stage='active', name='pollution_dataset_experiment', tags={}>

In [10]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1:]

In [11]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [14]:
with mlflow.start_run():
    # Treinando o modelo
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculando métricas
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Registrando parâmetros, métricas e o modelo
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("random_state", 42)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Registrando o modelo
    mlflow.sklearn.log_model(model, "classification_rf_model",input_example=X_test)



🏃 View run bright-croc-627 at: https://dagshub.com/AurelioGuilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/0/runs/1e1779c77379478cae051430898af97c
🧪 View experiment at: https://dagshub.com/AurelioGuilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/0
