# Código Python - Trabalho 2

## *Imports* estáticos

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

## 4.1  Análise Exploratória de Dados
### 4.1.1   Leitura do ficheiro

In [3]:
dados = pd.read_csv('../../dados/AIRPOL_data.csv', delimiter=";", header=0, decimal=',')
dados = dados.drop(columns=['Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']).rename(columns={"Value":"Premature_Deaths"})
#dados = dados[dados.Country != 'All Countries']
#dados.sort_values(by=['Value'])

### 4.3.1 Novo atributo "RespDisease"

Este código foi inspirado por esta [thread](https://stackoverflow.com/questions/67420811/a-new-column-in-pandas-which-value-depends-on-other-columns) no stack**overflow**

In [14]:
goalAttrib = 'RespDisease'

def calc_resp_disease(row):
    respDiseases = ['Asthma', 'Chronic obstructive pulmonary disease']
    if row['Outcome'] in respDiseases:
        return 1
    else:
        return 0

dados[goalAttrib] = dados.apply(calc_resp_disease, axis=1)

dados

Unnamed: 0,Country,NUTS_Code,Air_Pollutant,Outcome,Affected_Population,Populated_Area[km2],Air_Pollution_Average[ug/m3],Premature_Deaths,RespDisease
0,Albania,AL,NO2,Asthma,2337443.0,11299.0,5.5,103.0,1
1,Albania,AL,PM2.5,Asthma,456076.0,11299.0,11.3,231.0,1
2,Albania,AL0,NO2,Asthma,2337443.0,11299.0,5.5,103.0,1
3,Albania,AL0,PM2.5,Asthma,456076.0,11299.0,11.3,231.0,1
4,Albania,AL03,PM2.5,Asthma,149508.0,4041.5,11.3,69.0,1
...,...,...,...,...,...,...,...,...,...
49135,Turkey,TR822,NO2,Stroke,213615.0,7473.5,9.0,3.0,0
49136,Turkey,TRB12,NO2,Stroke,447268.0,8860.4,12.4,291.0,0
49137,Turkey,TRB14,NO2,Stroke,76591.0,5506.6,12.0,17.0,0
49138,Turkey,TRB21,NO2,Stroke,696344.0,17511.1,11.2,25.0,0


### 4.3.2 K-Fold cross validation

#### Preparação dos valores

In [20]:
features = list(dados.columns[0:9])
#print(features)
numericFeatures = features[4:]
#print(numericFeatures)
scaler = MinMaxScaler()

X = dados[numericFeatures].drop(columns=[goalAttrib])
y = dados[goalAttrib]

#### Otimização dos parâmetros da Árvore de regressão

In [21]:
tree_params = {
    'dtr__max_depth': [4, 6, 8, None],
    'dtr__min_samples_split': [2, 5, 10],
    'dtr__min_samples_leaf': [1, 2, 4]
}
tree_pipeline = Pipeline([
    ('scaler', scaler),
    ('dtr', DecisionTreeRegressor(random_state=42))
])
tree_grid = GridSearchCV(tree_pipeline, tree_params, cv=3,
                         scoring='neg_mean_squared_error', n_jobs=-1)
tree_grid.fit(X, y)

#----------------------------

#plt.figure(figsize=(20, 12))  

#plot_tree(
#    tree_grid.best_estimator_.named_steps['dtr'],
#    feature_names=X.columns,
#    filled=True,
#    rounded=True,
#    fontsize=12,
#    proportion=False,
#    impurity=False,
#    precision=2
#)

#plt.title("Árvore de Regressão Otimizada", fontsize=16)
#plt.tight_layout()
#plt.show()

#### Otimização do kernel SVM

In [None]:
svm_pipeline = Pipeline([
    ('scaler', scaler),
    ('svm', SVR())
])

svm_params = {
    'svm__kernel': ['rbf'],
    'svm__C': [1, 10],
    'svm__epsilon': [0.1, 0.2]
}
# ?????????????????????????????????????
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=5,
                        scoring='neg_mean_squared_error', n_jobs=-1)
svm_grid.fit(X, y)

: 

#### Otimização da configuração da rede neuronal

In [None]:
mlp_params = {
    'mlp__hidden_layer_sizes': [(50,), (100,)],
    'mlp__activation': ['relu'],
    'mlp__solver': ['adam'],
    'mlp__learning_rate': ['constant']
}
mlp_pipeline = Pipeline([
    ('scaler', scaler),
    ('mlp', MLPRegressor(max_iter=5000, early_stopping=True, random_state=42))
])
mlp_grid = GridSearchCV(mlp_pipeline, mlp_params, cv=3,
                        scoring='neg_mean_squared_error', n_jobs=-1)
mlp_grid.fit(X, y)

#### k-fold cross validation

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

models = []
models.append(('rgr', Pipeline([
    ('scaler', scaler),
    ('lr', LinearRegression())
])))
models.append(('dtr', tree_grid.best_estimator_))
models.append(('net', mlp_grid.best_estimator_))
models.append(('svm', svm_grid.best_estimator_))

lstresults = []

for name, model in models:
    cv_scores = cross_val_score(model, X, y, cv=kfold,
                                scoring='neg_mean_squared_error', n_jobs=-1)
    rmse = (-cv_scores) ** 0.5
    lstresults.append(pd.Series({
        'model': name,
        'mean_RMSE': round(np.mean(rmse), 3),
        'std_RMSE': round(np.std(rmse), 3)
    }))

resdf = pd.DataFrame(lstresults)
resdf

Unnamed: 0,model,mean_RMSE,std_RMSE
0,rgr,1973.718,352.331
1,dtr,1892.906,291.569
2,net,1974.829,433.163
3,svm,2289.983,511.069
