<a href="https://colab.research.google.com/github/Alissongrs/RegressaoLinear/blob/main/RNA_e_REGRESS%C3%83O_SAUDE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings("ignore")

# Bibliotecas para uso e visualização de dados
import numpy as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Imports para seleção de modelos
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Imports para preparação de dados
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Imports para auxiliar a visualização de dados descritos em várias variáveis
from sklearn.decomposition import PCA

# Import de modelos preditivos
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier

# Import de métricas
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("/content/insurance.csv")
df.sample(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1048,25,female,22.515,1,no,northwest,3594.17085
1018,54,female,35.815,3,no,northwest,12495.29085
507,21,male,23.75,2,no,northwest,3077.0955
843,57,female,29.81,0,yes,southeast,27533.9129
918,61,female,28.2,0,no,southwest,13041.921


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df.shape

(1338, 7)

In [5]:
df.nunique()

Unnamed: 0,0
age,47
sex,2
bmi,548
children,6
smoker,2
region,4
charges,1337


In [6]:
df.drop_duplicates(inplace=True)
df.shape

(1337, 7)

In [7]:
# Convertendo dados "string" para "números"
df["genero"] = df["sex"].apply( lambda x: 0 if x == "male" else 1 )
df["fumante"] = df["smoker"].apply( lambda x: 1 if x == "yes" else 0 )

# OneHotEncoder da coluna "region"
aux = pd.get_dummies(df["region"], drop_first=True)
df_final = pd.concat([df, aux], axis=1)

# Remover colunas que já foram processadas
df_final.drop(columns=["sex", "smoker", "region"], inplace=True)
df_final

Unnamed: 0,age,bmi,children,charges,genero,fumante,northwest,southeast,southwest
0,19,27.900,0,16884.92400,1,1,False,False,True
1,18,33.770,1,1725.55230,0,0,False,True,False
2,28,33.000,3,4449.46200,0,0,False,True,False
3,33,22.705,0,21984.47061,0,0,True,False,False
4,32,28.880,0,3866.85520,0,0,True,False,False
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,0,0,True,False,False
1334,18,31.920,0,2205.98080,1,0,False,False,False
1335,18,36.850,0,1629.83350,1,0,False,True,False
1336,21,25.800,0,2007.94500,1,0,False,False,True


In [8]:
# Separação de DADOS e LABEL
X = df_final.drop(columns=["charges"])
y = df_final["charges"]

# Dividindo dados para TREINO e TESTE
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=42
)

# Padronização dos dados
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [9]:
# Treinando o modelo
modelo = LinearRegression()
modelo.fit(X_train_scaled, y_train)

# Fazendo as predições
y_pred = modelo.predict(X_test_scaled)

In [10]:
def metricas(X_tr_scaled, y_tr, y_ts, y_pr, model, id_modelo):

 y_pr_tr = model.predict(X_tr_scaled)

 # Erro quadrático médio
 print(f"MSE do TREINO ({id_modelo}): ", mean_squared_error(y_tr, y_pr_tr))
 print(f"MSE do TESTE ({id_modelo}): ", mean_squared_error(y_ts, y_pr))

 # Erro absoluto médio
 print(f"MAE do TREINO ({id_modelo}): ", mean_absolute_error(y_tr, y_pr_tr))
 print(f"MAE do TESTE ({id_modelo}): ", mean_absolute_error(y_ts, y_pr))

 # R²
 print(f"R² do TREINO ({id_modelo}): ", r2_score(y_tr, y_pr_tr))
 print(f"R² do TESTE ({id_modelo}): ", r2_score(y_ts, y_pr))

In [11]:
# Metricas para regressão linear
metricas(X_train_scaled, y_train, y_test, y_pred, modelo, "Reg Linear")

MSE do TREINO (Reg Linear):  35812933.23617429
MSE do TESTE (Reg Linear):  38940169.922826335
MAE do TREINO (Reg Linear):  4165.830851177667
MAE do TESTE (Reg Linear):  4181.815956942305
R² do TREINO (Reg Linear):  0.7362637207024127
R² do TESTE (Reg Linear):  0.7724363518631285


In [13]:
# Treinando uma RNA padrão e realizando predições
modelo_rn = MLPRegressor(max_iter=1000)
modelo_rn.fit(X_train_scaled, y_train)
y_pred_rn = modelo_rn.predict(X_test_scaled)

# Metricas para RNA
metricas(X_train_scaled, y_train, y_test, y_pred_rn, modelo_rn, "RN")

MSE do TREINO (RN):  141257801.11118123
MSE do TESTE (RN):  173531443.61674082
MAE do TREINO (RN):  8175.820754362697
MAE do TESTE (RN):  8868.448883464867
R² do TREINO (RN):  -0.040261255372145976
R² do TESTE (RN):  -0.014105702521217012


In [14]:
# Otimização de hiperparâmetros para melhorar
# o resultado da predição via MLPRegressor

# Parâmetros a serem experimentados
parameters = {
  'hidden_layer_sizes': [(5,), (10,), (10, 5,), (20, 10,)],
  'activation': ['relu', 'logistic', 'tanh'],
  'solver': ['adam', 'lbfgs'],
  'alpha': [0.0001, 0.05],
  'learning_rate': ['constant', 'adaptive'],
}

# Construção e treino do grid
clf = GridSearchCV(modelo_rn, parameters, cv=5,
         refit=True, return_train_score=True)
clf.fit(X_train_scaled, y_train)

# Melhor modelo encontrado
print("Melhores parâmetros:")
print(clf.best_params_)

# Avaliação do melhor modelo encontrado
modelo_rn_otimizado = clf.best_estimator_
y_pred_rn_otimizado = modelo_rn_otimizado.predict(X_test_scaled)
metricas(X_train_scaled, y_train, y_test,
    y_pred_rn_otimizado, modelo_rn_otimizado, "RN OTIMIZADO")

Melhores parâmetros:
{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive', 'solver': 'lbfgs'}
MSE do TREINO (RN OTIMIZADO):  19792916.99880084
MSE do TESTE (RN OTIMIZADO):  26855524.608402465
MAE do TREINO (RN OTIMIZADO):  2730.983592076996
MAE do TESTE (RN OTIMIZADO):  3089.6106021994256
R² do TREINO (RN OTIMIZADO):  0.8542395214799966
R² do TESTE (RN OTIMIZADO):  0.8430581796476654
