# Previsão de Custos Médicos com Regressão Linear
Projeto de pós-graduação - Tech Challenge Fase 1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.api as sm
import numpy as np

## 1. Carregamento da Base de Dados

In [None]:
# Load the dataset from a CSV file resolved relative to the working directory.
df = pd.read_csv('simulated_insurance_100k.csv')
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

## 2. Exploração dos Dados

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns; displayed as the cell output.
df.describe()

In [None]:
# Distribution of the target column: 30-bin histogram with a KDE curve on top.
plt.figure(figsize=(8, 4))
hist_ax = sns.histplot(df['charges'], bins=30, kde=True)
# Title and axis labels are set on the Axes object returned by seaborn.
hist_ax.set(
    title='Distribuição dos custos médicos (charges)',
    xlabel='charges',
    ylabel='Frequência',
)
plt.tight_layout()
plt.show()

## 3. Pré-processamento

In [None]:
# Categorical feature columns that must be turned into numbers before fitting.
CATEGORICAL_COLS = ['sex', 'smoker', 'region']

# Integer-encoded copy of the frame. The fitted encoders are kept so the
# original category labels can be recovered with inverse_transform.
data = df.copy()
label_encoders = {}
for col in CATEGORICAL_COLS:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# The design matrix is built with one-hot (dummy) columns instead of the
# integer codes above: feeding codes such as 0..3 for `region` to a linear
# model would impose an arbitrary order and equal spacing between categories.
# drop_first=True removes one level per feature so the dummies are not
# collinear with the intercept; dtype=float keeps the matrix purely numeric
# for both scikit-learn and statsmodels.
X = pd.get_dummies(
    df.drop(columns='charges'),
    columns=CATEGORICAL_COLS,
    drop_first=True,
    dtype=float,
)
y = data['charges']

# 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 4. Modelagem com Regressão Linear

In [None]:
# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the held-out test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

## 5. Avaliação do Modelo

In [None]:
# Score the test-set predictions with the three regression metrics, in the
# order MSE, MAE, R².
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
# One line per metric: errors with two decimals, R² with four.
print(f'MSE: {mse:.2f}', f'MAE: {mae:.2f}', f'R²: {r2:.4f}', sep='\n')

In [None]:
# Refit the regression with statsmodels to get its inferential summary table.
# sm.OLS does not include an intercept on its own, so a constant column is
# added to the training features first.
X_train_sm = sm.add_constant(X_train)
ols_model = sm.OLS(endog=y_train, exog=X_train_sm).fit()
# Last expression of the cell: renders the regression summary.
ols_model.summary()

## 6. Visualização: Reais vs Previstos

In [None]:
# Actual vs. predicted values on the test split. Points on the dashed red
# diagonal are predictions that match the actual value exactly.
plt.figure(figsize=(8, 5))
scatter_ax = sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
# The diagonal spans the range of the actual values on both axes.
diag_ends = [y_test.min(), y_test.max()]
scatter_ax.plot(diag_ends, diag_ends, 'r--')
scatter_ax.set(
    title='Valores reais vs Previstos',
    xlabel='Valores reais',
    ylabel='Valores previstos',
)
plt.tight_layout()
plt.show()