# Explore here

In [None]:
# Your code here
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from pickle import dump

In [None]:
# Load the raw medical insurance dataset into a DataFrame.
raw_csv_path = '/workspace/machine-learning-streamlit/data/raw/medical_insurance_cost.csv'
medical_in = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first rows of the freshly loaded dataset.
medical_in.head()

In [None]:
# Summarize column dtypes and non-null counts.
medical_in.info()

In [None]:
# Univariate view of the categorical features: one histogram per column.
fig, axis = plt.subplots(nrows=1, ncols=3, figsize=(18, 7))
sex_ax, smoker_ax, region_ax = axis

sns.histplot(data=medical_in, x="sex", color='navy', ax=sex_ax)

sns.histplot(data=medical_in, x="smoker", color='green', ax=smoker_ax)
# Only the left-most panel keeps its y-axis label.
smoker_ax.set_ylabel(None)

# Region is drawn horizontally (categories on the y axis).
sns.histplot(data=medical_in, y="region", color='red', ax=region_ax)

plt.show()

In [None]:
# Univariate view of numeric features: histogram on the top row,
# boxplot of the same column directly below it.
fig, axis = plt.subplots(2, 3, figsize=(15, 7))

numeric_panels = [("age", 'navy'), ("children", 'green'), ("charges", 'red')]
for col_idx, (column, shade) in enumerate(numeric_panels):
    hist_ax = axis[0, col_idx]
    sns.histplot(ax=hist_ax, data=medical_in, x=column, color=shade)
    # The boxplot underneath already carries the x label.
    hist_ax.set_xlabel(None)
    if col_idx > 0:
        # Keep the y label only on the first column.
        hist_ax.set_ylabel(None)
    sns.boxplot(ax=axis[1, col_idx], data=medical_in, x=column, color=shade)

plt.tight_layout()
plt.show()

In [None]:
# List the available column names.
medical_in.columns

In [None]:
# Numeric vs. numeric: regression scatter on the top row and the matching
# 2x2 correlation matrix on the bottom row, one column per variable pair.
fig, axis = plt.subplots(2, 4, figsize=(13, 6))

# (x column, y column, colour, hide the scatter's y label?)
pair_specs = [
    ("age", "charges", 'green', False),
    ("bmi", "charges", 'green', True),
    ("age", "bmi", 'red', False),
    ("children", "charges", 'navy', False),
]

for col_idx, (x_col, y_col, shade, hide_ylabel) in enumerate(pair_specs):
    scatter_ax = axis[0, col_idx]
    sns.regplot(ax=scatter_ax, data=medical_in, x=x_col, y=y_col, color=shade)
    if hide_ylabel:
        scatter_ax.set_ylabel(None)

    pair_corr = medical_in[[x_col, y_col]].corr()
    sns.heatmap(pair_corr, annot=True, fmt=".2f", ax=axis[1, col_idx], cbar=False)

plt.tight_layout()
plt.show()

In [None]:
# Categorical vs. categorical: grouped count plots.
fig, axis = plt.subplots(1, 3, figsize=(12, 6))

# (x column, hue column, hide the y label?)
count_specs = [
    ("smoker", "sex", False),
    ("region", "smoker", True),
    ("region", "sex", True),
]

for panel, (x_col, hue_col, hide_ylabel) in zip(axis, count_specs):
    sns.countplot(ax=panel, data=medical_in, x=x_col, hue=hue_col)
    if hide_ylabel:
        panel.set_ylabel(None)

plt.tight_layout()
plt.show()

In [None]:
# Bar plots of charges against a categorical x column, split by a hue column.
fig, axis = plt.subplots(ncols=3, figsize=(14, 7))

# (x column, hue column) for each panel; y is always "charges".
bar_specs = [("sex", "smoker"), ("children", "smoker"), ("smoker", "region")]

for panel, (x_col, hue_col) in zip(axis, bar_specs):
    sns.barplot(ax=panel, data=medical_in, x=x_col, y="charges", hue=hue_col)

plt.tight_layout()
plt.show()

In [None]:
# Encode each categorical column as integer codes stored in "<column>_n".
for categorical_col in ("sex", "smoker", "region"):
    medical_in[f"{categorical_col}_n"] = pd.factorize(medical_in[categorical_col])[0]

medical_in.head()

In [None]:
def _labels_by_code(code_col, label_col):
    """Return a dict mapping each integer code in `code_col` to its label in `label_col`."""
    return medical_in.set_index(code_col)[label_col].to_dict()


# Lookup tables from the factorized codes back to the original labels.
smoker_described = _labels_by_code('smoker_n', 'smoker')
region_described = _labels_by_code('region_n', 'region')
sex_described = _labels_by_code('sex_n', 'sex')
region_described

In [None]:

# Correlation matrix across the numeric and factorized columns.

fig, axis = plt.subplots(figsize=(10, 6))

corr_columns = ['age', 'sex_n', 'smoker_n', 'region_n', 'charges', 'children', 'bmi']
corr_matrix = medical_in[corr_columns].corr()

# No `ax` is passed, so seaborn draws on the current axes (the one just created).
sns.heatmap(corr_matrix, annot=True, fmt=".2f", linecolor='white', linewidths=1)

plt.tight_layout()

plt.show()



In [None]:
# Relationship between the factorized smoker flag and charges:
# regression scatter on top, 2x2 correlation matrix below.
fig, axis = plt.subplots(2, 1, figsize = (7, 7))
sns.regplot(data = medical_in, x = "smoker_n", y = "charges", ax = axis[0])
sns.heatmap(medical_in[["charges", "smoker_n"]].corr(), annot = True, fmt = ".2f", ax = axis[1], cbar = False)

# Finish the figure like the other plotting cells do, so the cell renders the
# plot instead of echoing the Axes repr of the last expression.
plt.tight_layout()
plt.show()

In [None]:
# List the column names again (the frame now also holds the *_n encoded columns).
medical_in.columns

In [None]:
# One vertical boxplot per column to eyeball outliers.
fig, axis = plt.subplots(2, 4, figsize = (15, 10))

boxplot_columns = ["age", "sex_n", "bmi", "children", "smoker_n", "region_n", "charges"]

# `axis.flat` walks the grid row by row, so the columns fill the first row
# left to right and then the second row.
for panel, column in zip(axis.flat, boxplot_columns):
    sns.boxplot(ax = panel, data = medical_in, y = column)

# Seven plots on a 2x4 grid leave the last cell unused; hide its empty frame.
axis[1, 3].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Count missing values per column.
medical_in.isnull().sum()

In [None]:
# Preview the frame before building the modelling datasets.
medical_in.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

num_variables = ['age', 'sex_n', 'smoker_n', 'children', 'bmi', 'region_n']

# Build the feature matrix and target, then split into train and test sets.
X = medical_in[num_variables]
y = medical_in['charges']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# k equals the number of candidate features, so every column is retained.
selection_model = SelectKBest(f_regression, k=6).fit(X_train, y_train)
selected_columns = X_train.columns[selection_model.get_support()]


def _apply_selection(features):
    """Run `features` through the fitted selector and re-label the resulting columns."""
    return pd.DataFrame(selection_model.transform(features), columns=selected_columns)


X_train_sel = _apply_selection(X_train)
X_test_sel = _apply_selection(X_test)
X_train_sel.head()

In [None]:
# Re-attach the target to each selected-feature frame and persist it as CSV.
split_outputs = (
    (X_train_sel, y_train, '/workspace/machine-learning-streamlit/data/processed/clean_train.csv'),
    (X_test_sel, y_test, '/workspace/machine-learning-streamlit/data/processed/clean_test.csv'),
)

for features, target, csv_path in split_outputs:
    # `.values` drops the original index so rows align by position.
    features["charges"] = target.values
    features.to_csv(csv_path, index=False)

In [None]:
# Reload the processed splits from the same location the previous cell wrote
# them to. The original relative "../data/processed" paths only resolved to
# that location for one particular kernel working directory.
train_data = pd.read_csv('/workspace/machine-learning-streamlit/data/processed/clean_train.csv')
test_data = pd.read_csv('/workspace/machine-learning-streamlit/data/processed/clean_test.csv')

train_data.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()

# Fit the standardizer on the training features only, then apply the same
# transformation to both splits.
# NOTE(review): the model-fitting cell below trains on X_train, not on
# X_train_norm — confirm whether the scaled arrays are meant to be used.
scaler = StandardScaler().fit(X_train)

X_train_norm, X_test_norm = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Split the reloaded frames back into features and target.
X_train, y_train = train_data.drop(columns=["charges"]), train_data["charges"]
X_test, y_test = test_data.drop(columns=["charges"]), test_data["charges"]

# Re-apply the fitted selector to the reloaded features.
train_selected = selection_model.transform(X_train)
test_selected = selection_model.transform(X_test)
X_train_sel = pd.DataFrame(train_selected, columns=selected_columns)
X_test_sel = pd.DataFrame(test_selected, columns=selected_columns)

In [None]:
# Fit a linear regression on the training split; `fit` returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
model

In [None]:
# Report the fitted parameters. The coefficient label is derived from the
# actual number of coefficients instead of a hard-coded "(b1, b2)".
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b1..b{len(model.coef_)}): {model.coef_}")

In [None]:
# Predict charges for the test features and display the predictions.
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import math

# Evaluate the test-set predictions: root of the mean squared error and R^2.
test_mse = mean_squared_error(y_test, y_pred)
mse_sqrt = math.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)

# Print the results.
print(f"Raíz cuadrada del MSE: {mse_sqrt}")
print(f"R2 Score: {test_r2}")

In [None]:
# Display the actual test targets for comparison with the predictions.
y_test

In [None]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed; the original passed a bare open() handle that was never closed.
with open("/workspace/machine-learning-streamlit/models/LinearRegression_.sav", "wb") as model_file:
    dump(model, model_file)

In [None]:
# Persist the fitted scaler next to the model.
# NOTE(review): the model above was fit on X_train, not on the scaler's
# output — confirm how consumers of scaler.pkl are expected to use it.
scaler_path = '/workspace/machine-learning-streamlit/models/scaler.pkl'
with open(scaler_path, 'wb') as out_handle:
    pickle.dump(scaler, out_handle)