# I am going to use the Linear Regression project

In [46]:
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [47]:
df['sex_n'] = pd.factorize(df["sex"])[0]
df["smoker_n"] = pd.factorize(df["smoker"])[0]
df['region_n'] = pd.factorize(df["region"])[0]


In [48]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

num_variables = ['age','sex_n', 'smoker_n', 'children', 'bmi', 'region_n']


# Dividimos el conjuno de datos en entrenamiento (train) y pruebas (test)
X = df.drop('charges', axis = 1)[num_variables]
y = df['charges']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 12)

selection_model = SelectKBest(f_regression, k = 6)
selection_model.fit(X_train, y_train)

selected_columns = X_train.columns[selection_model.get_support()]
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = selected_columns)
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = selected_columns)
X_train_sel.head()


Unnamed: 0,age,sex_n,smoker_n,children,bmi,region_n
0,18.0,0.0,0.0,0.0,36.85,1.0
1,30.0,0.0,1.0,0.0,27.93,3.0
2,48.0,1.0,1.0,0.0,29.7,1.0
3,18.0,1.0,0.0,0.0,38.17,1.0
4,19.0,0.0,0.0,0.0,21.7,0.0


In [49]:
X_train_sel.to_csv("/workspaces/streamlit/data/raw/medical_train_X.csv", index=False)
with open("/workspaces/streamlit/data/raw/medical_train_y.txt", "w") as f:
    f.write(y_train.to_string(index=False))
X_test_sel.to_csv("/workspaces/streamlit/data/raw/medical_test_X.csv", index=False)
with open("/workspaces/streamlit/data/raw/medical_test_y.txt", "w") as f:
    f.write(y_test.to_string(index=False))


In [50]:
from sklearn.preprocessing import MinMaxScaler

scaler = StandardScaler()

scaler.fit(X_train)

X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [51]:
model = LinearRegression()
model.fit(X_train_sel, y_train)

In [52]:
print(f"Intercep (a): {model.intercept_}")
print(f"Coefficients (b1, b2): {model.coef_}")

Intercep (a): 10892.731940032498
Coefficients (b1, b2): [   262.26352431    -58.25976836 -23840.06610383    407.98188941
    336.41344227    201.71936286]


In [53]:
y_pred = model.predict(X_test_sel)
y_pred

array([ 9958.37671059,  5929.3008168 , 30722.21722942,  6076.52947368,
        1636.85073739,  1987.40954363,  2904.94825198, 17669.79291291,
        9041.21729922, 32851.7112547 ,  9395.70001923, 11289.37439637,
        9059.77080294,  3333.25526849,  4557.056335  ,  3059.08463449,
        7171.58513028, 12117.53074454,  3532.35504066, 11906.94020978,
       10081.57288092,  5491.13692331, 13314.26489975, 11928.59105559,
       28365.24912734,   345.08482203, 13722.51274797, 13291.77430449,
        7477.16673699, 12664.0533694 ,  8140.75549852,  4599.08954089,
       32837.83579473, 35967.93775839,  9036.29090184,  5665.72528806,
        6986.9758359 ,  9748.77091036,  7543.89066157, 39993.1799766 ,
        3139.20442027,  5876.76207194, 14710.4343623 , 32627.76915867,
        2698.03944568,  8162.58867656,  9971.46746665, 31856.35888687,
       16500.41128016,  9749.02403422, 13285.44429609, 33628.62598717,
        8303.08361398, 34228.11041481, 16360.95198469,  2102.19299155,
      

In [54]:
from sklearn.metrics import mean_squared_error, r2_score
import math

mse_sqrt = math.sqrt(mean_squared_error(y_test, y_pred))


print(f"Raíz cuadrada del MSE: {mse_sqrt}")
print(f"R2 Score: {r2_score(y_test, y_pred)}")

Raíz cuadrada del MSE: 6466.485779974646
R2 Score: 0.709638040527346


In [55]:
from pickle import dump

dump(model, open("/workspaces/streamlit/models/LinearRegression.sav", "wb"))


In [56]:
import pickle 

dump(model, open("/workspaces/streamlit/models/LinearRegression.sav", "wb"))

with open('/workspaces/streamlit/models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)