In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:

# Source CSV for the medical insurance cost dataset (fetched over HTTP on every run).
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv"

# Load the raw dataset into a DataFrame; later cells read from `data`.
data = pd.read_csv(DATA_URL)


In [7]:
# Preview the first rows of the raw frame (head() returns 5 rows by default).
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
# --- Model 1: baseline linear regression on the numerical features only ---

# Numerical predictors; these get standardized before fitting.
num_cols = ['age', 'bmi', 'children']

# Categorical predictors; not used in this cell, dummy-coded in a later one.
cat_cols = ['sex', 'smoker', 'region']

# Features are the numerical columns only; the target is the 'charges' column.
X, y = data[num_cols], data['charges']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so nothing from the test set leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Score the fitted model on the held-out split.
y_test_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

MSE: 131201335.64669806
R2 Score: 0.15489592484270753


In [6]:
# --- Model 2: linear regression on numerical + dummy-coded categorical features ---

# Same target as before; the feature frame now also carries the categorical
# columns, one-hot encoded with the first level of each dropped.
y = data['charges']
X = pd.get_dummies(data[num_cols + cat_cols], columns=cat_cols, drop_first=True)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize only the numerical columns, learning the statistics from the
# training split and reusing them for the test split.
scaler = StandardScaler().fit(X_train[num_cols])
X_train_scaled = scaler.transform(X_train[num_cols])
X_test_scaled = scaler.transform(X_test[num_cols])

# The scaler returns bare arrays; re-wrap them with the original row index so
# they line up with the dummy columns when concatenated.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=num_cols, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=num_cols, index=X_test.index)

# Everything that is not a numerical column is a dummy column (order preserved).
dummy_cols = [col for col in X_train.columns if col not in num_cols]

# Final design matrices: scaled numericals first, then the unscaled dummies.
X_train_final = pd.concat([X_train_scaled_df, X_train[dummy_cols]], axis=1)
X_test_final = pd.concat([X_test_scaled_df, X_test[dummy_cols]], axis=1)

# Fit ordinary least squares on the combined training features.
model = LinearRegression().fit(X_train_final, y_train)

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test_final)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

MSE: 33596915.85136148
R2 Score: 0.7835929767120722
