In [None]:
import piplite
await piplite.install(['pandas'])
await piplite.install(['matplotlib'])
await piplite.install(['scipy'])
await piplite.install(['scikit-learn'])
await piplite.install(['seaborn'])
await piplite.install(['numpy'])

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
from pyodide.http import pyfetch

async def download(url, filename):
    """Fetch ``url`` with ``pyfetch`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to download.
        filename: Local path the downloaded bytes are written to (binary mode).

    Raises:
        OSError: If the response status is anything other than 200. Failing
            here keeps a bad download from surfacing later as a confusing
            missing-file error when the file is read.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise OSError(
            f"Download of {url!r} failed with HTTP status {response.status}"
        )
    # Read the whole body before opening the target so a failed transfer
    # does not leave an empty file behind.
    payload = await response.bytes()
    with open(filename, "wb") as f:
        f.write(payload)

In [None]:
filepath = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/medical_insurance_dataset.csv'
await download(filepath, "insurance.csv")
file_name="insurance.csv"
pd.read_csv(file_name)

In [None]:
# Load the CSV without a header row so every line is treated as data,
# then print the first ten records.
df = pd.read_csv(file_name, header=None)
print(df.head(n=10))

In [None]:
# Column labels for the header-less file, in column order.
headers = [
    "age",
    "gender",
    "bmi",
    "no_of_children",
    "smoker",
    "region",
    "charges",
]
df.columns = headers

In [None]:
# Turn every "?" placeholder into NaN so it is recognised as a missing value.
df = df.replace(to_replace="?", value=np.nan)

In [None]:
# df.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print().
df.info()

# Smoker is categorical, so fill missing entries with the most frequent value.
# The result is assigned back to the column: calling an inplace method on
# df["smoker"] is chained assignment, which is not guaranteed to update df.
is_smoker = df["smoker"].value_counts().idxmax()
df["smoker"] = df["smoker"].fillna(is_smoker)

# Age is numerical, so fill missing entries with the mean.
mean_age = df["age"].astype("float").mean(axis=0)
df["age"] = df["age"].fillna(mean_age)

# Convert both columns to integers now that no missing values remain.
df[["age", "smoker"]] = df[["age", "smoker"]].astype("int")
df.info()

In [None]:
# Unlike the other columns, "charges" has more than two decimal places;
# round it to 2 decimal places.
# Assign through the column label rather than a one-element list key: a
# Series assigned to df[[...]] is not treated as a single column by recent
# pandas versions.
df["charges"] = df["charges"].round(2)
print(df.head())

In [None]:
# Visualize the relationship between "bmi" and insurance "charges":
# scatter plot with a red regression line, y-axis starting at zero.
sns.regplot(data=df, x="bmi", y="charges", line_kws={"color": "red"})
plt.ylim(bottom=0)

In [None]:
# Box plot of charges grouped by the smoker flag, to compare the two groups.
sns.boxplot(data=df, x="smoker", y="charges")

In [None]:

# Pairwise correlation between all columns, used to spot relationships.
correlation_matrix = df.corr()
print(correlation_matrix)

In [None]:
# Simple linear regression: "smoker" as the single predictor of "charges".
X = df[["smoker"]]
Y = df[["charges"]]
lm = LinearRegression().fit(X, Y)
print(lm.score(X, Y))  # R^2 on the data the model was fitted to

In [None]:
# Multiple linear regression: every column except "charges" as a predictor.
predictor_columns = ["age", "gender", "bmi", "no_of_children", "smoker", "region"]
Z = df[predictor_columns]
# Refit the existing estimator on the wider feature set and print its R^2.
print(lm.fit(Z, Y).score(Z, Y))

In [None]:
# Pipeline: standardize the features, expand them into polynomial terms
# (without the bias column), then fit a linear regression.
Input = [
    ("scale", StandardScaler()),
    ("polynomial", PolynomialFeatures(include_bias=False)),
    ("model", LinearRegression()),
]
pipe = Pipeline(steps=Input)
Z = Z.astype(float)
ypipe = pipe.fit(Z, Y).predict(Z)
print(r2_score(Y, ypipe))  # R^2 of the pipeline on the data it was fitted to

In [None]:
# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    Z,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Ridge regression (alpha=0.1) adds regularization to limit overfitting.
# Fit on the training split, then score on the held-out test split.
RidgeModel = Ridge(alpha=0.1).fit(x_train, y_train)
Yhat = RidgeModel.predict(x_test)
print(r2_score(y_test, Yhat))  # R^2 on the test set

In [None]:
# Apply degree-2 polynomial features, then Ridge regression.
# The expansion is learned on the training split only and reused for the test split.
pr = PolynomialFeatures(degree=2).fit(x_train)
x_train_pr = pr.transform(x_train)
x_test_pr = pr.transform(x_test)
# This refits the existing RidgeModel instance, so from here on it expects
# the polynomial feature matrix as input.
y_hat = RidgeModel.fit(x_train_pr, y_train).predict(x_test_pr)
print(r2_score(y_test, y_hat))  # R^2 of the polynomial Ridge model on the test set