In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/insurance/insurance.csv


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the insurance dataset from the Kaggle input directory and preview the first rows
df = pd.read_csv(filepath_or_buffer="/kaggle/input/insurance/insurance.csv")
df.head(n=5)

In [None]:
# Print a summary of the DataFrame to understand the type of each variable
df.info()

In [None]:
# Descriptive statistics for the dataset's columns
df.describe()

In [None]:
# Which regions appear in the dataset, and how many non-null `age` entries each one has
df.groupby("region")["age"].count()

In [None]:
# Convert the string variables into numerical ones so they can be used by the model.
# `assign` returns a new frame, so the raw `df` is left untouched.
df_bin = df.assign(
    # in the sex column, a male gets 1 and a female gets 0
    sex_bin=df["sex"].map({"male": 1, "female": 0}),
    # in the smoker column, a yes is 1 and a no is 0
    smoker_bin=df["smoker"].map({"yes": 1, "no": 0}),
    # every region gets a number between 0 and 3 (there are 4 regions in total)
    region_bin=df["region"].map(
        {"northeast": 0, "northwest": 1, "southeast": 2, "southwest": 3}
    ),
)
df_bin.head()

In [None]:
# Remove the sex, smoker and region columns because their numerical versions replace them
df_bin = df_bin.drop(columns=["sex", "smoker", "region"])
df_bin.head()

In [None]:
# Reorder the variables so that the features come first and the target (charges) last
ordered_columns = [
    'age', 'bmi', 'children',
    'sex_bin', 'smoker_bin', 'region_bin',
    'charges',
]
df_bin = df_bin.loc[:, ordered_columns]
df_bin.head()

In [None]:
# Starting parameters and step size for gradient descent
w, b = 0, 0  # weight (slope) and bias (intercept)
L = 1e-4     # learning rate

In [None]:
# Fit one single-feature linear model, charges ~ w * x + b, for every candidate
# feature with batch gradient descent, then record the final cost of each model.
#
# The algorithm relies on these two equations, applied once per epoch:
#     w = w - L * derivative_J_w
#     b = b - L * derivative_J_b
# where the cost function is the mean squared error
#     J(w, b) = (1 / n) * sigma((yi - w * xi - b) ** 2), sigma ranging over all n rows.


def cost_function(w, b, df_bin, column):
    """Return the mean squared error of the line ``w * x + b`` for one feature.

    Parameters
    ----------
    w, b : float
        Weight (slope) and bias (intercept) of the line being evaluated.
    df_bin : pandas.DataFrame
        Frame containing the feature column and the ``"charges"`` target column.
    column : str
        Name of the feature column used as ``x``.

    Returns
    -------
    float
        ``(1 / n) * sum((y - w * x - b) ** 2)`` over all rows of ``df_bin``.
    """
    x = df_bin[column].to_numpy(dtype=float)
    y = df_bin["charges"].to_numpy(dtype=float)
    residuals = y - w * x - b
    return float(np.mean(residuals ** 2))


def gradient_descent(w, b, L, df_bin, column):
    """Perform a single batch gradient-descent step on the MSE cost.

    Parameters
    ----------
    w, b : float
        Current weight and bias.
    L : float
        Learning rate (step size).
    df_bin : pandas.DataFrame
        Frame containing the feature column and the ``"charges"`` target column.
    column : str
        Name of the feature column used as ``x``.

    Returns
    -------
    tuple of float
        The updated ``(w, b)`` after one step.
    """
    x = df_bin[column].to_numpy(dtype=float)
    y = df_bin["charges"].to_numpy(dtype=float)
    residuals = y - w * x - b

    # Partial derivatives of J with respect to w and b
    der_J_w = -2.0 * np.mean(x * residuals)
    der_J_b = -2.0 * np.mean(residuals)

    return float(w - L * der_J_w), float(b - L * der_J_b)


epochs = 10
feature_columns = ['age', 'bmi', 'children', 'sex_bin', 'smoker_bin', 'region_bin']

# Every feature must start from the same initial parameters; otherwise a fit would
# begin where the previous feature's fit ended and the costs would not be comparable.
# The module-level w and b are only read here, never overwritten, so re-running this
# cell gives the same result.
w_start, b_start = w, b

mse_dict = {}    # feature name -> final cost (MSE) of its fitted line
param_dict = {}  # feature name -> (weight, bias) of its fitted line

for column in feature_columns:
    w_fit, b_fit = w_start, b_start

    for _ in range(epochs):
        w_fit, b_fit = gradient_descent(w_fit, b_fit, L, df_bin, column)

    param_dict[column] = (w_fit, b_fit)

    # error between the dataset and the predicted values
    mse_dict[column] = cost_function(w_fit, b_fit, df_bin, column)

print(mse_dict)
print(param_dict)

In [None]:
# Pick the feature whose single-variable model reached the lowest cost; it is the
# best candidate for the final model.
# Non-finite costs (NaN / inf) are ignored so they can never be selected, and no
# particular key such as "age" needs to be present in mse_dict.
finite_costs = {name: cost for name, cost in mse_dict.items() if np.isfinite(cost)}
if not finite_costs:
    raise ValueError("No feature has a finite cost; cannot choose the best feature.")

# min() returns the first key holding the smallest value, so ties go to the
# feature that was inserted first.
feature = min(finite_costs, key=finite_costs.get)
print(f"The column with the lowest cost function is {feature}.")

In [None]:
# Visualizing the model: scatter of the selected feature against the charges, with
# the fitted regression line drawn on top.
# The feature values are read from df_bin rather than df: the encoded columns
# (sex_bin, smoker_bin, region_bin) only exist in df_bin, so df[feature] would raise
# a KeyError whenever one of them is the selected feature.
slope, intercept = param_dict[feature]
x_values = df_bin[feature]

fig, ax = plt.subplots()
ax.scatter(x_values, df_bin["charges"], color="black")

# A straight line is fully determined by its two end points
x_line = [x_values.min(), x_values.max()]
ax.plot(x_line, [slope * x + intercept for x in x_line], color="red")

ax.set_xlabel(feature)
ax.set_ylabel("Charges")
ax.set_title("Forecasting medical costs through Linear Regression")

plt.show()