In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns


In [None]:

# Load the insurance dataset into `df`.
# A failed read is reported and then re-raised, so the notebook stops here
# instead of continuing and failing later with a NameError on `df`.
try:
    df=pd.read_csv('insurance.csv')
except Exception as e:
    # f-string so the actual exception text is shown, not the literal "{e}".
    print(f'Error : {e}')
    raise


In [None]:
# Global plot styling: seaborn's dark-grid theme, then matplotlib defaults for
# font size, figure size and figure background colour.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#000000',
})

In [None]:
# Summary statistics of the dataset (shown as the cell output).
df.describe()

In [None]:
# Histogram of age with a box plot in the margin; bars are separated by a small gap.
fig = px.histogram(
    df,
    x='age',
    nbins=47,
    marginal='box',
    title='Distribution of Age',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Histogram of BMI (red bars) with a box plot in the margin.
fig = px.histogram(
    df,
    x='bmi',
    marginal='box',
    title='BMI Distribution',
    color_discrete_sequence=['red'],
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Histogram of charges, coloured by the smoker column, with a box plot in the margin.
fig = px.histogram(
    df,
    x='charges',
    color='smoker',
    marginal='box',
    title='Annual Medical Charges',
    color_discrete_sequence=['green', 'grey'],
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Number of rows for each value of the smoker column.
df.smoker.value_counts()

In [None]:
# Smoker counts coloured by sex. The figure is the cell's last expression,
# so the notebook renders it without an explicit show() call.
px.histogram(df, x='smoker', color='sex', title='Gender based distribution')

In [None]:
# Charges against age, coloured by smoker status; sex is added to the hover tooltip.
fig = px.scatter(
    df,
    x='age',
    y='charges',
    color='smoker',
    hover_data=['sex'],
    opacity=0.8,
    title='Age vs Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Charges against BMI, coloured by smoker status; sex is added to the hover tooltip.
fig = px.scatter(
    df,
    x='bmi',
    y='charges',
    color='smoker',
    hover_data=['sex'],
    opacity=0.8,
    title='BMI and Charges',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Violin plot of charges for each value of the children column.
fig = px.violin(df, x='children', y='charges', title='Children and Charges')
fig.show()

In [None]:
# Correlation between charges and age.
df.charges.corr(df.age)

In [None]:
# Correlation between charges and BMI.
df.charges.corr(df.bmi)

In [None]:
# Map the smoker column to numbers ('yes' -> 1, 'no' -> 0) so it can be
# correlated with charges; the mapped Series is displayed.
s_values={'yes':1,'no':0}
s_num=df.smoker.map(s_values)
s_num

In [None]:
# Correlation between charges and the numeric smoker flag.
df.charges.corr(s_num)

In [None]:
# Keep only the rows whose smoker value is 'no'; used by the regression cells below.
non_smoker=df[df['smoker']=='no']


In [None]:
# Scatter of charges against age for the whole dataset.
# The DataFrame is passed as `data=` by keyword: seaborn releases before 0.12
# do not accept it as the first positional argument.
plt.title('Age vs Charges')
sns.scatterplot(
    data=df,
    x='age',
    y='charges',
    alpha=0.7,
    s=15,
)

In [None]:
# Linear regression fits the straight line that lies closest to all the data points.
# The model is y = w*x + b, where y is the variable that depends on x,
# w is the weight and b is the bias/intercept.
def estimate_charges(age,w,b):
    """Return the value of the line ``age * w + b``.

    ``age`` may be a single number or anything that supports ``*`` and ``+``
    with numbers (the notebook also passes a pandas Series).
    """
    weighted_age = age * w
    return weighted_age + b

In [None]:
# Hand-picked starting weight and bias for the line.
w=50
b=100

In [None]:
# Estimate for a single age of 30 with the parameters above.
estimate_charges(30,w,b)

In [None]:
# Ages of the non-smoker rows.
ages=non_smoker.age
ages

In [None]:
# Estimates for every non-smoker age with the current w and b.
estimated_charges=estimate_charges(ages,w,b)
estimated_charges

In [None]:
# Actual charges of the non-smoker rows, for comparison with the estimates.
non_smoker.charges

In [None]:
# Draw the estimated line on its own.
plt.plot(ages,estimated_charges,color='red')
plt.xlabel('age')
plt.ylabel('estimated_charges')
plt.show()

In [None]:
# Overlay the estimated line (red) on the actual charges (scatter).
target=non_smoker.charges
plt.plot(ages,estimated_charges,'r',alpha=0.90)
plt.scatter(ages,target,s=8,alpha=0.8)
plt.xlabel('Ages')
plt.ylabel('Charges')
plt.legend(['Estimate','Actual Charges'])

In [None]:
def try_paramters(w,b):
    """Plot the line ``age * w + b`` over the non-smokers' actual charges.

    Args:
        w: weight applied to age.
        b: bias/intercept added to the weighted age.
    """
    ages=non_smoker.age
    # Read the actual charges here instead of relying on a global `target`
    # that only exists if another cell has been run first.
    target=non_smoker.charges
    estimated_charges=estimate_charges(ages,w,b)

    plt.plot(ages,estimated_charges,'r',alpha=0.90)
    plt.scatter(ages,target,s=8,alpha=0.8)
    plt.xlabel('Ages')
    plt.ylabel('Charges')
    plt.legend(['Estimate','Actual Charges'])

In [None]:
# Try a few hand-picked (w, b) pairs and inspect the resulting plots.
try_paramters(60,200)

In [None]:
try_paramters(150,25)

In [None]:
try_paramters(400,-2000)

In [None]:
# Actual charges of the non-smoker rows.
target=non_smoker.charges
target

In [None]:
# Alias the module-level estimates as `predictions`.
# NOTE(review): when cells run in order this is the Series computed earlier with w=50, b=100.
predictions=estimated_charges
predictions 

In [None]:
def rmse(target,predictions):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments may be any array-like (list, tuple, NumPy array or pandas
    Series). They are converted to float arrays and compared element by
    element in positional order.

    Args:
        target: actual values.
        predictions: predicted values, one per target value.

    Returns:
        The square root of the mean of the squared differences.
    """
    # np.asarray lets plain Python sequences be subtracted as well.
    errors=np.asarray(target,dtype=float)-np.asarray(predictions,dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

In [None]:
# Reset the hand-picked weight and bias.
w=50
b=100

In [None]:
try_paramters(50,100)

In [None]:
# Actual charges and the estimates for the current w and b.
target=non_smoker.charges
predicted=estimate_charges(non_smoker.age,w,b)

In [None]:
rmse(target,predicted)
# RMSE is the root of the mean squared error, i.e. a typical size of the
# prediction error rather than the error of each element.
# NOTE(review): the author recorded a value of about $8461 here.

In [None]:
def try_paramters(w,b):
    """Plot the line ``age * w + b`` against the non-smoker charges and print its RMSE."""
    x_ages = non_smoker.age
    actual = non_smoker.charges
    fitted = estimate_charges(x_ages, w, b)

    # Line first, scatter second, so the legend labels match the order drawn.
    plt.plot(x_ages, fitted, 'r', alpha=0.90)
    plt.scatter(x_ages, actual, s=8, alpha=0.8)
    plt.xlabel('Ages')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual Charges'])

    print(f'THIS IS THE RMSE/LOSS VALUE : {rmse(actual, fitted)}')

In [None]:
# Another hand-picked (w, b) pair; this version also prints the RMSE.
try_paramters(300,-5000)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Unfitted linear regression model.
model=LinearRegression()

In [None]:
input=non_smoker[['age']] # WE USE DOUBLE BRACKETS CAUSE WE CAN FIT MULTIPLE PARAMETERS FOR LINE FITTING
target=non_smoker.charges

In [None]:
model.fit(input,target)

In [None]:
model.predict(np.array([[23],[37],[61]]))

In [None]:
predictions=model.predict(input)
predictions

In [None]:
# Actual charges, for comparison with the predictions.
target

In [None]:
# RMSE of the fitted model on the data it was fitted on.
rmse(target,predictions)

In [None]:
# Fitted weight(s), one per input column.
model.coef_

In [None]:
# Fitted bias/intercept.
model.intercept_

In [None]:
# Fit charges on age alone (non-smoker rows) and report the RMSE on the same rows.
inputs = non_smoker[['age']]
target = non_smoker['charges']

model = LinearRegression().fit(inputs, target)
predictions = model.predict(inputs)
loss = rmse(target, predictions)

print(f'Loss is {loss}')

In [None]:
# With two input columns the model has two weights, so it fits a plane rather than a line.
# Author's observation: adding BMI did not have a major impact on the loss.
inputs = non_smoker[['age', 'bmi']]
target = non_smoker['charges']

model = LinearRegression().fit(inputs, target)
predictions = model.predict(inputs)
loss = rmse(target, predictions)

print(f'Loss is {loss}')

In [None]:
# Fitted weights, one per input column.
model.coef_

In [None]:
# Fitted bias/intercept.
model.intercept_

In [None]:
# Correlation between charges and number of children among non-smokers.
non_smoker.charges.corr(non_smoker['children'])

In [None]:
# Fit charges on age, BMI and children (non-smoker rows) and report the RMSE on the same rows.
inputs = non_smoker[['age', 'bmi', 'children']]
target = non_smoker['charges']

model = LinearRegression().fit(inputs, target)
predictions = model.predict(inputs)
loss = rmse(target, predictions)

print(f'Loss is {loss}')

In [None]:
# Same three features, this time fitted on the whole dataset (smokers included).
inputs = df[['age', 'bmi', 'children']]
target = df['charges']

model = LinearRegression().fit(inputs, target)
predictions = model.predict(inputs)
loss = rmse(target, predictions)

print(f'Loss is {loss}')

In [None]:
# Bar plot of charges for each smoker category.
sns.barplot(data=df,x='smoker',y='charges')

In [None]:
# Add a numeric smoker column ('no' -> 0, 'yes' -> 1) so it can be used as a model input.
smoker_codes={'no':0,'yes':1}
df['smoker_codes']=df.smoker.map(smoker_codes)


In [None]:
# Display the frame with the new column.
df

In [None]:
# Correlation between charges and the numeric smoker column.
df.charges.corr(df.smoker_codes)

In [None]:
# Fit charges on the numeric features plus the encoded smoker column.
# The feature columns are selected from `df`; a bare nested list of column
# names is not a feature matrix and cannot be fitted.
inputs,target=df[['age','bmi','children','smoker_codes']],df['charges']

model=LinearRegression().fit(inputs,target)

predictions=model.predict(inputs)

loss=rmse(target,predictions)

loss

In [None]:
# Repeat the encoding used for the smoker column on the sex column.
# Author's note: the resulting correlation with charges is very low and negligible.

# The mapping has its own name so the smoker mapping `smoker_codes` is not overwritten.
sex_codes={'female':0,'male':1}
df['sex_codes']=df.sex.map(sex_codes)

In [None]:
from sklearn import preprocessing

# One-hot encoder fitted on the region column; `categories_` shows the
# category values the encoder found.
enc = preprocessing.OneHotEncoder().fit(df[['region']])
enc.categories_

In [None]:
# Encode the region column and convert the result to a dense array.
one_hot=enc.transform(df[['region']]).toarray()
one_hot

In [None]:
# Add one column per region category. The column names come from the encoder
# itself, so they always match the order of the columns in `one_hot`
# instead of relying on a hand-typed list.
df[enc.categories_[0].tolist()]=one_hot

df

In [None]:
# Fit charges on every prepared feature and show the RMSE on the same rows.
# The order of `inputs_cols` is the order of the model's input columns.
inputs_cols = [
    'age', 'bmi', 'children', 'smoker_codes',
    'northeast', 'northwest', 'sex_codes', 'southeast', 'southwest',
]

inputs = df[inputs_cols]
target = df['charges']

model = LinearRegression().fit(inputs, target)
predictions = model.predict(inputs)
loss = rmse(target, predictions)

loss


In [None]:
# Feature values of the row labelled 10, in model input order.
df[inputs_cols].loc[10]

In [None]:
# Predict charges for one hand-written row. The values are wrapped in a
# DataFrame labelled with `inputs_cols`, so each number is tied to its feature
# by name and scikit-learn does not warn about missing feature names.
# NOTE(review): in this column order the row sets sex_codes=1 and leaves all
# four region indicators at 0 — confirm that is the intended input.
model.predict(pd.DataFrame([[28,30,2,1,0,0,1,0,0]],columns=inputs_cols))