### **The aim of this project is to build a model that predicts insurance expenses from the factors recorded in the dataset.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Location of the insurance CSV. The path sits under /content/drive, so this
# presumably expects Google Drive to be mounted in Colab -- confirm before running.
DATA_PATH = "/content/drive/MyDrive/Datasets/insurance.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [None]:
# Print the column names, dtypes and non-null counts to check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [None]:
# Count missing values in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [None]:
# Encode `sex` as a binary indicator: 1 for 'male', 0 for any other value.
# The dtype guard keeps this cell safe to re-run: without it, a second run would
# compare the already-encoded integers to 'male' and silently zero the column.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = (df['sex'] == 'male').astype('int64')

In [None]:
# Encode `smoker` as a binary indicator: 1 for 'yes', 0 for any other value.
# The dtype guard keeps this cell safe to re-run: without it, a second run would
# compare the already-encoded integers to 'yes' and silently zero the column.
if not pd.api.types.is_numeric_dtype(df['smoker']):
    df['smoker'] = (df['smoker'] == 'yes').astype('int64')

In [None]:
# Preview the frame to confirm that `sex` and `smoker` now hold 0/1 codes.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,southwest,16884.92
1,18,1,33.8,1,0,southeast,1725.55
2,28,1,33.0,3,0,southeast,4449.46
3,33,1,22.7,0,0,northwest,21984.47
4,32,1,28.9,0,0,northwest,3866.86


In [None]:
# Remove the `region` column, which is still a text column at this point, so
# only numeric columns remain for the correlation matrix and the regression.
# A non-inplace drop with errors='ignore' keeps the cell re-runnable: the
# original in-place drop raised KeyError on a second run because the column
# was already gone.
df = df.drop(columns=['region'], errors='ignore')

In [None]:
# Pairwise correlation matrix of the remaining columns (Pearson, the pandas default).
# In the recorded output, `smoker` has the strongest correlation with `expenses` (~0.79).
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,expenses
age,1.0,-0.020856,0.109341,0.042469,-0.025019,0.299008
sex,-0.020856,1.0,0.04638,0.017163,0.076185,0.057292
bmi,0.109341,0.04638,1.0,0.012645,0.003968,0.198576
children,0.042469,0.017163,0.012645,1.0,0.007673,0.067998
smoker,-0.025019,0.076185,0.003968,0.007673,1.0,0.787251
expenses,0.299008,0.057292,0.198576,0.067998,0.787251,1.0


In [None]:
# Separate the regression target from the predictor columns.
target_column = 'expenses'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Training-set R^2: for a regressor, `score` returns the coefficient of
# determination, not a classification accuracy.
model.score(X_train,y_train)

0.7411142616557147

In [None]:
# Test-set R^2: for a regressor, `score` returns the coefficient of
# determination, not a classification accuracy.
model.score(X_test,y_test)

0.7811077124524389

In [None]:
# Predict expenses for a single hand-written sample.
# Values must follow the column order of X: age, sex, bmi, children, smoker.
input_data = (18,1,32.5,2,1)

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The model was fitted on a DataFrame, so passing a bare NumPy array makes
# scikit-learn warn that "X does not have valid feature names" and silently
# relies on positional order.
input_frame = pd.DataFrame([input_data], columns=X.columns, dtype=np.float64)

prediction = model.predict(input_frame)
print(prediction)

[27658.8909274]




In [None]:
import pickle

In [None]:
# Persist the fitted model to disk so it can be reused without retraining.
filename = "trained_model.pkl"

# A context manager guarantees the handle is flushed and closed, even if
# pickling fails; the original left the file object from open() unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load the saved model back from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
# The context manager closes the handle; the original left it open.
with open('trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Check the reloaded model on the same sample used with the in-memory model.
# Values must follow the column order of X: age, sex, bmi, children, smoker.
input_data = (18,1,32.5,2,1)

# Wrap the sample in a one-row DataFrame that carries the training column names.
# The model was fitted on a DataFrame, so passing a bare NumPy array makes
# scikit-learn warn that "X does not have valid feature names" and silently
# relies on positional order.
input_frame = pd.DataFrame([input_data], columns=X.columns, dtype=np.float64)

prediction = loaded_model.predict(input_frame)
print(prediction)

[27658.8909274]


