# Building Linear Regression Model to Predict Premium Charges

* Importing Libraries
* Import the dataset
* Exploratory Data Analysis
* Data Visualization
* Feature Engineering/Selection
* Model Building


Importing libraries 


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Importing the Dataset

In [None]:
# Read the raw records from 'insurance.csv'. The path is relative, so the file
# must sit in the kernel's working directory.
insurance = pd.read_csv('insurance.csv')
# A bare name as the last expression of a cell renders the frame as the cell output.
insurance


EDA (Exploratory Data Analysis): inspecting the data reveals the following information.

In [None]:
# The string literal below is evaluated and discarded; the author uses it as a
# block comment to record findings from the exploratory pass.
# NOTE(review): the counts quoted in it come from a past run - re-check them
# against the outputs of the calls that follow.
'''
1. There are 1138 records and 13 columns, 10 are of float datatype, and 3 are of object datatype
2. past_consultations	num_of_steps	Hospital_expenditure	NUmber_of_past_hospitalizations	Anual_Salary	 are having
high correlation in lowest to highest order.
3. There are 52 null values in entire dataset

'''
# NOTE(review): Jupyter only auto-displays the LAST expression of a cell. If
# these calls share one cell, only info() (which prints) and corr() are shown.
# First ten rows of the frame.
insurance.head(10)
# (rows, columns) tuple.
insurance.shape
# Per-column dtypes.
insurance.dtypes
# info() prints column names, non-null counts and dtypes to stdout itself.
insurance.info()
# Null count per column. isna() is an alias of isnull(), so both halves of this
# tuple hold the same counts.
insurance.isnull().sum(), insurance.isna().sum()
# Pairwise correlation restricted to the numeric columns.
insurance.corr(numeric_only = True)



#Finding the percentage of null values
# The string literal below is evaluated and discarded; it records the author's
# rationale for dropping (rather than imputing) the missing values.
'''
Since the null values present in dataset for each column is less than 1% for almost all columns, we will drop the null values
because it will not impact or effect the dataset.
We drop null values : 1. When dataset is huge | 2. When there are little null values
We will fill the null values : 1. When the dataset is small | 2. When there are more null values
'''
# Share of missing values per column, as a percentage of all rows.
print(insurance.isnull().sum()/len(insurance)*100)
# Drop every row that has at least one null. The name is rebound instead of
# using dropna(inplace=True), so the frame object that was loaded and displayed
# earlier is not mutated behind the reader's back.
insurance = insurance.dropna()







# Remove outliers from the Claim_Amount column using 1.5*IQR fences.
# The same steps can be repeated for other columns, or automated with a loop.
q1 = insurance['Claim_Amount'].quantile(0.25)
q3 = insurance['Claim_Amount'].quantile(0.75)
# Interquartile range. This was referenced but never defined, which raised a
# NameError on a fresh kernel.
IQR = q3 - q1
UF = q3 + 1.5*IQR  # upper fence
LF = q1 - 1.5*IQR  # lower fence

# Keep the outlying rows under names so they can be inspected after the filter.
upper_outliers = insurance[insurance['Claim_Amount'] > UF]  # values above the upper fence
lower_outliers = insurance[insurance['Claim_Amount'] < LF]  # values below the lower fence
print(f'Claim_Amount outliers: {len(upper_outliers)} above UF, {len(lower_outliers)} below LF')

# between() is inclusive on both ends by default, matching (>= LF) & (<= UF).
# .copy() makes the result an independent frame, so later column assignments do
# not trigger SettingWithCopyWarning.
insurance = insurance[insurance['Claim_Amount'].between(LF, UF)].copy()


Data Visualization

In [None]:
# Count plots for the five discrete columns, one panel per column.
countplot_columns = ['children', 'smoker', 'region', 'sex','NUmber_of_past_hospitalizations']

# Use the Axes returned by plt.subplots directly instead of re-selecting each
# slot with plt.subplot, and drive the panels from countplot_columns.
fig, axes = plt.subplots(2, 3, figsize=(15, 5))
panels = axes.flatten()
for ax, column in zip(panels, countplot_columns):
    sns.countplot(data=insurance, x=column, ax=ax)

# The 2x3 grid has six slots but only five columns are plotted; hide the rest
# so no empty frame is drawn.
for ax in panels[len(countplot_columns):]:
    ax.set_visible(False)

fig.tight_layout()  # keep tick labels of the top row clear of the bottom row
plt.show()








# Histogram of the 'age' column.
# Author's observation from a past run: ages around 20-23 are the most frequent.
sns.displot(data=insurance, x='age')
plt.show()










# Histogram of the 'bmi' column.
# Author's observation from a past run: the shape is roughly normal, centred
# near a BMI of 30, with frequency falling off evenly on both sides of the mean.
sns.displot(data=insurance, x='bmi')
plt.show()









Feature Selection/Engineering

In [None]:
# Label encoding: convert object-typed (categorical) columns into integer codes
# so they can be plotted numerically and used as model features.
# LabelEncoder numbers the sorted distinct values from 0, e.g. a flowers column
# with lilly, rose and sunflower becomes lilly = 0, rose = 1, sunflower = 2.
# NOTE(review): integer codes impose an order on nominal columns such as
# 'region'; consider one-hot encoding for a linear model.
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['smoker', 'region', 'sex']

# One fitted encoder per column. Re-fitting a single shared encoder loses the
# mappings of the earlier columns; keeping each one lets the codes be decoded
# later with label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    insurance[column] = le.fit_transform(insurance[column])
    label_encoders[column] = le

# No features are dropped from the dataset here; the three categorical columns
# are only replaced by their integer codes.




Model Building

In [None]:
# Split the independent variables (x) from the dependent variable (y).
x = insurance.drop(columns='charges')
# Select the target by name rather than by position (iloc[:, -1]), so x and y
# stay consistent even if the column order of the frame changes.
y = insurance['charges']

        


    
    
# Hold-out split: 80% of the rows for training, the remainder for testing.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

split = train_test_split(x, y, train_size = 0.8, random_state = 0)
x_train, x_test, y_train, y_test = split







# Build and train an ordinary least-squares linear regression model.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(x_train, y_train)
# Predictions for the held-out rows.
y_pred = lr.predict(x_test)









# Evaluate the model on the held-out split with the R2 score.
# Import only the metric that is used; a wildcard import would pull every public
# name of sklearn.metrics into the notebook namespace and hide where names come from.
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
# Printed explicitly so the score is visible even when this is not the last
# expression of its cell.
print(f'R2 score on the test split: {r2:.5f}')
# The author reported an R2 of 0.99228 from a past run - re-check against the
# value printed above. To sanity-check the score, the actual and predicted
# values are plotted with a best fit line below.






# Table of observed vs. predicted values for the test rows. The row index comes
# from y_test; y_pred is matched to it by position.
best_fit_line = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Residual of each prediction: actual minus predicted.
best_fit_line['Error'] = best_fit_line['Actual'] - best_fit_line['Predicted']
print(f'The actual and predicted values along with error/residual:\n{best_fit_line}')



# Scatter of actual vs. predicted values, overlaid with a degree-1 least-squares
# line fitted through the points.
fig, ax = plt.subplots(figsize=(10,5))

# np.polyfit with degree 1 returns the coefficients highest power first: (slope, intercept).
slope, intercept = np.polyfit(best_fit_line['Actual'], best_fit_line['Predicted'],1)
best_fit_line1 = slope*best_fit_line['Actual'] + intercept

sns.scatterplot(data=best_fit_line, x='Actual', y='Predicted', ax=ax)
ax.plot(best_fit_line['Actual'], best_fit_line1, color = 'black')
ax.set_xlabel('True_Values')
ax.set_ylabel('Predicted_Values')
plt.show()