<a href="https://colab.research.google.com/github/BenWhann/ACME-Insurance-ML/blob/main/ACME_Insurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Problem Statememt
# QUESTION: ACME Insurance Inc. offers affordable health insurance to thousands of customer all over the United States. As the lead data scientist at ACME, you're tasked with creating an automated system to estimate
# the annual medical expenditure for new customers, using information such as their age, sex, BMI, children, smoking habits and region of residence.
# Estimates from your system will be used to determine the annual insurance premium (amount paid every month) offered to the customer. Due to regulatory requirements, you must be able to explain why your system outputs
# a certain prediction.
# You're given a CSV file containing verified historical data, consisting of the aforementioned information and the actual medical charges incurred by over 1300 customers.

# My objective is to find a way to estimate the value of the charges column using values in the other columns. If I can do so with historical data, then I should be able to with new customers too,
# simply by asking for information like their age, sex, BMI, number of kids, smoking habits, and region.

medical_charges_url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv'
from urllib.request import urlretrieve
urlretrieve(medical_charges_url, 'medical.csv')

!pip install pandas --quiet
!pip install plotly --quiet
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install scikit-learn --quiet
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

medical_df = pd.read_csv('medical.csv')
medical_df.info()
medical_df.describe()

# figAge = px.histogram(medical_df, x='age', marginal='box', nbins=47, title='Age Distribution')
# figAge.update_layout(bargap=0.1)
# figAge.show()
# There are over twice as many customers between ages 18-19 compared to the rest of the data.

# figBMI = px.histogram(medical_df, x='bmi', marginal='box', color_discrete_sequence=['red'], title='BMI Distribution')
# figBMI.update_layout(bargap=0.1)
# figBMI.show()
# BMI appears to be approximately normally distributed

# figCharges = px.histogram(medical_df, x='charges', marginal='box', color='smoker', color_discrete_sequence=['green', 'grey'], title='Charges Distribution (Smoker')
# figSmokerProportion = px.histogram(medical_df, x='smoker', color='sex', title='Smoker Proportion by Sex')
# figCharges.update_layout(bargap=0.1)
# figSmokerProportion.update_layout(bargap=0.1)
# figSmokerProportion.show()
# figCharges.show()
# medical_df.smoker.value_counts()
# Median annual charges for non smokers is 7345.405. Smokers spend far more annually (shocker). Median for smokers is just over 34k annually. Power law distribution.

# figChargesSex = px.histogram(medical_df, x='charges', marginal='box', color='sex', color_discrete_sequence=['pink', 'blue'], title='Charges Distribution (Sex)')
# figSexProportion = px.histogram(medical_df, x='sex',title='Proportion of Sex')
# figSexProportion.update_layout(bargap=0.1)
# figChargesSex.update_layout(bargap=0.1)
# figSexProportion.show()
# figChargesSex.show()
# Charges by sex

# figChargesRegion = px.histogram(medical_df, x='charges', marginal='box', color='region', color_discrete_sequence=['red', 'blue', 'green', "yellow"], title='Charges Distribution (Region)')
# figRegionProportion = px.histogram(medical_df, x='region',title='Proportion of Region')
# figRegionProportion.update_layout(bargap=0.1)
# figChargesRegion.update_layout(bargap=0.1)
# figRegionProportion.show()
# figChargesRegion.show()
# Charges by region

# figChildrenProportion = px.histogram(medical_df, x='children',title='Proportion of Children')
# figChildrenProportion.update_layout(bargap=0.1)
# figChildrenProportion.show()
# Children proportion

# figAgeCharges = px.scatter(medical_df, x='age', y='charges', color='smoker', opacity=0.8, hover_data=['sex'], title='Age vs Charges')
# figAgeCharges.show()
# Age vs Charges

# figBMICharges = px.scatter(medical_df, x='bmi', y='charges', color='smoker', opacity=0.8, hover_data=['sex'], title='BMI vs Charges')
# figBMICharges.show()
# BMI vs Charges

# figChildrenCharges = px.violin(medical_df, x='children', y='charges', title='Children vs Charges')
# #figChildrenCharges.show()
# Children vs Charges

# figSmokerCharges = px.violin(medical_df, x='smoker', y='charges', title='Smoker vs Charges')
# figSmokerCharges.show()
# Smoker vs Charges

# figRegionCharges = px.violin(medical_df, x='region', y='charges', title='Region vs Charges')
# figRegionCharges.show()
# Region vs Charges

# figSexCharges = px.violin(medical_df, x='sex', y='charges', title='Sex vs Charges')
# figSexCharges.show()
# Sex vs Charges

ageChargeCorr = medical_df.charges.corr(medical_df.age)
# Age vs Charges correlation

bmiChargeCorr = medical_df.charges.corr(medical_df.bmi)
# BMI vs Charges correlation

childrenChargeCorr = medical_df.charges.corr(medical_df.children)
# Children vs Charges correlation

print(f'Age vs Charges correlation: {ageChargeCorr}')
# weak positive corr
print(f'BMI vs Charges correlation: {bmiChargeCorr}')
# weak positive corr
print(f'Children vs Charges correlation: {childrenChargeCorr}')
# no correlation

# Converting categprical smoker data to numerical data
smoker_values = {'no': 0, 'yes': 1}
# provides a dictionary for the map function
smoker_numeric = medical_df.smoker.map(smoker_values)
# map function
smokerChargeCorr = medical_df.charges.corr(smoker_numeric)
# calculate correlation coefficient
print(f'Smoker vs Charges correlation: {smokerChargeCorr}')
# strong positive correlation

correlation_matrix = medical_df.corr(numeric_only=True)
print(correlation_matrix)

sns.heatmap(correlation_matrix, annot=True, cmap='Blues')
plt.title('Correlation Matrix for Medical Data')







<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
Age vs Charges correlation: 0.2990081933306476
BMI vs Charges correlation: 0.19834096883362895
Children vs Charges correlation: 0.06799822684790478
Smoker vs Charges correlation: 0.787251430498478
               age       bmi  children   charges
age       1.000000  0.109272  0.042469  0.299008
bmi       0.109272  1.000000  0.012759  0.198341
children  0.042469  0.012759  1.000000  0.067998
charges   0.299008  0.198341  0.067998  1.000000
