In [1]:
# Importing library
import altair as alt
print(alt.__version__)

alt.data_transformers.disable_max_rows()

5.0.1


DataTransformerRegistry.enable('default')

### **Data Preparation**

In [2]:
import pandas as pd

# load dataset
df = pd.read_csv('insurance.csv')
df.head()
print(f'Shape of the data: {df.shape}') 

Shape of the data: (1338, 7)


In [3]:
# check duplicate value
df[df.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [4]:
# drop duplicate data
df.drop_duplicates(keep='first', inplace=True)

In [5]:
# classifies into 5 groups based on BMI
def convert(bmi):
    category = ''
    if bmi > 35.0:
        category = 'extremely obese'
    elif 30.0 < bmi < 34.9:
        category = 'obese'
    elif 25.0 < bmi < 29.9:
        category = 'overweight'
    elif 18.5 < bmi < 24.9:
        category = 'normal'
    else:
        category = 'underweight'
    return category

df = df.assign(category=[convert(x) for x in df.bmi])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,category
0,19,female,27.9,0,yes,southwest,16884.924,overweight
1,18,male,33.77,1,no,southeast,1725.5523,obese
2,28,male,33.0,3,no,southeast,4449.462,obese
3,33,male,22.705,0,no,northwest,21984.47061,normal
4,32,male,28.88,0,no,northwest,3866.8552,overweight


From this data skimming we know some important info :
- the data comprises of **1388 rows** and **8 columns**
- the data comprises of **4 qualitative data**: Sex (nominal), Smoker (nominal), Region (nominal), and category (ordinal)
- the data comprises of **4 quantitative data**: Age (continue), BMI (continue), Children (discreate), and Charges (discreate)
- the data **already clean** and ready for analysis

In [6]:
chart = alt.Chart(data=df)

### **Analysis of Charges Variable**

**1. How the distribution is characterized?**

In [7]:
# Hitung median dan MAD
median = df.charges.median()
mad = abs(df["charges"] - df.charges.median()).median()
print(f'Lower {median-mad} and Upper {median+mad}')

# Hitung persentase pasien pada rentang tersebut
mask = (df.charges > median-mad) & (df.charges < median+mad)
print(f'Pct {len(df[mask])/len(df)*100}%')

Lower 4377.7647 and Upper 14394.5579
Pct 49.962602842184%


In [8]:
# Membuat histogram
chart.mark_bar().encode(
    x=alt.X("charges", title='Medical Costs').bin(maxbins=50).axis(format='s'),
    y=alt.Y('count()', title='Frequency'),
    tooltip='count()',
    color=alt.Color('count()').scale(scheme="lightgreyred",)
).properties(
    title=alt.Title(
        "Medical Bills for 50% of Patients Ranged from 4000 to 14000 USD",
        subtitle='Distribution of Medical Cost (in dollars)',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=800, height=250,
)

**2. Is there a difference in the distribution of health bills between smokers and non-smokers?**

In [9]:
# Hitung median charges
source = df.pivot_table(index='smoker', values='charges', aggfunc='median')

# Hitung rasio bill
source.loc['yes', 'charges']/source.loc['no', 'charges']

4.6906657879153855

In [10]:
# Set color scale
color_scale = alt.Scale(
        domain=['yes', 'no'], 
        range=['#CE2029', 'grey'])

# Membuat boxplot
chart.mark_boxplot().encode(
    x=alt.X("charges", title='Medical Costs'),
    y=alt.Y('smoker').axis(None),
    color=alt.Color('smoker', scale=color_scale),
).properties(
    title=alt.Title(
        "Medical Bills of Smoker are 5x Higher than Non-smokers",
        subtitle='Distribution of Medical Cost (in dollars) by Smoker Statue',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=600, height=150
)

### **Analysis of BMI Variable**

**1. Which BMI category has the highest percentage of users?**

In [11]:
# Menghitung presentase tiap grup BMI
source = df.category.value_counts(normalize=True).to_frame().reset_index(names='predicate')
display(source)

Unnamed: 0,predicate,category
0,obese,0.28721
1,overweight,0.278235
2,extremely obese,0.23635
3,normal,0.165295
4,underweight,0.032909


In [12]:
# Set color scale
color_scale = alt.Scale(
        domain=['obese', 'overweight', 'extremely obese', 'normal', 'underweight'], 
        range=['#CE2029', '#CE2029', 'grey', 'grey', 'grey'])

# Menampilkan bar chart
alt.Chart(source).mark_bar().encode(
    x=alt.X('category', title='Percent of Total').axis(format='%'),
    y=alt.Y('predicate', title=None, sort='-x'),
    tooltip=alt.Tooltip('category', format='.2%'),
    color=alt.Color('predicate', scale=color_scale, legend=None),
).properties(
    title=alt.Title(
        "55% of Patients Indicated Obesity and Overweight",
        subtitle='Patient Percentage (%) by BMI Category',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=400, height=200,
)

**2. What is the ratio (in percentage) of the number of users between male and female who fall into the normal category?**

In [13]:
# Menghitung presentase tiap grup BMI per gender
source = pd.crosstab(index=df.sex, columns=df.category, normalize='all').stack().to_frame().reset_index()
source.rename(columns={0: 'pct'}, inplace=True)
display(source)

Unnamed: 0,sex,category,pct
0,female,extremely obese,0.109948
1,female,normal,0.086013
2,female,obese,0.138369
3,female,overweight,0.140613
4,female,underweight,0.020194
5,male,extremely obese,0.126402
6,male,normal,0.079282
7,male,obese,0.148841
8,male,overweight,0.137622
9,male,underweight,0.012715


In [14]:
# Set color scale
color_scale = alt.Scale(
        domain=['female', 'male'], 
        range=['#CE2029', 'grey'])

# Membuat Stacked Bar
alt.Chart(source).mark_bar().encode(
    x=alt.X('pct', title='Percent of Total').axis(format='%'),
    y=alt.Y('category', title=None),
    tooltip=alt.Tooltip('pct', format='.2%'),
    color=alt.Color('sex', scale=color_scale)
).properties(
    title=alt.Title(
        "Female Patients with Normal BMI were 0.7% Higher than Male",
        subtitle='Patient Percentage (%) by BMI Category',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=500, height=200,
)

### **Analysis of Region Variable**

**1. Does each region have the same proportion of people?**

In [15]:
# Menghitung persentase per region
source = df.region.value_counts(normalize=True).to_frame().reset_index()
source.rename(columns={'index': 'region', 'region':'pct'}, inplace=True)
display(source)

Unnamed: 0,region,pct
0,southeast,0.272251
1,southwest,0.243082
2,northwest,0.242334
3,northeast,0.242334


In [16]:
# Set color
color_scale = alt.Scale(
    domain=['northeast', 'northwest', 'southeast', 'southwest'],
    range=['#e7ba52', '#a7a7a7', '#aec7e8', '#1f77b4']
)

# Membuat basis grafik
base = alt.Chart(source).encode(
    alt.Theta("pct").stack(True),
    alt.Color("region")
        .title(None)
        .scale(color_scale)
        .legend(orient='none', legendX=300, legendY=120),
    order='pct',
)

# Membuat pie chart
pie = base.mark_arc(outerRadius=120, innerRadius=80)

# Membuat teks persentase
pct = base.mark_text(radius=150, size=15).encode(
    text=alt.Text('pct', format='.2%'),)

# Membuat teks total pasien
annotation = base.mark_text(
    text=['1337', "Patient's"],
    align='center',
    font='Calibri',
    fontSize=25,
    dy=-20,
)
# Menggabungkan chart dengan teks
(pie + pct + annotation).properties(
    title=alt.Title(
        "Each Region had Patient Percentages Ranging from 24% to 27%",
        subtitle='Patient Percentage (%) by Region',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=300, height=300
)

**2. Is there a relationship between BMI and charges for each region?**

In [17]:
# Top panel is scatter plot of temperature vs time
alt.Chart(df).mark_point().encode(
    x=alt.X('bmi').scale(zero=False),
    y=alt.Y('charges'),
    color=alt.Color('region').scale(color_scale)
).properties(
    title=alt.Title(
        "There is No Relationship between BMI and Total Charges",
        subtitle='Data Distribution between BMI and Total Charges',
        anchor='start',
        font='Calibri',
        fontSize=18,
        offset=20,
    ),
    width=550,
    height=300
)