# U.S. Medical Insurance Costs

In [2]:
import csv
import pandas as pd

#import insurance data to list
def import_csv(file):
    """Read a CSV file into a list of row dictionaries.

    Args:
        file: Path of the CSV file to read. Its first row is treated as the
            header, so each returned dict is keyed by column name.

    Returns:
        list: One dict per data row, in file order. Values are the raw
        strings produced by ``csv.DictReader`` (no type conversion).
    """
    # Open the path the caller passed in; previously the argument was ignored
    # and 'insurance.csv' was always read. newline='' lets the csv module do
    # its own line-ending handling.
    with open(file, newline='') as insurance_data:
        return list(csv.DictReader(insurance_data))

# Load the dataset into a DataFrame and preview its first five rows.
insurance_info = pd.read_csv(filepath_or_buffer='insurance.csv')
print(insurance_info.head(n=5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


<hr>

## Cleaning Data

The data in this dataset has been pre-cleaned. However, I will change the datatypes of the 'sex', 'smoker', and 'region' columns to improve usability in computations. The 'region' column will be converted from a pandas object to a string. The 'sex' and 'smoker' columns are both binary data and will be converted to booleans ('male' and 'yes' become True; 'female' and 'no' become False).

<hr>

In [3]:
# Store region labels with pandas' dedicated string dtype instead of object.
insurance_info['region'] = insurance_info['region'].astype('string')

# Recode both binary columns in one pass:
# smoker yes/no -> True/False, sex female/male -> 0/1.
insurance_info = insurance_info.replace({
    'smoker': {'yes': True, 'no': False},
    'sex': {'female': 0, 'male': 1},
})

# Cast the 0/1 codes to booleans, so True means 'male'.
insurance_info['sex'] = insurance_info['sex'].astype('bool')

  insurance_info = insurance_info.replace({'smoker': {'yes': True, 'no': False}})
  insurance_info = insurance_info.replace({'sex': {'female': 0, 'male': 1}})


<hr>

## Classifying Customers

The code block below adds a column to the dataframe that assigns each customer a BMI classification, based on the adult BMI categories published by the U.S. Centers for Disease Control and Prevention (CDC).
<hr>

In [4]:
def weight_classification(row):
    """Return the BMI category label for one row.

    Args:
        row: Any mapping-like object with a numeric ``'bmi'`` entry, e.g. a
            DataFrame row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        str: One of 'underweight', 'healthy', 'overweight', 'obese class 1',
        'obese class 2' or 'obese class 3 (severely obese)'.
    """
    bmi = row['bmi']
    # Each test is only an upper bound; the lower bound is implied by the
    # earlier tests having failed. This makes the ranges half-open
    # ([18.5, 25), [25, 30), ...) so a BMI sitting exactly on a boundary lands
    # in the category that starts there instead of falling through to the
    # final one.
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25:
        return 'healthy'
    if bmi < 30:
        return 'overweight'
    if bmi < 35:
        return 'obese class 1'
    if bmi < 40:
        return 'obese class 2'
    # Reached for bmi >= 40. A NaN bmi also ends up here because every
    # comparison above is False for NaN.
    return 'obese class 3 (severely obese)'

# Label every customer row with its BMI category, then preview the result.
insurance_info['classification'] = insurance_info.apply(
    weight_classification,
    axis='columns',
)
print(insurance_info.head(n=5))

   age    sex     bmi  children  smoker     region      charges classification
0   19  False  27.900         0    True  southwest  16884.92400     overweight
1   18   True  33.770         1   False  southeast   1725.55230  obese class 1
2   28   True  33.000         3   False  southeast   4449.46200  obese class 1
3   33   True  22.705         0   False  northwest  21984.47061        healthy
4   32   True  28.880         0   False  northwest   3866.85520     overweight


<hr>

## Charges By Class 

The section below determines and prints the average charges per CDC classification, rounded to two decimal places for ease of reading. After determining the average for each classification, it determines which classification has the highest overall average and the lowest overall average.

<hr>

In [23]:
# Mean charges per BMI classification, sorted ascending so the first row is
# the cheapest classification and the last row the most expensive.
avg_charge_by_class = insurance_info.groupby('classification').charges.mean().sort_values().reset_index()
avg_charge_by_class = avg_charge_by_class.round(2)

# Format dollar amounts with a fixed two decimals: str() on a float drops
# trailing zeros (8852.20 was printed as "$8852.2"). Positional .iloc access is
# used for both ends so the lookups do not depend on the index labels.
min_charges = (
    f"\nThe {avg_charge_by_class['classification'].iloc[0]} classification has the lowest average charges "
    f"with ${avg_charge_by_class['charges'].iloc[0]:.2f}"
)
max_charges = (
    f"\nThe {avg_charge_by_class['classification'].iloc[-1]} classification has the highest average charges "
    f"with ${avg_charge_by_class['charges'].iloc[-1]:.2f}"
)

print(avg_charge_by_class)
print(min_charges)
print(max_charges)

                   classification   charges
0                     underweight   8852.20
1                         healthy  10434.53
2                      overweight  10989.85
3                   obese class 1  14429.42
4  obese class 3 (severely obese)  16440.51
5                   obese class 2  17022.26

The underweight classification has the lowest average charges with $8852.2

The obese class 2 classification has the highest average charges with $17022.26


<hr>

## BMI By Region

The section below determines and prints the average BMI for each residential region in the data set. After determining the average for each region, the code will determine which has the highest overall average and the lowest overall average. 

<hr>

In [21]:
# Mean BMI per region, labelled with the category that mean falls into and
# ordered from lowest to highest mean BMI.
bmi_by_region = (
    insurance_info
    .groupby('region')['bmi']
    .mean()
    .reset_index()
)
bmi_by_region['classification'] = bmi_by_region.apply(weight_classification, axis=1)
bmi_by_region = bmi_by_region.sort_values(by='bmi').reset_index(drop=True)

# After the ascending sort, the first row is the minimum and the last row the maximum.
min_bmi_by_region = '\n The {} region has the lowest average bmi at {}, with an average classification of "{}"'.format(
    bmi_by_region['region'].iloc[0],
    str(bmi_by_region['bmi'].iloc[0]),
    bmi_by_region['classification'].iloc[0],
)
max_bmi_by_region = '\n The {} region has the highest average bmi at {}, with an average classification of "{}"'.format(
    bmi_by_region['region'].iloc[-1],
    str(bmi_by_region['bmi'].iloc[-1]),
    bmi_by_region['classification'].iloc[-1],
)

for report in (bmi_by_region, min_bmi_by_region, max_bmi_by_region):
    print(report)

      region        bmi classification
0  northeast  29.173503     overweight
1  northwest  29.199785     overweight
2  southwest  30.596615  obese class 1
3  southeast  33.355989  obese class 1

 The northeast region has the lowest average bmi at 29.173503086419753, with an average classification of "overweight"

 The southeast region has the highest average bmi at 33.35598901098901, with an average classification of "obese class 1"


<hr>

## Charges by Region
The section below determines the average insurance charges for each region, then determines which region has the highest average charges and which has the lowest average charges.

<hr>

In [24]:
# Mean charges per region, rounded to cents and sorted ascending so the first
# row is the cheapest region and the last row the most expensive.
charge_by_region = insurance_info.groupby('region').charges.mean().reset_index()
charge_by_region = charge_by_region.round(2)
charge_by_region = charge_by_region.sort_values(by='charges').reset_index(drop=True)

# Format dollar amounts with a fixed two decimals: str() on a float drops
# trailing zeros, so an average such as 12346.90 would print as "$12346.9".
min_charge_by_region = (
    f"\n The {charge_by_region['region'].iloc[0]} region has the lowest average charges "
    f"with ${charge_by_region['charges'].iloc[0]:.2f}"
)
max_charge_by_region = (
    f"\n The {charge_by_region['region'].iloc[-1]} region has the highest average charges "
    f"with ${charge_by_region['charges'].iloc[-1]:.2f}"
)

print(charge_by_region)
print(min_charge_by_region)
print(max_charge_by_region)

      region   charges
0  southwest  12346.94
1  northwest  12417.58
2  northeast  13406.38
3  southeast  14735.41

 The southwest region has the lowest average charges with $12346.94

 The southeast region has the highest average charges with $14735.41


<hr>

## Smokers and BMI 

This section separates customers in the data set by their smoker status and then again by region, in order to compare whether smokers have higher average BMIs than non-smokers, and how this relates to their average charges. 

In [73]:
# Average BMI (and the category that average falls into) for every
# region / smoker-status pair.
bmi_by_region_smoker_status = insurance_info.groupby(['region', 'smoker']).bmi.mean().reset_index()
bmi_by_region_smoker_status['classification'] = bmi_by_region_smoker_status.apply(weight_classification, axis=1)

# Average charges for the same pairs, rounded to cents.
charges_by_region_smoker_status = insurance_info.groupby(['region', 'smoker']).charges.mean().reset_index()
charges_by_region_smoker_status['charges'] = charges_by_region_smoker_status['charges'].round(2)

# Join on the grouping keys explicitly instead of relying on whichever
# columns the two frames happen to share.
charges_and_class_by_region = pd.merge(
    charges_by_region_smoker_status,
    bmi_by_region_smoker_status,
    on=['region', 'smoker'],
)

print(charges_and_class_by_region)

data_synopsis = "\nThe {region} has {smokers} smokers, with average charges of ${smokers_charge} and an average BMI of {smokers_bmi}. The same region has {non_smokers} non-smokers with average charges of ${nonsmokers_charge} and an average BMI of {nonsmokers_bmi}. There's a {bmi_difference} point difference in average BMI and a {charges_difference}% difference in average charges between smokers and non-smokers. {percent_smoker}% of the customers in the region are smokers."


def _split_by_smoker_status(region):
    """Return ``(smokers, non_smokers)`` for ``region``, each re-indexed from 0."""
    in_region = insurance_info['region'] == region
    smokers = insurance_info[(insurance_info['smoker'] == True) & in_region].reset_index(drop=True)
    non_smokers = insurance_info[(insurance_info['smoker'] == False) & in_region].reset_index(drop=True)
    return smokers, non_smokers


def _print_region_synopsis(region, smokers, non_smokers):
    """Print the ``data_synopsis`` sentence for one region.

    Args:
        region: Region name as stored in the data (e.g. 'northeast'); it is
            title-cased for display.
        smokers: Rows of that region's smokers.
        non_smokers: Rows of that region's non-smokers.
    """
    # Every figure is derived from the two frames passed in, so one region's
    # numbers can no longer be mixed into another region's sentence (the
    # Southeast sentence previously reported the Northeast smokers' BMI).
    smokers_charge = smokers.charges.mean()
    nonsmokers_charge = non_smokers.charges.mean()
    smokers_bmi = smokers.bmi.mean()
    nonsmokers_bmi = non_smokers.bmi.mean()
    total_customers = len(smokers) + len(non_smokers)

    print(data_synopsis.format(
        region=region.title(),
        smokers=len(smokers),
        # Fixed two decimals so amounts such as 30192.00 keep their trailing zeros.
        smokers_charge=f"{smokers_charge:.2f}",
        smokers_bmi=smokers_bmi,
        non_smokers=len(non_smokers),
        nonsmokers_charge=f"{nonsmokers_charge:.2f}",
        nonsmokers_bmi=nonsmokers_bmi,
        bmi_difference=abs(nonsmokers_bmi - smokers_bmi),
        # Percentage difference relative to the non-smokers' average charges.
        charges_difference=abs(round((smokers_charge - nonsmokers_charge) * 100 / nonsmokers_charge, 2)),
        percent_smoker=round(len(smokers) / total_customers * 100, 2),
    ))


# The per-region frames keep their original names so any later cells that use
# them continue to work.
northeast_smokers, northeast_nonsmokers = _split_by_smoker_status('northeast')
northwest_smokers, northwest_nonsmokers = _split_by_smoker_status('northwest')
southeast_smokers, southeast_nonsmokers = _split_by_smoker_status('southeast')
southwest_smokers, southwest_nonsmokers = _split_by_smoker_status('southwest')

for _region, _smokers, _non_smokers in (
    ('northeast', northeast_smokers, northeast_nonsmokers),
    ('northwest', northwest_smokers, northwest_nonsmokers),
    ('southeast', southeast_smokers, southeast_nonsmokers),
    ('southwest', southwest_smokers, southwest_nonsmokers),
):
    _print_region_synopsis(_region, _smokers, _non_smokers)

      region  smoker   charges        bmi classification
0  northeast   False   9165.53  29.332082     overweight
1  northeast    True  29673.54  28.565224     overweight
2  northwest   False   8556.46  29.212678     overweight
3  northwest    True  30192.00  29.140431     overweight
4  southeast   False   8032.22  33.442418  obese class 1
5  southeast    True  34845.00  33.096703  obese class 1
6  southwest   False   8019.28  30.507865  obese class 1
7  southwest    True  32269.06  31.005172  obese class 1

The Northeast has 67 smokers, with average charges of $29673.54 and an average BMI of 28.565223880597014. The same region has 257 non-smokers with average charges of $9165.53 and an average BMI of 29.33208171206226. There's a 0.766857831465245 point difference in average BMI and a 223.75% difference in average charges between smokers and non-smokers. 20.68% of the customers in the region are smokers.

The Northwest has 58 smokers, with average charges of $30192.0 and an average BMI