# U.S. Medical Insurance Costs

In [34]:
#  Codecademy portfolio project: exploring the U.S. Medical Insurance dataset.
#  Written together with DenisLazuk and Olubunmi-Amoke.

# -----------------------------------------------------------

#  pandas supplies the DataFrame used throughout.
import pandas as pd
# -----------------------------------------------------------


#  Load insurance.csv into a DataFrame and print it for a first look.
df = pd.read_csv('insurance.csv')
print(df)

# -----------------------------------------------------------
#  Example of filtering on a categorical column: keep only the rows whose sex is 'female'.
females = df.loc[df['sex'] == 'female']
print(females)  # show the filtered frame

# -----------------------------------------------------------
#  Rows of the original frame describing people with one or more children.
one_child_plus = df.loc[df['children'] > 0]
print(one_child_plus)

# -----------------------------------------------------------
#  Mean age across the rows with at least one child: total of the ages divided by the row count.
number_of_data_points = one_child_plus.shape[0]
print(sum(one_child_plus['age']) / number_of_data_points)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]
      age     sex    bmi  children smoker     region      charges
0      19  female  27.90         0    yes  southwest  16884.92400
5      31  female  25.74         0     

In [97]:
#  How does the presence of children in a family affect the value of insurance charges?
#  Every figure printed below is the median of the `charges` column for one subgroup,
#  rounded to one decimal place.
# -----------------------------------------------------------
import statistics


def _median_charges(frame):
    """Return the median of ``frame['charges']`` rounded to one decimal place.

    ``statistics.median`` raises ``statistics.StatisticsError`` when the
    subgroup has no rows.
    """
    return round(statistics.median(frame['charges']), 1)


def _split_by_smoker(frame):
    """Return ``(smokers, non_smokers)``: rows of ``frame`` whose ``smoker`` value is 'yes' / 'no'."""
    return frame[frame.smoker == 'yes'], frame[frame.smoker == 'no']


# -----------------------------------------------------------
#  First, the median charges for families with at least one child.
med_one_plus = _median_charges(one_child_plus)
print("The median value of insurance charges for families with at least one child comprises {} US dollars.".format(med_one_plus))

# -----------------------------------------------------------
#  Then the same value for families without any children.
no_child = df[df.children == 0]
med_no = _median_charges(no_child)
print("The median value of insurance charges for families without children comprises {} US dollars.".format(med_no))

# -----------------------------------------------------------
#  The charges for families without children are a bit higher.
#  But how are families with children stratified?
one_child = df[df.children == 1]
two_child = df[df.children == 2]
many_child = df[df.children > 2]
med_one = _median_charges(one_child)
med_two = _median_charges(two_child)
med_many = _median_charges(many_child)
print("The median value of insurance charges for families with a single child comprises {} US dollars.".format(med_one))
print("The median value of insurance charges for families with two children comprises {} US dollars.".format(med_two))
print("The median value of insurance charges for families with more than two children comprises {} US dollars.".format(med_many))

# -----------------------------------------------------------
#  The more children in a family, the higher the median charges.
#  Next, refine the analysis of families with kids by taking the smoker status into account.
#  Each children group is split into its smoker / non-smoker rows; the names below are kept
#  at module level so that later cells can keep using them.
one_child_smoker, one_child_no_smoker = _split_by_smoker(one_child)
med_one_smoker = _median_charges(one_child_smoker)
med_one_no_smoker = _median_charges(one_child_no_smoker)
print("The median value of insurance charges for a smoking person with a single child comprises {} US dollars while for a non-smoker the amount is only {} US dollars.".format(med_one_smoker, med_one_no_smoker))

two_child_smoker, two_child_no_smoker = _split_by_smoker(two_child)
med_two_smoker = _median_charges(two_child_smoker)
med_two_no_smoker = _median_charges(two_child_no_smoker)
print("The median value of insurance charges for a smoking person with two children comprises {} US dollars while for a non-smoker the amount is only {} US dollars.".format(med_two_smoker, med_two_no_smoker))

many_child_smoker, many_child_no_smoker = _split_by_smoker(many_child)
med_many_smoker = _median_charges(many_child_smoker)
med_many_no_smoker = _median_charges(many_child_no_smoker)
print("The median value of insurance charges for a smoking person with many children comprises {} US dollars while for a non-smoker the amount is only {} US dollars.".format(med_many_smoker, med_many_no_smoker))


#  First insight: a smoker with many children pays LESS than a smoker with one or two children!
#  Maybe a person with many children has a smaller BMI? Maybe there is another factor? Let's check it!


The median value of insurance charges for families with at least one child comprises 9223.8 US dollars.
The median value of insurance charges for families without children comprises 9857.0 US dollars.
The median value of insurance charges for families with a single child comprises 8483.9 US dollars.
The median value of insurance charges for families with two children comprises 9265.0 US dollars.
The median value of insurance charges for families with more than two children comprises 10483.8 US dollars.
The median value of insurance charges for a smoking person with a single child comprises 34806.5 US dollars while for a non-smoker the amount is only 7050.6 US dollars.
The median value of insurance charges for a smoking person with two children comprises 38344.6 US dollars while for a non-smoker the amount is only 7726.9 US dollars.
The median value of insurance charges for a smoking person with many children comprises 30942.2 US dollars while for a non-smoker the amount is only 8605.4 