# U.S. Medical Insurance Costs

In [47]:
import csv

insurance_list = dict()

with open('insurance.csv', newline='') as insurance_csv:
    insurance_reader = csv.DictReader(insurance_csv)
    record_num = 1
    for record in insurance_reader:
        insurance_list[record_num] = record
        record_num+= 1     

Summary Stats
Before exploring each data category individually I wanted to get a summary of the data overall.

In [60]:
#Decided to make a list of ages in order to access the age data better.
ages = []

for record in insurance_list:
    age = int(insurance_list[record]['age'])
    ages.append(age)

In [63]:
#Function to get the average age of all records in the list ages
def get_average_age(ages):
    total_age = 0
    for age in ages:
        total_age += age
    average_age = round((total_age/len(ages)))
    return average_age

#Calculate the average age of all patients
average_age = get_average_age(ages)
print("The average age of all patients is: " + str(average_age) + " years old.")

The average age of all patients is: 39 years old.


In [64]:
#Function to get the min and max age of records in the list ages
def get_min_max_age(ages):
    max_age = 0
    min_age = 100
    for age in ages:
        if age > max_age:
            max_age = age
    for age in ages:
        if age < min_age:
            min_age = age
    return max_age, min_age

#Calculate the max and min age of all patients
max_age, min_age = get_min_max_age(ages)
print("The oldest patient is " + str(max_age) + " years old. The youngest patient is " + str(min_age) + " years old.")

The oldest patient is 64 years old. The youngest patient is 18 years old.


In [66]:
#Create list of insurance costs
costs = []

for record in insurance_list:
    cost = float(insurance_list[record]['charges'])
    costs.append(cost)

In [84]:
#Function to get the average price of insurance
def get_average_insurance_cost(costs):
    total_cost = 0
    for cost in costs:
        total_cost += cost
    average_cost = round((total_cost/len(costs)),2)
    return average_cost

#Calculate the average cost of insurance
average_cost = get_average_insurance_cost(costs)
print("The average cost of insurance was $" + str(average_cost))

#Calculate the max cost
max_cost = round(max(costs),2)
min_cost = round(min(costs),2)

print("The max cost of insurance was $" + str(max_cost) + " while the minimum cost was $" + str(min_cost) + ".")

The average cost of insurance was $13270.42
The max cost of insurance was $63770.43 while the minimum cost was $1121.87.


In [72]:
#Find number of smokers vs non-smokers

num_of_smokers = 0
num_of_nonsmokers = 0

for record in insurance_list:
    status = insurance_list[record]['smoker']
    if status == 'yes':
        num_of_smokers += 1
    elif status == 'no':
        num_of_nonsmokers += 1
    else:
        pass

print("There were " + str(num_of_smokers) + " smokers and " + str(num_of_nonsmokers) + " non-smokers.")

There were 274 smokers and 1064 non-smokers.


In [74]:
#Find percentage of total patients that are smokers. Making it a general function so I can resuse it.
def find_percentage(portion, total):
    percentage = (portion * 100)/total
    return percentage

percentage_of_smokers = round((find_percentage(num_of_smokers, len(insurance_list))),2)
print("The percentage of patients who are smokers is " + str(percentage_of_smokers) + " percent.")

The percentage of patients who are smokers is 20.48 percent.


In [76]:
#Finding percentage of patients who have children 
num_of_parents = 0

for record in insurance_list:
    child_status = int(insurance_list[record]['children'])
    if child_status > 0:
        num_of_parents += 1
        
print("There were " + str(num_of_parents) + " patients who are parents.")

percentage_of_parents = round((find_percentage(num_of_parents, len(insurance_list))),2)
print("That is " + str(percentage_of_parents) + " percent of patients.")

There were 764 patients who are parents.
That is 57.1 percent of patients.


In [78]:
#Finding percentage of patients who are either gender
num_of_males = 0
num_of_females = 0

for record in insurance_list:
    gender = insurance_list[record]['sex']
    if gender == 'male':
        num_of_males += 1
    elif gender == 'female':
        num_of_females +=1
        
print(num_of_males, num_of_females)

percentage_of_males = round((find_percentage(num_of_males, len(insurance_list))),2)
print("Males make up " + str(percentage_of_males) + " percent of patients.")

676 662
Males make up 50.52 percent of patients.


In [89]:
#Finding all reported regions

regions = dict()
record_count = 0

for record in insurance_list:
    region = insurance_list[record]['region']
    if region not in regions:
        record_count += 1
        regions[region] = record_count
    else:
        regions[region] += 1
        
print(regions)

max_region = "None"
count = 0
for region in regions:
    new_count = regions[region]
    if new_count > count:
        count = new_count
        max_region = region
        
print("The region with the highest number of patient records is the " + str(max_region) + ".")

{'southwest': 325, 'southeast': 365, 'northwest': 327, 'northeast': 327}
The region with the highest number of patient records is the southeast.


SUMMARY:

Of the 1,338 patients whose records are in this data:
    50.52% were male while 49.48% were female giving a nearly even split between the two sexes.
    57.1% of patients were parents, leaving the data split slightly more than even between parents and non-parents.
    The data will be skewed towards the results of non-smokers as only 20.48% of patients were smokers.
    While the average age of patients was 39 years old, the oldest patient is 64 and the youngest is 18.
    The average cost of insurance accross patients was $13,270.42 though this is likely skewed from outliers ($63,770.43 max, $1,121.87 min).

EXPLORING AGE
For this section I wanted to explore trends in age. In particular, I wanted to look into the average insurance cost across age groups, the average bmi per age group, and the percentage of smokers in each age group. I predict that insurance costs will be higher in later age groups. I also predict that the average bmi and percentage of smokers will also increase with age due to slowing metabolism and the recent push from younger generations to quit smoking.

In [150]:
#Defining the age categories

age_categories = {'young': 30,
                  'middle': 60,
                  'elder': 90}

grouped_by_age = {'young': [],
                  'middle': [],
                  'elder': []}

for record in insurance_list:
    il_record = int(insurance_list[record]['age'])
    if il_record <= age_categories['young']:
        grouped_by_age['young'].append(insurance_list[record]['charges'])
    elif il_record > age_categories['young'] and il_record <= age_categories['middle']:
        grouped_by_age['middle'].append(insurance_list[record]['charges'])
    elif il_record > age_categories['middle'] and il_record <= age_categories['elder']:
        grouped_by_age['elder'].append(insurance_list[record]['charges'])
    elif il_record > age_categories['elder']:
        grouped_by_age['elder'].append(insurance_list[record]['charges'])


In [152]:
#Finding average insurance cost in each age grouping.

def age_avg(group):
    total_price = 0
    length = len(grouped_by_age[group])
    for cost in grouped_by_age[group]:
        total_price += float(cost)
    average_price = round((total_price/length),2)
    return average_price

age_avg_young = age_avg('young')
age_avg_middle = age_avg('middle')
age_avg_elder = age_avg('elder')

print("The average insurance cost for those 30 and younger was $" + str(age_avg_young) + ".")
print("The average insurance cost for those younger than 60 but older than 30 was $" + str(age_avg_middle) + ".")
print("The average insurance cost for those older than 60 was $" + str(age_avg_elder) + ".")

The average insurance cost for those 30 and younger was $9397.55.
The average insurance cost for those younger than 60 but older than 30 was $14528.72.
The average insurance cost for those older than 60 was $21063.16.
