# U.S. Medical Insurance Costs

The purpose of this project was to demonstrate skills with Python3 to organize and analyze large datasets. The dataset used in this project was of U.S Medical Insurance Costs, which displays the age, sex, bmi, child_status, smoking_status, region and charges of a multitude of individuals.

In [363]:
#import raw medical insurance dataset & seperate data columns for analysis
import csv
with open("insurance.csv") as insurance_data:
    csv_dict = csv.DictReader(insurance_data)
    age = []
    sex = []
    bmi = []
    children = []
    smoker = []
    region = []
    charges = []
    for row in csv_dict:
        age.append(row["age"])
        sex.append(row["sex"])
        bmi.append(row["bmi"])
        children.append(row["children"])
        smoker.append(row["smoker"])
        region.append(row["region"])
        charges.append(row["charges"])
                              

        
        



# Age Analysis

In [364]:
#finding the frequency of ages in the data set
def count_ages(age_dataset):
    occurence = {age: age_dataset.count(age) for age in age_dataset}
    print("Individuals Ages with Frequency:")
    for key, value in sorted(occurence.items()):
        print("Age:", key, ":", "Frequency:", value)
count_ages(age)

#find average age from the dataset
def calculate_average_age(age_dataset):
    total_age = 0
    for ages in age:
        total_age += int(ages)
    average_age = round(total_age / len(age))
    print("The average age of individuals in the dataset is " + str(average_age) + " years old.") 
calculate_average_age(age)

Individuals Ages with Frequency:
Age: 18 : Frequency: 69
Age: 19 : Frequency: 68
Age: 20 : Frequency: 29
Age: 21 : Frequency: 28
Age: 22 : Frequency: 28
Age: 23 : Frequency: 28
Age: 24 : Frequency: 28
Age: 25 : Frequency: 28
Age: 26 : Frequency: 28
Age: 27 : Frequency: 28
Age: 28 : Frequency: 28
Age: 29 : Frequency: 27
Age: 30 : Frequency: 27
Age: 31 : Frequency: 27
Age: 32 : Frequency: 26
Age: 33 : Frequency: 26
Age: 34 : Frequency: 26
Age: 35 : Frequency: 25
Age: 36 : Frequency: 25
Age: 37 : Frequency: 25
Age: 38 : Frequency: 25
Age: 39 : Frequency: 25
Age: 40 : Frequency: 27
Age: 41 : Frequency: 27
Age: 42 : Frequency: 27
Age: 43 : Frequency: 27
Age: 44 : Frequency: 27
Age: 45 : Frequency: 29
Age: 46 : Frequency: 29
Age: 47 : Frequency: 29
Age: 48 : Frequency: 29
Age: 49 : Frequency: 28
Age: 50 : Frequency: 29
Age: 51 : Frequency: 29
Age: 52 : Frequency: 29
Age: 53 : Frequency: 28
Age: 54 : Frequency: 28
Age: 55 : Frequency: 26
Age: 56 : Frequency: 26
Age: 57 : Frequency: 26
Age: 58

# Sex Analysis

In [365]:
#finding the proportion of males & females
def ratio_male_female(sex_dataset):
    male = sex.count("male")
    female = sex.count("female")
    percentage_male = round(male / len(sex) * 100, 2)
    percentage_female = round(female / len(sex) * 100, 2)
    print("Individuals in the dataset consist of " + str(male) + " (" + str(percentage_male) + "%) males, and "
          + str(female) + " (" + str(percentage_female) + "%) females.")
ratio_male_female(sex)

#average charges by sex
def charges_by_sex(sex_dataset, charges_dataset):
    total_male_charges = 0
    total_female_charges = 0
    for c in list(zip(sex_dataset, charges_dataset)):
        if c[0] == "male":
            total_male_charges += round(float(c[1]), 2)
        elif c[0] == "female":
            total_female_charges += round(float(c[1]), 2)
    average_male_charges = round(total_male_charges / sex.count("male"))
    average_female_charges = round(total_female_charges / sex.count("female"))
    print("The average male in the dataset pays " + str(average_male_charges) 
          +" dollars in medical insurance costs, while the average female pays " + str(average_female_charges) + " dollars.")
charges_by_sex(sex, charges)

Individuals in the dataset consist of 676 (50.52%) males, and 662 (49.48%) females.
The average male in the dataset pays 13957 dollars in medical insurance costs, while the average female pays 12570 dollars.


# BMI Analysis

In [366]:
#calculate average bmi
def calculate_average_bmi(bmi_dataset):
    total_bmi = 0
    for b in bmi_dataset:
        total_bmi += float(b)
    average_bmi = round(total_bmi / len(bmi), 2)
    print("The average bmi of all individuals in the dataset is " + str(average_bmi) + ".")
calculate_average_bmi(bmi)

#average bmi by sex
def calculate_average_bmi_by_sex(sex_dataset, bmi_dataset):
    total_male_bmi = 0
    total_female_bmi = 0
    for b in list(zip(sex_dataset, bmi_dataset)):
        if b[0] == "male":
            total_male_bmi += float(b[1])
        elif b[0] == "female":
            total_female_bmi += float(b[1])
    average_male_bmi = round(float(total_male_bmi / sex.count("male")), 2)
    average_female_bmi = round(float(total_female_bmi / sex.count("female")), 2)
    print("The average bmi of males in the dataset is " + str(average_male_bmi) + ", while the average bmi of females is " + str(average_female_bmi) + ".")
calculate_average_bmi_by_sex(sex, bmi)

The average bmi of all individuals in the dataset is 30.66.
The average bmi of males in the dataset is 30.94, while the average bmi of females is 30.38.


# Children Analysis

In [367]:
#determining average age of individuals with atleast 1 child
def average_age_of_individuals_with_children(ages, children):
    number_of_individuals_with_children = 0
    total_age_of_individuals_with_children = 0
    for c in list(zip(age, children)):
        if c[1] >= "1":
            number_of_individuals_with_children += 1
            total_age_of_individuals_with_children += int(c[0])
    average_age_of_individuals_with_children = round(total_age_of_individuals_with_children / len(age))
    print("Of the " + str(len(age)) + " individuals in the dataset, " + str(number_of_individuals_with_children)\
          + " of them have atleast one child. The average age for this group is " + str(average_age_of_individuals_with_children)\
          + " years old.") 
average_age_of_individuals_with_children(age, children)

#determine how having atleast 1 child effects medical costs
def child_effect_on_medical_costs(children_database, charges_database):
    no_child_charges = 0
    child_charges = 0
    for c in list(zip(children, charges)):
        if c[0] == "0":
            no_child_charges += float(c[1])
        elif c[0] >= "1":
            child_charges += float(c[1])
    no_child_average = round(no_child_charges / len(children),2)
    child_average = round(child_charges / len(children),2)
    difference = abs(no_child_average - child_average)
    print("The average medical costs for individuals with no children is $" + str(no_child_average)\
          + ". The average medical costs for individuals with atleast one child is $" + str(child_average) + ". The difference between these two categories is $" + str(difference) + ".")
child_effect_on_medical_costs(children, charges)

Of the 1338 individuals in the dataset, 764 of them have atleast one child. The average age for this group is 23 years old.
The average medical costs for individuals with no children is $5304.99. The average medical costs for individuals with atleast one child is $7965.44. The difference between these two categories is $2660.45.


# Smoker Status Analysis

In [368]:
#determine frequency of smokers in the dataset
def find_smoker_status(smoker_dataset):
    total = len(smoker)
    smokers = smoker.count("yes")
    smoker_percent = round(smokers / total * 100,2)
    non_smoker = smoker.count("no")
    non_smoker_percent = round(non_smoker / total * 100,2)
    print("Of the " + str(total) + " individuals in the dataset, " + str(non_smoker) + " (" + str(non_smoker_percent)\
          + "%)" + " are non-smokers, leaving " + str(smokers) + " (" + str(smoker_percent) + "%) " + "smokers.")
find_smoker_status(smoker)

#analyze difference in insurance cost between smokers and nonsmokers
def compare_smoker_status(smoker_dataset):
    smoker_total_cost = 0
    non_smoker_total_cost = 0
    for status in list(zip(smoker, charges)):
        if status[0] == "yes":
            smoker_total_cost += round(float(status[1]))
        elif status[0] == "no":
            non_smoker_total_cost += round(float(status[1]))
    average_smoker_charges = round(smoker_total_cost / smoker.count("yes"), 2)
    average_non_smoker_charges = round(non_smoker_total_cost / smoker.count("no"), 2)
    print("The average medical insurance costs of smokers in this dataset is $" + str(average_smoker_charges) + " dollars.")
    print("The average medical insurance costs of non-smokers in this data is $" + str(average_non_smoker_charges) + " dollars.")
compare_smoker_status(smoker)

Of the 1338 individuals in the dataset, 1064 (79.52%) are non-smokers, leaving 274 (20.48%) smokers.
The average medical insurance costs of smokers in this dataset is $32050.23 dollars.
The average medical insurance costs of non-smokers in this data is $8434.26 dollars.


# Region Analysis

In [369]:
#count of regions & most common region
import operator
def count_regions(regions_dataset):
    occurence = {region: regions_dataset.count(region) for region in regions_dataset}
    most_populous = sorted(occurence.items(), key=operator.itemgetter(1), reverse = True)[0]
    print("Individuals from the dataset reside in the follow regions " + str(occurence)\
          + ", making the most common region the " + str(most_populous) + ".")
count_regions(region)

#Difference in medical insurance costs by region
def calculate_regional_difference(region_dataset, charges_dataset):
    southwest_total = 0
    southeast_total = 0
    northwest_total = 0
    northeast_total = 0
    for r in list(zip(region_dataset, charges_dataset)):
        if r[0] == "southwest":
            southwest_total += float(r[1])
        elif r[0] == "southeast":
            southeast_total += float(r[1])
        elif r[0] == "northwest":
            northwest_total += float(r[1])
        elif r[0] == "northeast":
            northeast_total += float(r[1])
    southwest_average = round(southwest_total / region.count("southwest"),2)
    southeast_average = round(southeast_total / region.count("southeast"),2)
    northwest_average = round(northwest_total / region.count("northwest"),2)
    northeast_average = round(northeast_total / region.count("northeast"),2)
    region_dict = {"Southwest": southwest_average, "Southeast": southeast_average,\
                   "Northwest": northwest_average, "Northeast": northeast_average}
    print("The average medical insurance costs by region is as follows " + str(region_dict))
calculate_regional_difference(region, charges)

Individuals from the dataset reside in the follow regions {'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}, making the most common region the ('southeast', 364).
The average medical insurance costs by region is as follows {'Southwest': 12346.94, 'Southeast': 14735.41, 'Northwest': 12417.58, 'Northeast': 13406.38}
