# U.S. Medical Insurance Costs (Portfolio Project)

In [141]:
# import csv library
import csv

In [142]:
# Parallel per-column lists: position i in every list belongs to the same
# CSV record, because a row is either appended to all seven lists or to none.
age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []

# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for file objects passed to a reader.
with open('insurance.csv', newline='') as insurance_file:
    insurance_data = csv.DictReader(insurance_file)
    for row in insurance_data:
        age_value = row['age'].strip()
        if not age_value.isdigit():
            # Report the bad value and skip the whole record. Previously the
            # code fell through to int(age_value) and crashed with ValueError
            # right after printing the warning.
            print("Non-numeric age value detected:", age_value)
            continue
        age.append(int(age_value))  # whole years
        sex.append(row['sex'])
        bmi.append(float(row['bmi']))
        children.append(int(row['children']))
        smoker.append(row['smoker'])
        region.append(row['region'])
        charges.append(float(row['charges']))

In [143]:
# Number of records loaded from the CSV (one entry in `age` per record).
total_population = len(age)
# Use the ',' format spec rather than 'n': 'n' only inserts thousands
# separators once locale.setlocale() has been called, which in this notebook
# happens in a later cell, so on a fresh "Restart & Run All" it would print
# "1338" instead of "1,338".
print(f"{total_population:,} total records")

1,338 total records


In [144]:
print("--------Average Patient Age----------")

# Arithmetic mean of every age read from the CSV.
average_age = sum(age) / len(age)

# One decimal place, followed by a blank separator line.
print("The average patient age is {:.1f} years.".format(average_age), end="\n\n")

--------Average Patient Age----------
The average patient age is 39.2 years.



In [151]:
print("--------Count of male and female patients----------")

def percentage(number):
    """Return `number` as a whole-number percentage of `total_population`.

    The result is a float rounded to the nearest whole percent (e.g. 51.0).
    The value is scaled to a percentage *before* rounding: rounding the
    fraction first and multiplying by 100 afterwards re-introduces binary
    floating-point error (round(0.57, 2) * 100 == 56.99999999999999).

    Reads the module-level `total_population`; raises ZeroDivisionError if it
    is 0.
    """
    return float(round(100 * number / total_population))

# Men: number of 'male' entries in the sex column and their share of all records.
male_count = sex.count('male')
male_count_prc = percentage(male_count)

# Women: same two figures for the 'female' entries.
female_count = sex.count('female')
female_count_prc = percentage(female_count)

print(f'There are {male_count} men. ({male_count_prc}% of the total)')
print(f'There are {female_count} women, which is {female_count_prc}% of the total.')
print()

--------Count of male and female patients----------
There are 676 men. (51.0% of the total)
There are 662 women, which is 49.0% of the total.



In [146]:
# Show the distinct values found in 'region' (as a set), then the number of
# records for each of them (as a dict).
print("--------Unique Values in 'region'----------")

unique_regions = {*region}
print(unique_regions)

# Tally records per region; keys are inserted in first-seen order.
region_count = {}
for region_value in region:
    region_count[region_value] = region_count.get(region_value, 0) + 1

# Print the tally followed by a blank separator line.
print(region_count, end="\n\n")



--------Unique Values in 'region'----------
{'southwest', 'southeast', 'northwest', 'northeast'}
{'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}



In [147]:
print("--------Average Yearly Medical Insurance Charges----------")

# Imported here (rather than removed) so the name `locale` stays available to
# code that runs after this cell.
import locale

# Arithmetic mean of every charge read from the CSV.
average_charges = sum(charges) / len(charges)

try:
    # Preferred path: locale-aware US-dollar formatting.
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    formatted_cost = locale.currency(average_charges, grouping=True)
except locale.Error:
    # The 'en_US.UTF-8' locale is not installed on every machine; fall back to
    # a locale-independent format instead of aborting the cell.
    formatted_cost = f"${average_charges:,.2f}"

print(f"The average yearly medical insurance charge is {formatted_cost}.")

print()

--------Average Yearly Medical Insurance Charges----------
The average yearly medical insurance charge is $13,270.42.



In [148]:
# Create DataFrame from csv using pandas

import pandas as pd
# import numpy as np

# Read the CSV into a DataFrame and keep a list-of-dicts view of it: one dict
# per row, keyed by column name.
filename = 'insurance.csv'
insurance_df = pd.read_csv(filename)
insurance_dict = insurance_df.to_dict(orient='records')

# A second DataFrame object constructed from the first one.
df = pd.DataFrame(insurance_df)

# Summary statistics for the numeric columns, requesting the 10th and 90th
# percentiles.
print("Descriptive Statistics with Custom Percentiles")
description = df.describe(percentiles=[0.1, 0.9])
print(description, end="\n\n")

# Show both DataFrame objects, each under its own heading and followed by a
# blank line.
print("Method 1 for creating a dataframe list object", insurance_df, sep="\n", end="\n\n")
print("Method 2 for creating a dataframe", df, sep="\n", end="\n\n")

# A few headline statistics that are handy for exploratory data analysis.
mean_age = df['age'].mean()
print("The mean age is {:.1f}".format(mean_age))

mean_charges = df['charges'].mean()
mean_charges_currency = "${:,.2f}".format(mean_charges)
print("The mean charges are " + mean_charges_currency)

median_charges = df['charges'].median()
median_charges_currency = "${:,.2f}".format(median_charges)
print("The median charges are " + median_charges_currency)

# Frequency tables for the two binary categorical columns, each preceded by a
# blank line.
sex_count = df["sex"].value_counts()
print("", sex_count, sep="\n")

smoker_count = df["smoker"].value_counts()
print("", smoker_count, sep="\n")

Descriptive Statistics with Custom Percentiles
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
10%      19.000000    22.990000     0.000000   2346.533615
50%      39.000000    30.400000     1.000000   9382.033000
90%      59.000000    38.619500     3.000000  34831.719700
max      64.000000    53.130000     5.000000  63770.428010

Method 1 for creating a dataframe list object
      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     

In [149]:
class PatientSummarys:
    """Printed summary reports over a collection of patient insurance records.

    Every report takes ``insurance_dict``: a list of per-patient dicts (in this
    notebook, the output of ``DataFrame.to_dict(orient='records')``) and
    writes its result to stdout; nothing is returned.  The reports are static
    methods, so they can be called on the class or on an instance.
    """

    # Groups that are always reported, in this display order, even when no
    # record belongs to them.  Values outside these tuples are still tallied
    # and are reported after the known groups instead of raising KeyError.
    _GENDERS = ('female', 'male')
    _REGIONS = ('northwest', 'southwest', 'northeast', 'southeast')
    _SMOKER_STATUSES = ('yes', 'no')

    # BMI bucketing shared by both BMI reports (bin edges and their labels).
    _BMI_BINS = [0, 25.96, 35.96, 45.96, 55.96]
    _BMI_LABELS = ['<25.96', '25.96-35.96', '35.96-45.96', '>45.96']

    @staticmethod
    def _format_usd(amount):
        """Format ``amount`` as US dollars with grouping and two decimals.

        Pure string formatting: it does not depend on the ``locale`` module
        being imported or on the process locale having been set elsewhere.
        """
        sign = '-' if amount < 0 else ''
        return f"{sign}${abs(amount):,.2f}"

    @staticmethod
    def _tally(insurance_dict, group_key, known_groups, value_key=None):
        """Accumulate a per-group count (and optional total) over the records.

        Returns a dict mapping each group to ``{'total': ..., 'count': ...}``.
        ``known_groups`` are pre-seeded so they appear first and in a fixed
        order; ``total`` stays 0 when ``value_key`` is None.
        """
        stats = {group: {'total': 0, 'count': 0} for group in known_groups}
        for record in insurance_dict:
            bucket = stats.setdefault(record[group_key], {'total': 0, 'count': 0})
            if value_key is not None:
                bucket['total'] += record[value_key]
            bucket['count'] += 1
        return stats

    @staticmethod
    def _bmi_frame(insurance_dict):
        """Build a DataFrame from the records with an added 'BMI' bucket column."""
        df = pd.DataFrame(insurance_dict)
        df['BMI'] = pd.cut(df['bmi'],
                           bins=PatientSummarys._BMI_BINS,
                           labels=PatientSummarys._BMI_LABELS)
        return df

    @staticmethod
    def analyze_ages(insurance_dict):
        """Print the average age for each value of 'sex'."""
        print("--------Average Age by Gender----------")

        age_by_gender = PatientSummarys._tally(
            insurance_dict, 'sex', PatientSummarys._GENDERS, 'age')

        for gender, values in age_by_gender.items():
            if values['count'] > 0:
                average_age = values['total'] / values['count']
                print(f"The average age for {gender} is: {average_age:.1f}")
            else:
                print(f"No data available for {gender}")
        print()

    @staticmethod
    def analyze_regions(insurance_dict):
        """Print the average yearly charge for each region."""
        print("------Average Yearly Medical Insurance Charges by Region------")

        charges_by_region = PatientSummarys._tally(
            insurance_dict, 'region', PatientSummarys._REGIONS, 'charges')

        for region, values in charges_by_region.items():
            if values['count'] > 0:
                average_charges = values['total'] / values['count']
                formatted_cost = PatientSummarys._format_usd(average_charges)
                print(f"The average annual charge for a patient from the {region} region is: {formatted_cost}")
            else:
                print(f"No data available for {region}")
        print()

    @staticmethod
    def count_patients_by_regions(insurance_dict):
        """Print the number of patients in each region."""
        print("------Count of patients by region------")

        patients_by_region = PatientSummarys._tally(
            insurance_dict, 'region', PatientSummarys._REGIONS)

        for region, values in patients_by_region.items():
            if values['count'] > 0:
                patient_count = values['count']
                print(f"The number of patients from the {region} region is: {patient_count}")
            else:
                print(f"No data available for {region}")
        print()

    @staticmethod
    def charges_by_region(insurance_dict):
        """Print the total yearly charges per region as a small table."""
        print("------Total Yearly Medical Insurance Charges by Region------")

        df = pd.DataFrame(insurance_dict)
        summary = df.groupby('region').agg({'charges': 'sum'})

        # Render the totals as whole US dollars.
        summary['charges'] = summary['charges'].map('${:,.0f}'.format)

        print(summary)
        print()

    @staticmethod
    def charges_by_smoker(insurance_dict):
        """Print the average yearly charge for each smoker status."""
        print("------Average Yearly Medical Insurance Charges by Smoker Status------")

        charges_by_smoker = PatientSummarys._tally(
            insurance_dict, 'smoker', PatientSummarys._SMOKER_STATUSES, 'charges')

        for smoker, values in charges_by_smoker.items():
            if values['count'] > 0:
                average_charges = values['total'] / values['count']
                formatted_cost = PatientSummarys._format_usd(average_charges)
                print(f"The average annual charge for a patient with smoker status of {smoker} is: {formatted_cost}")
            else:
                print(f"No data available for {smoker}")
        print()

    @staticmethod
    def count_by_smoker(insurance_dict):
        """Print the number of patients for each smoker status, in bold.

        Statuses with no patients are omitted from the output.
        """
        # ANSI escape codes that switch bold text on and off.
        bold_start = "\033[1m"
        bold_end = "\033[0m"

        print(f"{bold_start}---Number of patients by smoker status---{bold_end}")

        patients_by_smoker = PatientSummarys._tally(
            insurance_dict, 'smoker', PatientSummarys._SMOKER_STATUSES)

        for smoker, values in patients_by_smoker.items():
            if values['count'] > 0:
                patient_count = values['count']
                print(f"There are {bold_start}{patient_count}{bold_end} patients with the smoker status {bold_start}{smoker}{bold_end}.")
        print()

    @staticmethod
    def charges_by_bmi_sex(insurance_dict):
        """Print the average yearly charge for each (BMI bucket, sex) pair."""
        print("------Average Yearly Medical Insurance Charges by BMI and by Sex------")

        df = PatientSummarys._bmi_frame(insurance_dict)

        # observed=False keeps every BMI bucket in the result regardless of
        # the pandas version's default for categorical group keys.
        average_charges_by_bmi = df.groupby(['BMI', 'sex'], observed=False)['charges'].mean()

        # Render the averages as whole US dollars.
        average_charges_by_bmi = average_charges_by_bmi.map('${:,.0f}'.format)

        print(average_charges_by_bmi)

        print()

    @staticmethod
    def charges_by_bmi(insurance_dict):
        """Print the average yearly charge for each BMI bucket."""
        print("------Average Yearly Medical Insurance Charges by BMI------")

        df = PatientSummarys._bmi_frame(insurance_dict)

        # observed=False keeps every BMI bucket in the result regardless of
        # the pandas version's default for categorical group keys.
        charges_by_bmi = df.groupby('BMI', observed=False)['charges'].mean()

        # Render the averages as whole US dollars.
        charges_by_bmi = charges_by_bmi.map('${:,.0f}'.format)

        print(charges_by_bmi)

In [150]:
# Run every report, in presentation order, against the list-of-dicts view of
# the insurance data.
for report in (
    PatientSummarys.analyze_ages,
    PatientSummarys.analyze_regions,
    PatientSummarys.count_patients_by_regions,
    PatientSummarys.charges_by_region,
    PatientSummarys.charges_by_smoker,
    PatientSummarys.count_by_smoker,
    PatientSummarys.charges_by_bmi_sex,
    PatientSummarys.charges_by_bmi,
):
    report(insurance_dict)

--------Average Age by Gender----------
The average age for female is: 39.5
The average age for male is: 38.9

------Average Yearly Medical Insurance Charges by Region------
The average annual charge for a patient from the northwest region is: $12,417.58
The average annual charge for a patient from the southwest region is: $12,346.94
The average annual charge for a patient from the northeast region is: $13,406.38
The average annual charge for a patient from the southeast region is: $14,735.41

------Count of patients by region------
The number of patients from the northwest region is: 325
The number of patients from the southwest region is: 325
The number of patients from the northeast region is: 324
The number of patients from the southeast region is: 364

------Total Yearly Medical Insurance Charges by Region------
              charges
region               
northeast  $4,343,669
northwest  $4,035,712
southeast  $5,363,690
southwest  $4,012,755

------Average Yearly Medical Insurance