# U.S. Medical Insurance Costs

In [250]:
import csv
from collections import Counter





Creating empty lists for each of the columns in csv data.

In [251]:
ages = []
sexes = []
bmis = []
num_children = []
smoker_statuses = []
regions = []
insurance_costs = []

We are going to create a function that will allow to get the data for each column and fill in the created lists.

In [252]:
def csv_data_read(lst, csv_file, column):
    with open(csv_file) as csv_data:
        csv_reader=csv.DictReader(csv_data)
        for row in csv_reader:
            lst.append(row[column])
        return lst

        

In [253]:
csv_data_read(ages, "insurance.csv", "age");
csv_data_read(sexes, "insurance.csv", "sex");
csv_data_read(bmis, "insurance.csv", "bmi");
csv_data_read(num_children, "insurance.csv", "children");
csv_data_read(smoker_statuses, "insurance.csv", "smoker");
csv_data_read(regions, "insurance.csv", "region");
csv_data_read(insurance_costs, "insurance.csv", "charges");

We need to clean our data by converting columns with sexes and smoker statuses to numbers that will facilitate our analysis.

In [254]:
sexes = [ "1" if i == "male" else "0" for i in sexes]


In [255]:
smoker_statuses = [ "1" if i == "yes" else "0" for i in smoker_statuses]


We convert also the insurance_costs list to list of floats

In [256]:
insurance_costs=[float(i) for i in insurance_costs]


We are going to create a class P_Data (Patients Data) where we'll store all the information and the functions necessary to analyze data.

In [257]:
class P_Data:
    def __init__(self, p_ages, p_sexes, p_bmis, p_children, p_smoker, p_regions, p_insurance_costs):
        self.p_ages=p_ages
        self.p_sexes=p_sexes
        self.p_bmis=p_bmis
        self.p_children=p_children
        self.p_smoker=p_smoker
        self.p_regions=p_regions
        self.p_insurance_costs=p_insurance_costs
    def total_patients(self):
        #we check for how many entries there are in the dataset
        total_entries=0
        for l in self.p_ages:
            total_entries+=1
        return "The total of patients in data sample is: " + str(total_entries)
    def p_dict(self):
        p_info={}
        p_info["Age"]=self.p_ages
        p_info["Sex"]=self.p_sexes
        p_info["BMI"]=self.p_bmis
        p_info["Children"]=self.p_children
        p_info["Smoker"]=self.p_smoker
        p_info["Region"]=self.p_regions
        p_info["Insurance Charges"]=self.p_insurance_costs
        return p_info
    def mean_age(self):
        #the function will calculate the average age in the data
        total=0
        for a in self.p_ages:
            total+=int(a)
        return "The mean age of the patients in data sample is: " + str(round(total/len(self.p_ages), 2))
    def sex_distribution(self):
        #the function will calculate how many males/females are in dataset
        male_tot=0
        female_tot=0
        for s in self.p_sexes:
            if s=="1":
                male_tot+=1
            elif s=="0":
                female_tot+=1
        return f"The total of male patients is: {male_tot}. The total of female patients is: {female_tot}"
    def mean_bmi(self):
        #the function will calculate the average bmi in the data
        bmi_tot=0
        for b in self.p_bmis:
            bmi_tot+=float(b)
        return "The mean BMI of the patients in data sample is: " + str(round(bmi_tot/len(self.p_bmis), 2))
    def smoker_distribution(self):
        #the function will calculate how many smokers/non smoker are in data
        smoker_tot=0
        non_smoker_tot=0
        for s in self.p_smoker:
            if s=="1":
                smoker_tot+=1
            elif s=="0":
                non_smoker_tot+=1
        return f"The total of patients in data sample that are smokers is: {smoker_tot}. The total of non smokers is: {non_smoker_tot}."
    def unique_regions(self):
        #finding the distribution of patients from different regions
        u_reg=Counter(self.p_regions)
        return f"Patients are from {len(u_reg)} unique regions. And they are distributed as follows: "+ str(dict(u_reg))
    def number_of_children(self):
        #how many children patients from the data have
        unique_entries=Counter(self.p_children)
        return f"Patients have different number of children. And they are distributed as follows: "+ str(dict(unique_entries))
    def mean_charge(self):
        #the function will calculate the average insurance cost of the patients in data sample
        total=0
        for m in self.p_insurance_costs:
            total+=m
        return f"The mean insurance charge for patients in data sample is: {str(round(total/len(self.p_insurance_costs), 2))} dollars."
   

    

Now we are going to use all various functions from the defined class to analyse the data and printing out the results.

In [258]:
data_to_analyze=P_Data(ages, sexes, bmis, num_children, smoker_statuses, regions, insurance_costs)

In [259]:
data_to_analyze.total_patients()

'The total of patients in data sample is: 1338'

In [260]:
data_to_analyze.sex_distribution()

'The total of male patients is: 676. The total of female patients is: 662'

In [261]:
data_to_analyze.mean_age()

'The mean age of the patients in data sample is: 39.21'

In [262]:
data_to_analyze.mean_bmi()

'The mean BMI of the patients in data sample is: 30.66'

In [263]:
data_to_analyze.number_of_children()

"Patients have different number of children. And they are distributed as follows: {'0': 574, '1': 324, '3': 157, '2': 240, '5': 18, '4': 25}"

In [264]:
data_to_analyze.smoker_distribution()

'The total of patients in data sample that are smokers is: 274. The total of non smokers is: 1064.'

In [265]:
data_to_analyze.unique_regions()

"Patients are from 4 unique regions. And they are distributed as follows: {'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}"

In [266]:
data_to_analyze.mean_charge()

'The mean insurance charge for patients in data sample is: 13270.42 dollars.'

We can also compare the costs that have males vs females and smokers vs non smokers in this data set.
We write a function that will be used for both comparisons.

In [267]:
def comparing_costs(lst_variable, lst_costs):
    comparing_tuple=zip(lst_variable, lst_costs)
    total_0=0
    total_1=0
    count_0=0
    count_1=0
    for k, v in comparing_tuple:
        if k=="0":
            total_0+=float(v)
            count_0+=1
        elif k=="1":
            total_1+=float(v)
            count_1+=1
    return round(total_0/count_0, 2), round(total_1/count_1, 2), "The difference is " + str(round(total_0/count_0, 2)-round(total_1/count_1)) + " dollars."
    
    

In [268]:
comparing_costs(sexes, insurance_costs)

(12569.58, 13956.75, 'The difference is -1387.42 dollars.')

In [269]:
comparing_costs(smoker_statuses, insurance_costs)

(8434.27, 32050.23, 'The difference is -23615.73 dollars.')

As we see the difference in charges between male and female isn't significant comparing to the difference between smoker and non smoker patients. 

We can also look for how many smokers are among males vs females:

In [270]:
comparing_smokers=zip(smoker_statuses, sexes)
total_male_smokers=0
total_female_smokers=0
for k, v in comparing_smokers:
    if k=="1" and v=="1":
        total_male_smokers+=1
    elif k=="1" and v=="0":
        total_female_smokers+=1
print(f"There are {total_male_smokers} male smokers in the data set vs {total_female_smokers} female smokers.")
        

There are 159 male smokers in the data set vs 115 female smokers.


Another step in the analysis will be the comparison that looks at various bmi's and the relative charges:

In [271]:
comparing_bmi_charges=zip(bmis, insurance_costs)
underweight_patients_count=0
underweight_charges=0
healthy_count=0
healthy_charges=0
overweight_count=0
overweight_charges=0
obese_count=0
obese_charges=0
for k, v in comparing_bmi_charges:
    k=float(k)
    if k<18.5:
        underweight_charges+=float(v)
        underweight_patients_count+=1
    elif k>=18.5 and k<25:
        healthy_charges+=float(v)
        healthy_count+=1
    elif k>=25 and k<30:
        overweight_charges+=float(v)
        overweight_count+=1
    else:
        obese_charges+=float(v)
        obese_count+=1

print(f"There are {underweight_patients_count} patients in the underweight category and their average insurance charge is {round(underweight_charges/underweight_patients_count, 2)}.")
print(f"There are {healthy_count} patients in the healthy category and their average insurance charge is {round(healthy_charges/healthy_count, 2)}.")  
print(f"There are {overweight_count} patients in the overweight category and their average insurance charge is {round(overweight_charges/overweight_count, 2)}.")  
print(f"There are {obese_count} patients in the obese category and their average insurance charge is {round(obese_charges/obese_count, 2)}.")  


There are 20 patients in the underweight category and their average insurance charge is 8852.2.
There are 225 patients in the healthy category and their average insurance charge is 10409.34.
There are 386 patients in the overweight category and their average insurance charge is 10987.51.
There are 707 patients in the obese category and their average insurance charge is 15552.34.


Clearly the BMI index has an influence on the insurance costs of patients, alongside with smoker status.

CONCLUSION

To further explore the correlations between various factors we would need to use statistical methods such as Pearson's coefficient. In order to do this though we would need not only to apply the formula, but first it would be necessary to check if our data follows the normal distribution and if not normilize it. This is for the moment well beyond my knowledge and capabilities. For this reason I can't reach any meaningful conclusions from this analysis other than general statistical outputs mentioned earlier.