# This program creates a statistics report for insurance.csv

In [31]:
#import packages
import pandas as pd
import statsmodels.formula.api as smf

#Read csv as DataFrame
# Drop exact duplicate rows and rows with missing values, then keep only
# insurees aged 60 or younger.
ins_df = (
    pd.read_csv('insurance.csv')
    .drop_duplicates()
    .dropna()
    .loc[lambda frame: frame['age'] <= 60]
    .copy()  # independent frame, so later column assignments do not target a slice
)

# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called on its own; wrapping it in print() emitted a stray "None".
print('\ndataset info:')
ins_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1246 entries, 0 to 1336
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1246 non-null   int64  
 1   sex       1246 non-null   object 
 2   bmi       1246 non-null   float64
 3   children  1246 non-null   int64  
 4   smoker    1246 non-null   object 
 5   region    1246 non-null   object 
 6   charges   1246 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 77.9+ KB

dataset info:
 None


#### Pandas general stats & correlation

In [72]:
#Summary statistics of the numeric columns, followed by the pairwise
#correlation between charges, age and bmi (both rounded to 2 decimals)
summary_stats = ins_df.describe()
print(summary_stats.round(2))
new_df = ins_df.loc[:, ['charges', 'age', 'bmi']].copy()
correlations = new_df.corr()
print("\nCorrelation\n", correlations.round(2))


           age      bmi  children   charges
count  1246.00  1246.00   1246.00   1246.00
mean     37.52    30.53      1.13  12710.62
std      13.01     6.11      1.21  11934.52
min      18.00    15.96      0.00   1121.87
25%      26.00    26.12      0.00   4501.42
50%      37.50    30.20      1.00   8610.76
75%      49.00    34.42      2.00  16281.60
max      60.00    53.13      5.00  63770.43

Correlation
          charges   age   bmi
charges     1.00  0.26  0.20
age         0.26  1.00  0.09
bmi         0.20  0.09  1.00


In [73]:
#Create dummy variable and save as csv file
# `smoker` is one-hot encoded; drop_first=True keeps a single indicator column
# (`smoker_yes`) instead of a redundant yes/no pair.
# dtype=int pins the indicator to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the file as True/False and read back as a boolean column
# rather than the numeric 0/1 outcome the models in later cells are fitted on.
dm_df = pd.get_dummies(ins_df, columns=['smoker'], drop_first=True, dtype=int)
print('\ndummy variable\n', dm_df.head().round(2))
# index=False keeps the row index out of the file, so reading it back does not
# add a spurious "Unnamed: 0" column.
dm_df.to_csv('insurance cleaned with dummies', index=False)


dummy variable
    age     sex    bmi  children     region   charges  smoker_yes
0   19  female  27.90         0  Southwest  16884.92           1
1   18    male  33.77         1  Southeast   1725.55           0
2   28    male  33.00         3  Southeast   4449.46           0
3   33    male  22.70         0  Northwest  21984.47           0
4   32    male  28.88         0  Northwest   3866.86           0


In [5]:
#OLS linear regression on the data saved in 'insurance cleaned with dummies':
#smoker_yes is regressed on charges and age
dm_df = pd.read_csv('insurance cleaned with dummies')
ols_formula = 'smoker_yes~charges+age'
model = smf.ols(formula=ols_formula, data=dm_df).fit()
print('\nModel summary: ', model.summary())



Model summary:                              OLS Regression Results                            
Dep. Variable:             smoker_yes   R-squared:                       0.691
Model:                            OLS   Adj. R-squared:                  0.691
Method:                 Least Squares   F-statistic:                     1392.
Date:                Mon, 07 Nov 2022   Prob (F-statistic):          4.59e-318
Time:                        15:27:44   Log-Likelihood:                 100.95
No. Observations:                1246   AIC:                            -195.9
Df Residuals:                    1243   BIC:                            -180.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1424      0.019   

In [16]:
#Logistic regression: model smoker_yes from charges, bmi and age
logit_formula = 'smoker_yes~charges+bmi+age'
model = smf.logit(formula=logit_formula, data=dm_df).fit()
print('\nLogit Model summary: ', model.summary())
print('\nThe higher pseudo R-squared indicates which model better predicts the outcome.')

Optimization terminated successfully.
         Current function value: 0.117579
         Iterations 9

Logit Model summary:                             Logit Regression Results                           
Dep. Variable:             smoker_yes   No. Observations:                 1246
Model:                          Logit   Df Residuals:                     1242
Method:                           MLE   Df Model:                            3
Date:                Mon, 07 Nov 2022   Pseudo R-squ.:                  0.7665
Time:                        15:33:43   Log-Likelihood:                -146.50
converged:                       True   LL-Null:                       -627.36
Covariance Type:            nonrobust   LLR p-value:                3.625e-208
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.1798      1.054      4.915      0.000       3.114       7.245
charge

In [70]:
#Group by region, avg charges & BMI
# Capitalize the region labels (first letter upper-case, the rest lower-case).
ins_df['region'] = ins_df['region'].str.capitalize()
# The two columns are selected with a list: indexing a groupby with a bare
# tuple ('charges','bmi') is deprecated (it raised a FutureWarning) and is
# rejected by pandas >= 2.0.
df = ins_df.groupby('region')[['charges', 'bmi']].mean().round(2).reset_index()
print('\nAvg charges, bmi and smoker by region\n',df)
print('\nSoutheast has the highest bmi & charges among all regions.')


Avg charges, bmi and smoker by region
       region   charges    bmi
0  Northeast  12983.71  29.02
1  Northwest  11952.40  29.14
2  Southeast  14093.93  33.30
3  Southwest  11643.10  30.33

Southeast has the highest bmi & charges among all regions.


  df = ins_df.groupby('region')['charges','bmi'].mean().round(2).reset_index()


In [69]:
#Count the smokers (smoker == 'yes') in each region
smoker_flags_by_region = ins_df.groupby('region')['smoker']
df2 = smoker_flags_by_region.apply(lambda flags: flags.eq('yes').sum()).reset_index()
print('\n# of smoker by region\n', df2)
print('\nSoutheast has the most smokers in this dataset.')


# of smoker by region
       region  smoker
0  Northeast      63
1  Northwest      53
2  Southeast      83
3  Southwest      53

Southeast has the most smokers in this dataset.


In [67]:
#Mean charges for smokers vs. non-smokers, rounded to 2 decimals
mean_charges = ins_df.groupby('smoker')['charges'].mean()
df3 = mean_charges.round(2).reset_index()
print('\nAvg charges of smoker\n', df3)
print("\nSmokers' charges is significantly higher.")


Avg charges of smoker
   smoker   charges
0     no   7959.88
1    yes  31449.65

Smokers' charges is significantly higher.


In [74]:
#####categorize the bmi/ age groups into df
# Both columns are overwritten in place with their category labels; the later
# group-by cells rely on these labels. The dtype guards make the cell safe to
# re-run: pd.cut cannot bin a column that already holds the labels.
if pd.api.types.is_numeric_dtype(ins_df['bmi']):
    # right=False yields [0, 18.5), [18.5, 25), [25, 30), [30, 100), the same
    # thresholds bmi_group() applies to the user's own BMI. The default
    # right-closed bins put 18.5, 25.0 and 30.0 one category lower than
    # bmi_group() does.
    ins_df['bmi'] = pd.cut(
        ins_df['bmi'],
        bins=[0, 18.5, 25, 30, 100],
        right=False,
        labels=['underweight', 'healthy weight', 'overweight', 'obesity'],
    )
if pd.api.types.is_numeric_dtype(ins_df['age']):
    # Right-closed bins (<=27, 28-45, 46-57, 58-60) agree with age_group().
    ins_df['age'] = pd.cut(
        ins_df['age'],
        bins=[0, 27, 45, 57, 60],
        include_lowest=True,
        labels=['Gen Z', 'Millennials', 'Gen X', 'Baby Boomers'],
    )
print(ins_df.head().round(2))

           age     sex             bmi  children smoker     region   charges
0        Gen Z  female      overweight         0    yes  Southwest  16884.92
1        Gen Z    male         obesity         1     no  Southeast   1725.55
2  Millennials    male         obesity         3     no  Southeast   4449.46
3  Millennials    male  healthy weight         0     no  Northwest  21984.47
4  Millennials    male      overweight         0     no  Northwest   3866.86


In [12]:
#Smokers (smoker == 'yes') per age group, largest count first
smokers_per_age = ins_df.groupby('age')['smoker'].apply(lambda flags: flags.eq('yes').sum())
df = smokers_per_age.reset_index().sort_values('smoker', ascending=False)
print('\n#of smoker by age group\n', df)


#of smoker by age group
             age  smoker
1   Millennials     105
0         Gen Z      77
2         Gen X      60
3  Baby Boomers      10


In [13]:
#Mean charges for every (age group, smoker) pair.
#Charges are cast to int before averaging; the mean is then rounded to 2 decimals.
charges_by_group = ins_df.groupby(['age', 'smoker'])["charges"]
df2 = charges_by_group.apply(lambda grp: grp.astype(int).mean()).reset_index()
df2['charges'] = df2['charges'].round(2)
print('\nAverage charges by age group and smoker\n', df2)


Average charges by age group and smoker
             age smoker   charges
0         Gen Z     no   4146.28
1         Gen Z    yes  27456.84
2   Millennials     no   7014.60
3   Millennials    yes  31042.10
4         Gen X     no  11766.75
5         Gen X    yes  35179.78
6  Baby Boomers     no  14032.65
7  Baby Boomers    yes  44079.80


### Supervised Machine Learning

In [14]:
#ML to predict which insurees are smokers
#import packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model_df = pd.read_csv('insurance cleaned with dummies')

#double '[[]]' output a Pandas DataFrame
model_y = model_df[['smoker_yes']]  # dependent variable
model_x = model_df[['age', 'bmi', 'charges']]  # independent variables
print(model_y['smoker_yes'].value_counts())

# Hold out 25% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(model_x, model_y,
                                    test_size=0.25, random_state = 30)

# Flatten the single-column training target to 1-D before fitting.
# This used np.ravel, but numpy is never imported in this notebook, so the cell
# raised NameError on a fresh kernel; pandas' own conversion needs no import.
y_train = y_train.to_numpy().ravel()

classifier = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

# Mean accuracy on the data the model was fitted on vs. the held-out data.
print("training score of model: ")
print(round(classifier.score(x_train, y_train), 4), "\n")

print("testing score of model: ")
print(round(classifier.score(x_test, y_test), 4), "\n")

# Predicted labels for the held-out rows (1 = predicted smoker).
smoker = classifier.predict(x_test)

print("Predictions based on test data: ")
print(smoker)
print("Number predicted to be a smoker: ", sum(smoker))

# Per-class precision / recall / F1 against the true test labels.
print(classification_report(y_test, smoker))



0    994
1    252
Name: smoker_yes, dtype: int64
training score of model: 
0.9336 

testing score of model: 
0.9167 

Predictions based on test data: 
[0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1]
Number predicted to be a smoker:  54
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       250
           1       0.83      0.73      0.78        62

    accuracy  

### Create a user report with an estimated insurance charge

In [76]:
#Ask user's age and validate
# Re-prompts until the input is a whole number greater than zero.
while True:
    try:
        age_g = int(input("Enter your age: "))
        if age_g <= 0:
            # int() happily parses "0" and negative numbers; send them down
            # the same error path as unparsable input.
            raise ValueError("age must be a positive whole number")
    except ValueError:
        print("I'm sorry,the value you entered was invalid!")
        continue
    else:
        break
#Assign an age to its age group
def age_group(self):
    """Return the generation label for an age.

    The function previously ignored its argument and read the global
    ``age_g``; it now classifies the value it is given, so it works for any
    age and does not depend on notebook state.

    Args:
        self: The age to classify. The parameter name is unusual for a plain
            function but is kept so existing calls keep working.

    Returns:
        'Gen Z' for ages up to 27, 'Millennials' for ages above 27 up to 45,
        'Gen X' for ages above 45 up to 57, and 'Baby Boomers' otherwise.
    """
    age = self
    if age <= 27:
        return 'Gen Z'
    if age <= 45:
        return 'Millennials'
    if age <= 57:
        return 'Gen X'
    return 'Baby Boomers'

#Prompt for the smoking status until the lower-cased answer is 'yes' or 'no'
while True:
    smoker = input("Are you a smoker? Please answer with yes or no!: ").lower()
    if smoker in ('yes', 'no'):
        break
    print("Please type yes or no: ")

#Ask user input weight & height and validate
# Each loop re-prompts until the input parses as a finite number greater than
# zero. float() also accepts "nan", "inf", zero and negatives; the chained
# comparison rejects all of them (a zero height would otherwise make the BMI
# calculation divide by zero).
while True:
    try:
        w = float(input("Please enter your weight in lbs (eg: 145.3), separated by a '.' if needed: "))
        if not 0 < w < float('inf'):
            raise ValueError("weight must be a positive, finite number")
    except ValueError:
        print("I'm sorry,the value you entered was not valid!")
        continue
    else:
        break

while True:
    try:
        # The value is used as decimal feet (BMI_Calc multiplies it by 12), so
        # the prompt asks for exactly that. The earlier "feet and inches,
        # separated by a '.'" wording invited 5.11 for 5 ft 11 in, which is
        # read as 5.11 ft (about 5 ft 1 in).
        h = float(input("Please enter your height in feet as a decimal number (eg: 5.75 for 5 ft 9 in): "))
        if not 0 < h < float('inf'):
            raise ValueError("height must be a positive, finite number")
    except ValueError:
        print("I'm sorry,the value you entered was not valid!")
        continue
    else:
        break

#Body-mass index from a weight and a height
def BMI_Calc(w,h):
    """Return the BMI for weight ``w`` and height ``h`` as a float.

    ``h`` is multiplied by 12 and squared, ``w`` is divided by that value and
    the quotient is scaled by 703. The prompts in this notebook collect ``w``
    in lbs and ``h`` in feet.
    """
    height_scaled = h * 12
    bmi = w / height_scaled ** 2 * 703
    return float(bmi)

# The user's BMI, rounded to two decimals
user_BMI = round(BMI_Calc(w, h), ndigits=2)

#Assign a BMI value to its BMI category
def bmi_group(self):
    """Return the weight category for a BMI value.

    The function previously ignored its argument and read the global
    ``user_BMI``; it now classifies the value it is given, so it works for any
    BMI and does not depend on notebook state.

    Args:
        self: The BMI to classify. The parameter name is unusual for a plain
            function but is kept so existing calls keep working.

    Returns:
        'underweight' below 18.5, 'healthy weight' from 18.5 to below 25,
        'overweight' from 25 to below 30, and 'obesity' otherwise.
    """
    bmi = self
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25.0:
        return 'healthy weight'
    if bmi < 30:
        return 'overweight'
    return 'obesity'

#categorize the bmi/ age groups into df
#ins_df.bmi = pd.cut(ins_df.bmi, bins=[0, 18.5, 25, 30, 100], include_lowest=True, labels=['underweight', 'healthy weight', 'overweight', 'obesity'])
#ins_df.age = pd.cut(ins_df.age, bins=[0, 27, 45, 57, 60], include_lowest=True, labels=['Gen Z','Millennials', 'Gen X', 'Baby Boomers'])


#Average charges by age group, smoker status and BMI category, rounded to the nearest 1000
# NOTE: this groups on the category labels, so ins_df.age / ins_df.bmi must
# already have been binned by the categorization cell.
df2 = ins_df.groupby(['age', 'smoker','bmi'])["charges"].apply(lambda x : x.astype(int).mean()).reset_index()
df2.charges = df2.charges.round(-3)

#Classify the user once and reuse the labels for the lookup and the report
user_age_group = age_group(age_g)
user_bmi_group = bmi_group(user_BMI)

#filter out the charges
r1 = df2.loc[df2['age']== user_age_group]
r2 = r1.loc[r1['smoker']== smoker]
r3 = r2.loc[r2['bmi']== user_bmi_group]

#print health report
print('\n\nYour self-reported Health Record')
print('Age:          ',age_g,'\nAge Group:    ',user_age_group)
print('Smoking:      ',smoker)
print('Height:       ',h,'\nWeight:       ',w,'lbs')
print('BMI:          ',user_BMI,'\nBMI Category: ',user_bmi_group)

#Print the estimation result
# The lookup can come back empty (or with a missing value) when the dataset
# has no insuree with this age group / smoking status / BMI category; report
# that instead of printing an empty Series or NaN as a price.
estimate = r3.charges.dropna()
if estimate.empty:
    print('\nSorry, the dataset has no comparable insurees, so no estimate is available.')
else:
    print('\nYour estimate insurance charge is around $',estimate.to_string(index=False), "USD." )
    print('~Your actual expenses will likely vary.~')



Enter your age: 55
Are you a smoker? Please answer with yes or no!: no
Please enter your weight in lbs (eg: 145.3), separated by a '.' if needed: 199.99
Please enter your height in feet and inches, separated by a '.': 5.55


Your self-reported Health Record
Age:           55 
Age Group:     Gen X
Smoking:       no
Height:        5.55 
Weight:        199.99 lbs
BMI:           31.7 
BMI Category:  obesity

Your estimate insurance charge is around $ 12000.0 USD.
~Your actual expenses will likely vary.~
