In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:
filename = ('Data/insurance - insurance (1).csv')
df = pd.read_csv(filename)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Insurance Charges

**Questions:**

1. State Null and Alternative Hypothesis
**Null:** There is no difference in insurance rates between smokers and non smokers

**Alternate:** Smokers had higher insurance rates than non smokers

2. Select the correct test: 
T- test; comparing a numerical feature between two groups. 

In [3]:
#smoker_charges = df.loc[df['smoker'] == 'yes', 'charges']
#nsmoker_charges = df.loc[df['smoker'] == 'no', 'charges']


In [4]:
## Getting means 
#print(f' For smokers (n={len(smoker_charges)}): Mean = {np.mean(smoker_charges):.2f}')
#print(f' For non smokers (n = {len(nsmoker_charges)}): Mean = {np.mean(nsmoker_charges):.2f}')

In [5]:
df['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [6]:
df_smoker = df.loc[df['smoker'] == 'yes'].copy()
df_nsmoker = df.loc[df['smoker'] == 'no'].copy()

In [7]:
smoker_charges = df_smoker['charges']
nsmoker_charges = df_nsmoker['charges']

In [8]:
## Testing each group for outliers
## None present in smokers
zscores_s = stats.zscore(smoker_charges)
outliers_s = abs(zscores_s) >3
np.sum(outliers_s)

0

In [9]:
zscores_ns = stats.zscore(nsmoker_charges)
outliers_ns = abs(zscores_ns) >3
np.sum(outliers_ns)

24

In [10]:
## Remove outliers from non smokers group
nsmoker_charges = nsmoker_charges[(np.abs(stats.zscore(nsmoker_charges)) <3)]

In [11]:
## Testing smokers for normality
## Much smaller than 0.05 but the sample is large enough; not normal dist

result_smoker = stats.normaltest(smoker_charges)
result_smoker

NormaltestResult(statistic=61.03941356533816, pvalue=5.564930630036463e-14)

In [12]:
# Testing non smoker for normality
## Much smaller p value than 0.05 but sample is large enough; not normal dist
result_nsmoker = stats.normaltest(nsmoker_charges)
result_nsmoker

NormaltestResult(statistic=163.80367047789198, pvalue=2.6945416315543976e-36)

In [14]:
## Testing for equal variance
#P value is much smaller than 0.05; samples do not come from populations 
## with equal variance. 
result = stats.levene(smoker_charges, nsmoker_charges)
result


LeveneResult(statistic=520.7468821724297, pvalue=2.4247238784347824e-97)

In [15]:
# Independent t-test with equal_var set to False
## Reject the null and accept there is significant difference in charges. Can accept its higher??
t_result = stats.ttest_ind(smoker_charges, nsmoker_charges, equal_var = False)
t_result

Ttest_indResult(statistic=33.732305987092516, pvalue=2.5753226625873578e-104)