In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## **General Information**

In [5]:
df.shape

(1338, 7)

In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Print the count of non-empty values for each feature
print('age: ' + str(df.age.count()))
print('sex: ' + str(df.sex.count()))
print('bmi: ' + str(df.bmi.count()))
print('children: ' + str(df.children.count()))
print('smoker: ' + str(df.smoker.count()))
print('region: ' + str(df.region.count()))
print('charges: ' + str(df.charges.count()))

print('\n')

# Print the number of unique values for each feature
print('age: ' + str(df.age.nunique()))
print('sex: ' + str(df.sex.nunique()))
print('bmi: ' + str(df.bmi.nunique()))
print('children: ' + str(df.children.nunique()))
print('smoker: ' + str(df.smoker.nunique()))
print('region: ' + str(df.region.nunique()))
print('charges: ' + str(df.charges.nunique()))

age: 1338
sex: 1338
bmi: 1338
children: 1338
smoker: 1338
region: 1338
charges: 1338


age: 47
sex: 2
bmi: 548
children: 6
smoker: 2
region: 4
charges: 1337


In [8]:
print(df.nunique())
print()
print(df.dtypes)

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [9]:
print('age: ' + str(df.age.dtype))
print('sex: ' + str(df.sex.dtype))
print('bmi: ' + str(df.bmi.dtype))
print('children: ' + str(df.children.dtype))
print('smoker: ' + str(df.smoker.dtype))
print('region: ' + str(df.region.dtype))
print('charges: ' + str(df.charges.dtype))

age: int64
sex: object
bmi: float64
children: int64
smoker: object
region: object
charges: float64


In [10]:
print('age: ' + str(pd.api.types.is_numeric_dtype(df.age)))
print('sex: ' + str(pd.api.types.is_numeric_dtype(df.sex)))
print('bmi: ' + str(pd.api.types.is_numeric_dtype(df.bmi)))
print('children: ' + str(pd.api.types.is_numeric_dtype(df.children)))
print('smoker: ' + str(pd.api.types.is_numeric_dtype(df.smoker)))
print('region: ' + str(pd.api.types.is_numeric_dtype(df.region)))
print('charges: ' + str(pd.api.types.is_numeric_dtype(df.charges)))

age: True
sex: False
bmi: True
children: True
smoker: False
region: False
charges: True


In [11]:
print('age: ' + str(df.age.isna().sum()))
print('sex: ' + str(df.sex.isna().sum()))
print('bmi: ' + str(df.bmi.isna().sum()))
print('children: ' + str(df.children.isna().sum()))
print('smoker: ' + str(df.smoker.isna().sum()))
print('region: ' + str(df.region.isna().sum()))
print('charges: ' + str(df.charges.isna().sum()))

age: 0
sex: 0
bmi: 0
children: 0
smoker: 0
region: 0
charges: 0


## **Boundaries and Middle**

______________________________________________________________________________

In [12]:
myList = [1,2,3,4,5,6,7,8,9,10]
print(max(myList))
print(min(myList))

10
1


In [13]:
myList = [1,2,3,4,5,6,7,8,9,10]
print(np.quantile(myList, .25))
print(np.quantile(myList, .50))
print(np.quantile(myList, .75))

3.25
5.5
7.75


In [14]:
df = pd.DataFrame(data=[1,2,3,4,5,6,7,8,9,10], columns=['myList'])
print("\n")
print(df.myList.quantile(.25))
print(df.myList.quantile(.50))
print(df.myList.quantile(.75))



3.25
5.5
7.75


In [15]:
import statistics as stat

myList = [1,2,3,4,5,6,7,8,9,10]
stat.mean(myList)

5.5

In [16]:
# First we create a dataframe to test this on. Notice that we first add 
# the data, then specify a series of column headers and row index names
# (as opposed to index numbers)
df = pd.DataFrame([[10, 20, 30, 40], [7, 14, 21, 28], [55, 15, 8, 12],
[15, 14, 1, 8], [7, 1, 1, 8], [5, 4, 9, 2]],
columns=['Apple', 'Orange', 'Banana', 'Pear'],
index=['Basket1', 'Basket2', 'Basket3', 'Basket4',
'Basket5', 'Basket6'])

# Now print it to see what it looks like
print(df)

         Apple  Orange  Banana  Pear
Basket1     10      20      30    40
Basket2      7      14      21    28
Basket3     55      15       8    12
Basket4     15      14       1     8
Basket5      7       1       1     8
Basket6      5       4       9     2


In [17]:
# Now we can use pandas built in mean() function to see the mean for each column
print(df.mean())

Apple     16.500000
Orange    11.333333
Banana    11.666667
Pear      16.333333
dtype: float64


In [18]:
df.mean(axis="columns")
# OR df.mean(axis=1)

Basket1    25.00
Basket2    17.50
Basket3    22.50
Basket4     9.50
Basket5     4.25
Basket6     5.00
dtype: float64

In [19]:
# Using the same dataset as above
df.median() # or df.median(axis="columns") if you want to view median by columns

Apple      8.5
Orange    14.0
Banana     8.5
Pear      10.0
dtype: float64

In [20]:
# Using the same dataset as above
df.mode() #Or df.mode(axis="columns") if you want to find the mode by columns instead

Unnamed: 0,Apple,Orange,Banana,Pear
0,7,14,1,8


________________________________________________________________________________

In [21]:
df = pd.read_csv('insurance.csv')

In [22]:
print(df.charges.count())
print(df.charges.min())
print(df.charges.max())
print(df.charges.quantile(.25))
print(df.charges.quantile(.50))
print(df.charges.quantile(.75))
print(df.charges.mean())
print(df.charges.median())
print(df.charges.mode())

1338
1121.8739
63770.42801
4740.28715
9382.033
16639.912515
13270.422265141257
9382.033
0    1639.5631
Name: charges, dtype: float64


## **Standard Deviation**

________________________________________________________________________________

In [23]:
myList = [1,2,3,4,5,6,7,8,9,10]
print(np.std(myList, ddof=1)) # The parameter 'ddof=1' is used to change the default std to sample mode (s)

3.0276503540974917


In [24]:
df = pd.DataFrame(data=[1,2,3,4,5,6,7,8,9,10], columns=['numbers'])
print(df.numbers.std())       # Assumes a sample std (s) by default

3.0276503540974917


__________________________________________________________________________

In [25]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [26]:
print(df.charges.std())
print(df['charges'].std())
print("")
print(df['age'].std())
print(df['bmi'].std())
print(df['children'].std())

12110.011236694001
12110.011236694001

14.049960379216154
6.098186911679014
1.205492739781914


## **Normality: Skew, Kurt**

In [27]:
# Using Python list
from scipy.stats import kurtosis, skew
myList = [1,2,2,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,7,7,7,7,7,8,8,8,9,9,10]
print(skew(myList, bias=False))     
print(kurtosis(myList, bias=False))

-0.01972922271337009
-0.03905580479600701


In [28]:
# Using Pandas DataFrame
df = pd.DataFrame(data=[1,2,2,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,7,7,7,7,7,8,8,8,9,9,10], columns=['numbers'])
print(df.numbers.skew())
print(df.numbers.kurt())

-0.01972922271337009
-0.0390558047960079


In [29]:
df = pd.read_csv('insurance.csv')

print(df.charges.skew())
print(df['charges'].skew())
print(df[['charges', 'bmi', 'children', 'age']].skew())
print()
print(df.charges.kurt())
print(df['charges'].kurt())
print(df[['charges', 'bmi', 'children', 'age']].kurt())

1.5158796580240388
1.5158796580240388
charges     1.515880
bmi         0.284047
children    0.938380
age         0.055673
dtype: float64

1.6062986532967907
1.6062986532967907
charges     1.606299
bmi        -0.050732
children    0.202454
age        -1.245088
dtype: float64


## **Practice**

## Practice 1: General Information

In [35]:
df = pd.read_csv('heart_attack_prediction_dataset.csv')

print(df.shape)
print()
print(df.nunique())
print()

(8763, 26)

Patient ID                         8763
Age                                  73
Sex                                   2
Cholesterol                         281
Blood Pressure                     3915
Heart Rate                           71
Diabetes                              2
Family History                        2
Smoking                               2
Obesity                               2
Alcohol Consumption                   2
Exercise Hours Per Week            8763
Diet                                  3
Previous Heart Problems               2
Medication Use                        2
Stress Level                         10
Sedentary Hours Per Day            8763
Income                             8615
BMI                                8763
Triglycerides                       771
Physical Activity Days Per Week       8
Sleep Hours Per Day                   7
Country                              20
Continent                             6
Hemisphere                  

In [36]:
print(df.dtypes)

Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

## Practice 2: Boundaries, Middle, Spread, Normality

Continue your analysis by calculating the range and middle of each feature and finding the following:

Minimum, 25% quantile, median, 75% quantile, max
Mean, mode, standard deviation
Skewness and kurtosis

In [38]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].min())

Age                                   18.000000
Cholesterol                          120.000000
Heart Rate                            40.000000
Diabetes                               0.000000
Family History                         0.000000
Smoking                                0.000000
Obesity                                0.000000
Alcohol Consumption                    0.000000
Exercise Hours Per Week                0.002442
Previous Heart Problems                0.000000
Medication Use                         0.000000
Stress Level                           1.000000
Sedentary Hours Per Day                0.001263
Income                             20062.000000
BMI                                   18.002337
Triglycerides                         30.000000
Physical Activity Days Per Week        0.000000
Sleep Hours Per Day                    4.000000
Heart Attack Risk                      0.000000
dtype: float64


In [39]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].quantile(.25))


Age                                   35.000000
Cholesterol                          192.000000
Heart Rate                            57.000000
Diabetes                               0.000000
Family History                         0.000000
Smoking                                1.000000
Obesity                                0.000000
Alcohol Consumption                    0.000000
Exercise Hours Per Week                4.981579
Previous Heart Problems                0.000000
Medication Use                         0.000000
Stress Level                           3.000000
Sedentary Hours Per Day                2.998794
Income                             88310.000000
BMI                                   23.422985
Triglycerides                        225.500000
Physical Activity Days Per Week        2.000000
Sleep Hours Per Day                    5.000000
Heart Attack Risk                      0.000000
Name: 0.25, dtype: float64


In [40]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].median())


Age                                    54.000000
Cholesterol                           259.000000
Heart Rate                             75.000000
Diabetes                                1.000000
Family History                          0.000000
Smoking                                 1.000000
Obesity                                 1.000000
Alcohol Consumption                     1.000000
Exercise Hours Per Week                10.069559
Previous Heart Problems                 0.000000
Medication Use                          0.000000
Stress Level                            5.000000
Sedentary Hours Per Day                 5.933622
Income                             157866.000000
BMI                                    28.768999
Triglycerides                         417.000000
Physical Activity Days Per Week         3.000000
Sleep Hours Per Day                     7.000000
Heart Attack Risk                       0.000000
dtype: float64


In [41]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].quantile(.75))


Age                                    72.000000
Cholesterol                           330.000000
Heart Rate                             93.000000
Diabetes                                1.000000
Family History                          1.000000
Smoking                                 1.000000
Obesity                                 1.000000
Alcohol Consumption                     1.000000
Exercise Hours Per Week                15.050018
Previous Heart Problems                 1.000000
Medication Use                          1.000000
Stress Level                            8.000000
Sedentary Hours Per Day                 9.019124
Income                             227749.000000
BMI                                    34.324594
Triglycerides                         612.000000
Physical Activity Days Per Week         5.000000
Sleep Hours Per Day                     9.000000
Heart Attack Risk                       1.000000
Name: 0.75, dtype: float64


In [42]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].max())


Age                                    90.000000
Cholesterol                           400.000000
Heart Rate                            110.000000
Diabetes                                1.000000
Family History                          1.000000
Smoking                                 1.000000
Obesity                                 1.000000
Alcohol Consumption                     1.000000
Exercise Hours Per Week                19.998709
Previous Heart Problems                 1.000000
Medication Use                          1.000000
Stress Level                           10.000000
Sedentary Hours Per Day                11.999313
Income                             299954.000000
BMI                                    39.997211
Triglycerides                         800.000000
Physical Activity Days Per Week         7.000000
Sleep Hours Per Day                    10.000000
Heart Attack Risk                       1.000000
dtype: float64


In [43]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].mean())


Age                                    53.707977
Cholesterol                           259.877211
Heart Rate                             75.021682
Diabetes                                0.652288
Family History                          0.492982
Smoking                                 0.896839
Obesity                                 0.501426
Alcohol Consumption                     0.598083
Exercise Hours Per Week                10.014284
Previous Heart Problems                 0.495835
Medication Use                          0.498345
Stress Level                            5.469702
Sedentary Hours Per Day                 5.993690
Income                             158263.181901
BMI                                    28.891446
Triglycerides                         417.677051
Physical Activity Days Per Week         3.489672
Sleep Hours Per Day                     7.023508
Heart Attack Risk                       0.358211
dtype: float64


In [46]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].mode().values[0])


[9.00000000e+01 2.35000000e+02 9.40000000e+01 1.00000000e+00
 0.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 2.44234832e-03 0.00000000e+00 0.00000000e+00 2.00000000e+00
 1.26320578e-03 2.25278000e+05 1.80023366e+01 7.99000000e+02
 3.00000000e+00 1.00000000e+01 0.00000000e+00]


In [47]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].std())


Age                                   21.249509
Cholesterol                           80.863276
Heart Rate                            20.550948
Diabetes                               0.476271
Family History                         0.499979
Smoking                                0.304186
Obesity                                0.500026
Alcohol Consumption                    0.490313
Exercise Hours Per Week                5.783745
Previous Heart Problems                0.500011
Medication Use                         0.500026
Stress Level                           2.859622
Sedentary Hours Per Day                3.466359
Income                             80575.190806
BMI                                    6.319181
Triglycerides                        223.748137
Physical Activity Days Per Week        2.282687
Sleep Hours Per Day                    1.988473
Heart Attack Risk                      0.479502
dtype: float64


In [48]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].skew())


Age                                0.028498
Cholesterol                       -0.000955
Heart Rate                        -0.003227
Diabetes                          -0.639647
Family History                     0.028080
Smoking                           -2.609778
Obesity                           -0.005707
Alcohol Consumption               -0.400174
Exercise Hours Per Week           -0.016387
Previous Heart Problems            0.016664
Medication Use                     0.006620
Stress Level                       0.008389
Sedentary Hours Per Day            0.017974
Income                             0.021792
BMI                                0.035996
Triglycerides                     -0.001915
Physical Activity Days Per Week    0.017822
Sleep Hours Per Day                0.000357
Heart Attack Risk                  0.591538
dtype: float64


In [49]:
print(df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']].kurt())

Age                               -1.213755
Cholesterol                       -1.180246
Heart Rate                        -1.211180
Diabetes                          -1.591215
Family History                    -1.999668
Smoking                            4.812041
Obesity                           -2.000424
Alcohol Consumption               -1.840281
Exercise Hours Per Week           -1.203342
Previous Heart Problems           -2.000179
Medication Use                    -2.000413
Stress Level                      -1.225439
Sedentary Hours Per Day           -1.193479
Income                            -1.181923
BMI                               -1.187977
Triglycerides                     -1.197800
Physical Activity Days Per Week   -1.229552
Sleep Hours Per Day               -1.232354
Heart Attack Risk                 -1.650460
dtype: float64


## Practice 3: DataFrame

Finally, we should place all of this information into a single DataFrame and summarize it and make it look nice.

Create a new DataFrame called 'ha_df'
Add 11 columns in the DataFrame for each of the columns in the original dataset (age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, num)
Create 13 rows in this DataFrame by adding one row that calculates each of the metrics listed in the above tasks except for the count of columns (count of rows, unique values, data type, min, 25%, median, 75%, max, mean, mode, std dev, skew, kurt) for each of the 11 features. Use the name of the feature as the row index label
HINT: You will save yourself a lot of time if you calculate those metrics for all columns of the original dataset in one line (see Not Found for a reminder of how to do this)
HINT: Rather than write 13 x 11 (143) lines of code to calculate each metric for each variable, consider calculating a single metric for all columns (1 x 11) so that you only need 11 lines of code. See the DataFrames chapter on Modifying DataFrames -> Add Rows for an example. However, note that you may need to cast the DataFrame Series of metrics into a native Python list. For example: list(df.count())
Print the DataFrame to verify it looks like this:

In [53]:
df = df[['Age', 
          'Cholesterol', 
          'Heart Rate', 
          'Diabetes', 
          'Family History', 
          'Smoking', 
          'Obesity', 
          'Alcohol Consumption', 
          'Exercise Hours Per Week',  
          'Previous Heart Problems', 
          'Medication Use', 
          'Stress Level', 
          'Sedentary Hours Per Day', 
          'Income', 
          'BMI', 
          'Triglycerides', 
          'Physical Activity Days Per Week', 
          'Sleep Hours Per Day',
          'Heart Attack Risk']]
df.head()
ha_df = pd.DataFrame(columns=[df.columns], index=['Count', 'Unique', 'Type', 'Min', 'Median', 'Mode', 'Mean', 'Std', 'Skew', 'Kurt'])
ha_df.loc['Type'] = list(df.dtypes)
ha_df.loc['Count'] = list(df.count())
ha_df.loc['Unique'] = list(df.nunique())
ha_df.loc['Mode'] = list(df.mode().values[0])
ha_df.loc['Min'] = list(df.min())
ha_df.loc['25%'] = list(df.quantile(.25))
ha_df.loc['Median'] = list(df.median())
ha_df.loc['75%'] = list(df.quantile(.75))
ha_df.loc['Max'] = list(df.max())
ha_df.loc['Mean'] = list(df.mean())
ha_df.loc['Std'] = list(df.std())
ha_df.loc['Skew'] = list(df.skew())
ha_df.loc['Kurt'] = list(df.kurt())
ha_df

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk
Count,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763,8763
Unique,73,281,71,2,2,2,2,2,8763,2,2,10,8763,8615,8763,771,8,7,2
Type,int64,int64,int64,int64,int64,int64,int64,int64,float64,int64,int64,int64,float64,int64,float64,int64,int64,int64,int64
Min,18.0,120.0,40.0,0.0,0.0,0.0,0.0,0.0,0.002442,0.0,0.0,1.0,0.001263,20062.0,18.002337,30.0,0.0,4.0,0.0
Median,54.0,259.0,75.0,1.0,0.0,1.0,1.0,1.0,10.069559,0.0,0.0,5.0,5.933622,157866.0,28.768999,417.0,3.0,7.0,0.0
Mode,90.0,235.0,94.0,1.0,0.0,1.0,1.0,1.0,0.002442,0.0,0.0,2.0,0.001263,225278.0,18.002337,799.0,3.0,10.0,0.0
Mean,53.707977,259.877211,75.021682,0.652288,0.492982,0.896839,0.501426,0.598083,10.014284,0.495835,0.498345,5.469702,5.99369,158263.181901,28.891446,417.677051,3.489672,7.023508,0.358211
Std,21.249509,80.863276,20.550948,0.476271,0.499979,0.304186,0.500026,0.490313,5.783745,0.500011,0.500026,2.859622,3.466359,80575.190806,6.319181,223.748137,2.282687,1.988473,0.479502
Skew,0.028498,-0.000955,-0.003227,-0.639647,0.02808,-2.609778,-0.005707,-0.400174,-0.016387,0.016664,0.00662,0.008389,0.017974,0.021792,0.035996,-0.001915,0.017822,0.000357,0.591538
Kurt,-1.213755,-1.180246,-1.21118,-1.591215,-1.999668,4.812041,-2.000424,-1.840281,-1.203342,-2.000179,-2.000413,-1.225439,-1.193479,-1.181923,-1.187977,-1.1978,-1.229552,-1.232354,-1.65046


Create an inverted version of the same matrix table you just created that looks like this:

In [54]:
ha_df = pd.DataFrame(columns=['Count', 'Unique', 'Type', 'Min', 'Median', 'Mode', 'Mean', 'Std', 'Skew', 'Kurt'], index=[df.columns])
ha_df['Type'] = list(df.dtypes)
ha_df['Count'] = list(df.count())
ha_df['Unique'] = list(df.nunique())
ha_df['Mode'] = list(df.mode().values[0])
ha_df['Min'] = list(df.min())
ha_df['25%'] = list(df.quantile(.25))
ha_df['Median'] = list(df.median())
ha_df['75%'] = list(df.quantile(.75))
ha_df['Max'] = list(df.max())
ha_df['Mean'] = list(df.mean())
ha_df['Std'] = list(df.std())
ha_df['Skew'] = list(df.skew())
ha_df['Kurt'] = list(df.kurt())
ha_df

Unnamed: 0,Count,Unique,Type,Min,Median,Mode,Mean,Std,Skew,Kurt,25%,75%,Max
Age,8763,73,int64,18.0,54.0,90.0,53.707977,21.249509,0.028498,-1.213755,35.0,72.0,90.0
Cholesterol,8763,281,int64,120.0,259.0,235.0,259.877211,80.863276,-0.000955,-1.180246,192.0,330.0,400.0
Heart Rate,8763,71,int64,40.0,75.0,94.0,75.021682,20.550948,-0.003227,-1.21118,57.0,93.0,110.0
Diabetes,8763,2,int64,0.0,1.0,1.0,0.652288,0.476271,-0.639647,-1.591215,0.0,1.0,1.0
Family History,8763,2,int64,0.0,0.0,0.0,0.492982,0.499979,0.02808,-1.999668,0.0,1.0,1.0
Smoking,8763,2,int64,0.0,1.0,1.0,0.896839,0.304186,-2.609778,4.812041,1.0,1.0,1.0
Obesity,8763,2,int64,0.0,1.0,1.0,0.501426,0.500026,-0.005707,-2.000424,0.0,1.0,1.0
Alcohol Consumption,8763,2,int64,0.0,1.0,1.0,0.598083,0.490313,-0.400174,-1.840281,0.0,1.0,1.0
Exercise Hours Per Week,8763,8763,float64,0.002442,10.069559,0.002442,10.014284,5.783745,-0.016387,-1.203342,4.981579,15.050018,19.998709
Previous Heart Problems,8763,2,int64,0.0,0.0,0.0,0.495835,0.500011,0.016664,-2.000179,0.0,1.0,1.0


Sort the inverted DataFrame based on the Mean column:

In [55]:
ha_df.sort_values(by=['Mean'])

Unnamed: 0,Count,Unique,Type,Min,Median,Mode,Mean,Std,Skew,Kurt,25%,75%,Max
Heart Attack Risk,8763,2,int64,0.0,0.0,0.0,0.358211,0.479502,0.591538,-1.65046,0.0,1.0,1.0
Family History,8763,2,int64,0.0,0.0,0.0,0.492982,0.499979,0.02808,-1.999668,0.0,1.0,1.0
Previous Heart Problems,8763,2,int64,0.0,0.0,0.0,0.495835,0.500011,0.016664,-2.000179,0.0,1.0,1.0
Medication Use,8763,2,int64,0.0,0.0,0.0,0.498345,0.500026,0.00662,-2.000413,0.0,1.0,1.0
Obesity,8763,2,int64,0.0,1.0,1.0,0.501426,0.500026,-0.005707,-2.000424,0.0,1.0,1.0
Alcohol Consumption,8763,2,int64,0.0,1.0,1.0,0.598083,0.490313,-0.400174,-1.840281,0.0,1.0,1.0
Diabetes,8763,2,int64,0.0,1.0,1.0,0.652288,0.476271,-0.639647,-1.591215,0.0,1.0,1.0
Smoking,8763,2,int64,0.0,1.0,1.0,0.896839,0.304186,-2.609778,4.812041,1.0,1.0,1.0
Physical Activity Days Per Week,8763,8,int64,0.0,3.0,3.0,3.489672,2.282687,0.017822,-1.229552,2.0,5.0,7.0
Stress Level,8763,10,int64,1.0,5.0,2.0,5.469702,2.859622,0.008389,-1.225439,3.0,8.0,10.0


Sort the inverted DataFrame based on the Skew column descending:

In [56]:
ha_df.sort_values(by=['Skew'], ascending=False)

Unnamed: 0,Count,Unique,Type,Min,Median,Mode,Mean,Std,Skew,Kurt,25%,75%,Max
Heart Attack Risk,8763,2,int64,0.0,0.0,0.0,0.358211,0.479502,0.591538,-1.65046,0.0,1.0,1.0
BMI,8763,8763,float64,18.002337,28.768999,18.002337,28.891446,6.319181,0.035996,-1.187977,23.422985,34.324594,39.997211
Age,8763,73,int64,18.0,54.0,90.0,53.707977,21.249509,0.028498,-1.213755,35.0,72.0,90.0
Family History,8763,2,int64,0.0,0.0,0.0,0.492982,0.499979,0.02808,-1.999668,0.0,1.0,1.0
Income,8763,8615,int64,20062.0,157866.0,225278.0,158263.181901,80575.190806,0.021792,-1.181923,88310.0,227749.0,299954.0
Sedentary Hours Per Day,8763,8763,float64,0.001263,5.933622,0.001263,5.99369,3.466359,0.017974,-1.193479,2.998794,9.019124,11.999313
Physical Activity Days Per Week,8763,8,int64,0.0,3.0,3.0,3.489672,2.282687,0.017822,-1.229552,2.0,5.0,7.0
Previous Heart Problems,8763,2,int64,0.0,0.0,0.0,0.495835,0.500011,0.016664,-2.000179,0.0,1.0,1.0
Stress Level,8763,10,int64,1.0,5.0,2.0,5.469702,2.859622,0.008389,-1.225439,3.0,8.0,10.0
Medication Use,8763,2,int64,0.0,0.0,0.0,0.498345,0.500026,0.00662,-2.000413,0.0,1.0,1.0


Which columns have a problem with skewness? (ANSWER: fbs, restecg, oldpeak, chol, and sex are all outside of the bounds. However, we don’t worry about sex because it is a special case in which the values are binary: 0 or 1)

Create and call a function that allows a user to sort the DataFrame by as many levels as they want to. For example, the following version is sorted first by "Type" and then by "Unique":

In [57]:
# Create the function
def sort_ha_df(column_list):
  return ha_df.sort_values(by=column_list)

# Call the function
sort_ha_df(['Type', 'Unique'])

Unnamed: 0,Count,Unique,Type,Min,Median,Mode,Mean,Std,Skew,Kurt,25%,75%,Max
Diabetes,8763,2,int64,0.0,1.0,1.0,0.652288,0.476271,-0.639647,-1.591215,0.0,1.0,1.0
Family History,8763,2,int64,0.0,0.0,0.0,0.492982,0.499979,0.02808,-1.999668,0.0,1.0,1.0
Smoking,8763,2,int64,0.0,1.0,1.0,0.896839,0.304186,-2.609778,4.812041,1.0,1.0,1.0
Obesity,8763,2,int64,0.0,1.0,1.0,0.501426,0.500026,-0.005707,-2.000424,0.0,1.0,1.0
Alcohol Consumption,8763,2,int64,0.0,1.0,1.0,0.598083,0.490313,-0.400174,-1.840281,0.0,1.0,1.0
Previous Heart Problems,8763,2,int64,0.0,0.0,0.0,0.495835,0.500011,0.016664,-2.000179,0.0,1.0,1.0
Medication Use,8763,2,int64,0.0,0.0,0.0,0.498345,0.500026,0.00662,-2.000413,0.0,1.0,1.0
Heart Attack Risk,8763,2,int64,0.0,0.0,0.0,0.358211,0.479502,0.591538,-1.65046,0.0,1.0,1.0
Sleep Hours Per Day,8763,7,int64,4.0,7.0,10.0,7.023508,1.988473,0.000357,-1.232354,5.0,9.0,10.0
Physical Activity Days Per Week,8763,8,int64,0.0,3.0,3.0,3.489672,2.282687,0.017822,-1.229552,2.0,5.0,7.0


Modify that function to allow the user to choose whether they want to sort ascending or descending:

In [58]:
# Create the function
def sort_ha_df(column_list, direction):
  if direction == 'asc':
    return ha_df.sort_values(by=column_list, ascending=True)
  else:
    return ha_df.sort_values(by=column_list, ascending=False)

# Call the function
# Enter 'asc' for ascending and 'desc' (or anything else) for descending
sort_ha_df(['Type', 'Unique'], 'desc')

Unnamed: 0,Count,Unique,Type,Min,Median,Mode,Mean,Std,Skew,Kurt,25%,75%,Max
Exercise Hours Per Week,8763,8763,float64,0.002442,10.069559,0.002442,10.014284,5.783745,-0.016387,-1.203342,4.981579,15.050018,19.998709
Sedentary Hours Per Day,8763,8763,float64,0.001263,5.933622,0.001263,5.99369,3.466359,0.017974,-1.193479,2.998794,9.019124,11.999313
BMI,8763,8763,float64,18.002337,28.768999,18.002337,28.891446,6.319181,0.035996,-1.187977,23.422985,34.324594,39.997211
Income,8763,8615,int64,20062.0,157866.0,225278.0,158263.181901,80575.190806,0.021792,-1.181923,88310.0,227749.0,299954.0
Triglycerides,8763,771,int64,30.0,417.0,799.0,417.677051,223.748137,-0.001915,-1.1978,225.5,612.0,800.0
Cholesterol,8763,281,int64,120.0,259.0,235.0,259.877211,80.863276,-0.000955,-1.180246,192.0,330.0,400.0
Age,8763,73,int64,18.0,54.0,90.0,53.707977,21.249509,0.028498,-1.213755,35.0,72.0,90.0
Heart Rate,8763,71,int64,40.0,75.0,94.0,75.021682,20.550948,-0.003227,-1.21118,57.0,93.0,110.0
Stress Level,8763,10,int64,1.0,5.0,2.0,5.469702,2.859622,0.008389,-1.225439,3.0,8.0,10.0
Physical Activity Days Per Week,8763,8,int64,0.0,3.0,3.0,3.489672,2.282687,0.017822,-1.229552,2.0,5.0,7.0
