# Pandas Exercises

In [1]:
import pandas as pd

# Location of the CSV file and the labels to assign to its nine columns.
path = "diabetes.csv"
names = "preg plas pres skin test mass pedi age class".split()

Read the CSV file above into a pandas DataFrame.

In [None]:
# The file has no header row, so the column labels come from `names`
# (header=None is what pandas already infers when `names` is supplied).
data = pd.read_csv(path, header=None, names=names)

# Preview the first rows, then report the (rows, columns) shape.
print(data.head(), f"\nDataframe shape: {data.shape}", sep="\n")

   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1

Dataframe shape: (768, 9)


Print out the age and class of the 100th patient

In [None]:
# Select the column first and the row second. A whole-row lookup
# (data.iloc[99]) builds a Series spanning every column, which upcasts the
# integer 'age' and 'class' values to float because 'mass' and 'pedi' are
# floats -- that is why the values were displayed as 31.0 and 1.0.
# Position 99 is the 100th patient because positions are zero-based.
print("100th patient:")
print(f"Age: {data['age'].iloc[99]}")
print(f"Class: {data['class'].iloc[99]}")

100th patient:
Age: 31.0
Class: 1.0


Write code to figure out how many people are classified as not having diabetes (i.e., `class` is 0).

In [None]:
# Summing a boolean Series counts its True entries, i.e. the rows with class 0.
no_diabetes = data['class'].eq(0).sum()
print(f"Number of people without diabetes (class = 0): {no_diabetes}")

Number of people without diabetes (class = 0): 500


Write code to figure out how many people are classified as having diabetes (i.e., `class` is 1).

In [None]:
# Summing a boolean Series counts its True entries, i.e. the rows with class 1.
has_diabetes = data['class'].eq(1).sum()
print(f"Number of people with diabetes (class = 1): {has_diabetes}")

Number of people with diabetes (class = 1): 268


What percentage of the people in the study have diabetes?

In [None]:
# Recompute the positive count here so this cell does not depend on the
# previous one having been run.
has_diabetes = data['class'].eq(1).sum()
total_people = data.shape[0]  # number of rows == number of patients

# Share of patients with class 1, expressed as a percentage.
percentage = (has_diabetes / total_people) * 100
print(f"Percentage of people with diabetes: {percentage:.2f}%")

Percentage of people with diabetes: 34.90%


Print out the `"pres"` and `"class"` (no other columns) for the first 5 people

In [None]:
# Keep every row but only the two requested columns, then show the first five.
print(data.loc[:, ['pres', 'class']].head(5))

   pres  class
0    72      1
1    66      0
2    64      1
3    66      0
4    40      1


Run the cell below; it will print out the correlation among the columns.

In [8]:
# Pairwise correlation between all columns (Pearson is the pandas default).
data.corr(method="pearson")

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
preg,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
plas,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
pres,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
skin,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
test,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
mass,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
pedi,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
class,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


What column has the highest correlation with class? Find the mean() value of that column

In [None]:
# Correlation of every column with 'class', strongest positive first.
correlations = data.corr()['class'].sort_values(ascending=False)
print("Correlations with 'class':")
print(correlations)
print()

# 'class' correlates perfectly with itself, so remove it by label before
# picking the maximum. Relying on position (index[1]) silently assumes that
# 'class' always sorts first, which is not guaranteed if another column ties
# at 1.0.
highest_corr_column = correlations.drop('class').idxmax()
print(f"Column with highest correlation to 'class': {highest_corr_column}")
print(f"Mean value of '{highest_corr_column}': {data[highest_corr_column].mean():.2f}")

Correlations with 'class':
class    1.000000
plas     0.466581
mass     0.292695
age      0.238356
preg     0.221898
pedi     0.173844
test     0.130548
skin     0.074752
pres     0.065068
Name: class, dtype: float64

Column with highest correlation to 'class': plas
Mean value of 'plas': 120.89


Create a new dataframe with only people that are above the mean found in the above cell.

In [None]:
# Threshold: the mean of the column most correlated with 'class'
# (highest_corr_column is set in the previous cell).
mean_value = data[highest_corr_column].mean()

# Keep only the rows whose value is strictly greater than that mean.
is_above_mean = data[highest_corr_column].gt(mean_value)
above_mean_df = data.loc[is_above_mean]

print(f"Original dataframe size: {data.shape[0]}")
print(f"Filtered dataframe size (above mean {highest_corr_column}): {above_mean_df.shape[0]}")
print("\nFirst few rows:")
print(above_mean_df.head())

Original dataframe size: 768
Filtered dataframe size (above mean plas): 349

First few rows:
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
2     8   183    64     0     0  23.3  0.672   32      1
4     0   137    40    35   168  43.1  2.288   33      1
8     2   197    70    45   543  30.5  0.158   53      1
9     8   125    96     0     0   0.0  0.232   54      1


Create a new dataframe that only includes people who are above the average value for each column in the dataset. In other words, find the average value for each column, and then find which users have values above all of these average column values.

In [None]:
# Per-column averages, indexed by column name.
means = data.mean()
print("Mean values for each column:")
print(means)
print()

# Compare each cell with its own column's mean, then keep the rows where the
# comparison holds for every column.
mask = data.gt(means, axis='columns').all(axis='columns')
above_all_means_df = data.loc[mask]

print(f"Number of people above average in ALL columns: {above_all_means_df.shape[0]}")
print("\nThese exceptional individuals:")
print(above_all_means_df)

Mean values for each column:
preg       3.845052
plas     120.894531
pres      69.105469
skin      20.536458
test      79.799479
mass      31.992578
pedi       0.471876
age       33.240885
class      0.348958
dtype: float64

Number of people above average in ALL columns: 16

These exceptional individuals:
     preg  plas  pres  skin  test  mass   pedi  age  class
43      9   171   110    24   240  45.4  0.721   54      1
152     9   156    86    28   155  34.3  1.189   42      1
159    17   163    72    41   114  40.9  0.817   47      1
175     8   179    72    42   130  32.7  0.719   36      1
206     8   196    76    29   280  37.5  0.605   57      1
215    12   151    70    40   271  41.8  0.742   38      1
236     7   181    84    21   192  35.9  0.586   51      1
259    11   155    76    28   150  33.3  1.353   51      1
375    12   140    82    43   325  39.2  0.528   58      1
424     8   151    78    32   210  42.9  0.516   36      1
458    10   148    84    48   237  37.6  1.0

*Note: the next 3 problems were not explicitly shown in our tutorial—being able to effectively google and look through documentation is a very important coding tool!* ¯\\\_(ツ)_/¯

Ignoring the index (resetting the index to 0) and using the dataframe from the cell above, sort the values based on `"skin"`, `"test"`, and `"pedi"` values in ascending order. Please write this in one line.

In [None]:
# Sort by skin, then test, then pedi (ascending is the default) and renumber the rows from 0.
sorted_df = above_all_means_df.sort_values(['skin', 'test', 'pedi']).reset_index(drop=True)
print(sorted_df)

    preg  plas  pres  skin  test  mass   pedi  age  class
0      7   181    84    21   192  35.9  0.586   51      1
1      9   171   110    24   240  45.4  0.721   54      1
2     11   138    74    26   144  36.1  0.557   50      1
3      5   187    76    27   207  43.6  1.034   53      1
4     11   155    76    28   150  33.3  1.353   51      1
5      9   156    86    28   155  34.3  1.189   42      1
6      7   150    78    29   126  35.2  0.692   54      1
7      8   196    76    29   280  37.5  0.605   57      1
8      8   151    78    32   210  42.9  0.516   36      1
9     12   151    70    40   271  41.8  0.742   38      1
10    17   163    72    41   114  40.9  0.817   47      1
11     8   179    72    42   130  32.7  0.719   36      1
12     7   168    88    42   321  38.2  0.787   40      1
13    12   140    82    43   325  39.2  0.528   58      1
14     9   145    80    46   130  37.9  0.637   40      1
15    10   148    84    48   237  37.6  1.001   51      1


Now let's find the 4 largest values for `"age"`

In [None]:
# Rows holding the four highest 'age' values, oldest first.
largest_ages = data.nlargest(n=4, columns='age')
print("4 largest values for 'age':")
print(largest_ages)

4 largest values for 'age':
     preg  plas  pres  skin  test  mass   pedi  age  class
459     9   134    74    33    60  25.9  0.460   81      0
453     2   119     0     0     0  19.6  0.832   72      0
666     4   145    82    18     0  32.5  0.235   70      1
123     5   132    80     0     0  26.8  0.186   69      0


Now find the smallest 4 values for `"pedi"` and `"plas"` in reverse order (smallest to largest).

In [None]:
# Four rows with the lowest 'pedi', using 'plas' to break ties; the result is
# ordered smallest to largest. keep='first' is the default, so it is omitted.
smallest_values = data.nsmallest(n=4, columns=['pedi', 'plas'])
print("4 smallest values for 'pedi' and 'plas' (in reverse order):")
print(smallest_values)

4 smallest values for 'pedi' and 'plas' (in reverse order):
     preg  plas  pres  skin  test  mass   pedi  age  class
268     0   102    52     0     0  25.1  0.078   21      0
180     6    87    80     0     0  23.2  0.084   32      0
149     2    90    70    17     0  27.3  0.085   22      0
567     6    92    62    32   126  32.0  0.085   46      0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f3d2e979-42f4-4fa4-a66a-116a1c2662d5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>