# Pima Data cleaning



In [2]:
# Import Modules Here
import pandas as pd
from attrs.converters import to_bool

## Part 1: Read the Data into Jupyter

In [4]:
# Load the uncleaned Pima diabetes CSV into a dataframe (bound to `df`,
# the name the following cells use) and summarise its columns and dtypes.
df = pd.read_csv(
    filepath_or_buffer="../../Data/diabetes/diabetes_uncleaned_data.csv",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               890 non-null    float64
 1   Glucose                   988 non-null    float64
 2   BloodPressure             982 non-null    object 
 3   SkinThickness             1004 non-null   int64  
 4   Insulin                   1004 non-null   int64  
 5   BMI                       1003 non-null   object 
 6   DiabetesPedigreeFunction  1004 non-null   float64
 7   Age                       982 non-null    float64
 8   Outcome                   1004 non-null   object 
dtypes: float64(4), int64(2), object(3)
memory usage: 70.7+ KB


## Part 2: Missing Data

In [6]:
# Question 1: Find columns with missing data
# Count the missing entries per column; any non-zero count marks a column with gaps.
df.isnull().sum(axis=0)

Pregnancies                 114
Glucose                      16
BloodPressure                22
SkinThickness                 0
Insulin                       0
BMI                           1
DiabetesPedigreeFunction      0
Age                          22
Outcome                       0
dtype: int64

In [7]:
# Question 2: Remove rows with missing data and save the result as pima_missing_fixed_df
# Keep only the rows in which every column holds a value.
pima_missing_fixed_df = df.loc[df.notna().all(axis=1)]

In [8]:
# Question 3: Print out the number of rows in the dataframe.
# The first entry of .shape is the row count.
pima_missing_fixed_df.shape[0]

840

### Question 4:  Discuss why missing data could be a problem.  What other methods could you use in this situation?

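Dropping an entire row because of a single missing value also discards every valid value in that row, so far more data is lost than was actually missing (here 164 of the 1,004 rows were removed). If the missing values are concentrated in one column, it is better to drop that column first and only then drop any remaining incomplete rows.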

Otherwise, instead of dropping, fill the n/a's with a mean or other calculated value that wouldn't induce too much error/bias.

## Part 3: Duplicated Data

In [11]:
# Question 1: Find the duplicated rows in pima_missing_fixed_df and display them.
# duplicated() marks every repeat of a row after its first occurrence.
pima_missing_fixed_df.loc[pima_missing_fixed_df.duplicated(keep="first")]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
925,6.0,39.0,46,2,287,40.4,2.394,60.0,True
926,10.0,67.0,39,12,168,53.4,0.162,71.0,False
999,6.0,110.0,26,27,590,20.8,1.185,95.0,False
1003,6.0,151.0,60,26,362,43.4,2.027,23.0,False


In [12]:
# Question 2: Remove duplicate rows and save the result as pima_missing_fixed_df.
# Keep the first occurrence of each row and discard the later repeats.
pima_missing_fixed_df = pima_missing_fixed_df.loc[~pima_missing_fixed_df.duplicated()]

In [13]:
# Question 3: Print out the number of rows in the dataframe.
# The first entry of .shape is the row count.
pima_missing_fixed_df.shape[0]

836

### Question 4:  In your jupyter notebook, discuss why duplicate data could be a problem.
### What other methods could you use in this situation?

Duplicate data can cause issues by giving more weight to one instance of data than it has in reality. If something that happens once is recorded three times, it still only happened once despite three records of it.

As for other methods, it might be useful to go in and define exactly what counts as duplicate data (for example, which columns have to match), so that you control whether a record is treated as truly unique or not.

## Part 4: Mis-typed Columns/Broken Data


In [16]:
# Question 1: find the columns where most of the data seems to be one type of data, and there seems to be a data error. In the markdown, after exploring in code, add a markdown cell discussing which columns have the data error.

Regarding type issues caused by the string 'Error' appearing in the data, three columns are affected: BloodPressure, BMI, and Outcome.

Blood pressure is better off as an int, bmi as floats, while outcome should be a boolean.

In [18]:
# Question 2: Remove the rows with data errors
# Find the index labels of every row that holds the 'Error' marker in any of
# the three affected columns, then drop all of them in a single pass.
pima_missing_fixed_df = pima_missing_fixed_df.drop(
    index=pima_missing_fixed_df.index[
        pima_missing_fixed_df[['BloodPressure', 'BMI', 'Outcome']].eq('Error').any(axis=1)
    ]
)
len(pima_missing_fixed_df)

833

In [19]:
# Question 3: After the data is fixed in your columns change the columns to the correct type and save this as the pima_fixed_columns_df.
# Build pima_fixed_columns_df as a NEW frame instead of overwriting
# pima_missing_fixed_df. astype() returns a copy, so the input keeps its
# original values and this cell can be re-run safely; overwriting the input
# meant a second run compared booleans with 'TRUE' and set every Outcome to False.
pima_fixed_columns_df = pima_missing_fixed_df.astype({'BloodPressure': 'int64', 'BMI': 'float64'})
# Parse Outcome case-insensitively and ignore surrounding whitespace, so
# 'TRUE', 'True' and an already-boolean True all become True; anything else is False.
pima_fixed_columns_df['Outcome'] = (
    pima_fixed_columns_df['Outcome'].astype(str).str.strip().str.upper().eq('TRUE')
)

In [20]:
# Question 4: Run pima_fixed_columns_df.info() to confirm the columns have changed.
pima_fixed_columns_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 833 entries, 0 to 1001
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               833 non-null    float64
 1   Glucose                   833 non-null    float64
 2   BloodPressure             833 non-null    int64  
 3   SkinThickness             833 non-null    int64  
 4   Insulin                   833 non-null    int64  
 5   BMI                       833 non-null    float64
 6   DiabetesPedigreeFunction  833 non-null    float64
 7   Age                       833 non-null    float64
 8   Outcome                   833 non-null    bool   
dtypes: bool(1), float64(5), int64(3)
memory usage: 59.4 KB


## Part 5: Outlier Detection and Removal


In [22]:
# Question 1: Print out the Outliers in each column in the pima_fixed_columns_df dataframe, use the IQR method of outlier detection.
def is_outlier(column: pd.Series, whisker: float = 1.5) -> pd.Series:
    """Flag the values of a numeric column that lie outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not outliers.

    Args:
        column: Numeric series to inspect.
        whisker: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5.

    Returns:
        A boolean series aligned with ``column``: True where the value is
        below the lower fence or above the upper fence. Missing values (NaN)
        are reported as False rather than as outliers.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # Explicit comparisons instead of ~between(): NaN compares False on both
    # sides, so a missing value is no longer flagged as an outlier.
    return (column < lower_bound) | (column > upper_bound)

# Walk every column except the last one (Outcome) and print the rows whose
# value in that column is flagged by is_outlier.
for column, values in pima_fixed_columns_df.iloc[:, :-1].items():
    print(pima_fixed_columns_df.loc[is_outlier(values)])

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
5       -100.0    142.0             88             39       82  42.7   

   DiabetesPedigreeFunction    Age  Outcome  
5                     1.369  148.0     True  
Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]
Index: []
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
649          3.0    155.0          10000             35      748  32.6   
995          5.0     62.0          10000             13      412  48.7   

     DiabetesPedigreeFunction    Age  Outcome  
649                     0.098   85.0     True  
995                     0.267  101.0    False  
Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]
Index: []
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI  \
1         13.0    151.0            107   

In [42]:
# Question 2: Use loc to remove outliers in each of the columns that have outliers, save this as pima_outlier_removed_df.
# Filter a separate variable so pima_fixed_columns_df is left untouched.
# Re-running this cell then always starts from the same input and gives the
# same result, instead of trimming more rows on every run.
pima_outlier_removed_df = pima_fixed_columns_df
for column in pima_outlier_removed_df.columns[:-1]:  # skip the last column (Outcome)
    # Fences are recomputed on the rows that survived the previous columns'
    # filters, so columns are processed in order, left to right.
    pima_outlier_removed_df = pima_outlier_removed_df.loc[
        ~is_outlier(pima_outlier_removed_df[column])
    ]

In [40]:
# Question 3: Print out the row count in the pima_outlier_removed_df and confirm this number is correct.
# The first entry of .shape is the row count.
pima_outlier_removed_df.shape[0]

829