In [1]:
import pandas as pd

# Sample records stored column-wise: one list of 24 values per feature.
data = {
    "Pregnancies": [6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1, 3, 8, 7, 9],
    "Glucose": [148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139, 189, 166, 100, 118, 107, 103, 115, 126, 99, 196, 119],
    "BloodPressure": [72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 72, 0, 84, 74, 30, 70, 88, 84, 90, 80],
    "SkinThickness": [35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, 47, 0, 38, 30, 41, 0, 0, 35],
    "Insulin": [0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0, 230, 0, 83, 96, 235, 0, 0, 0],
    "BMI": [33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31, 35.3, 30.5, 0, 37.6, 38, 27.1, 30.1, 25.8, 30, 45.8, 29.6, 43.3, 34.6, 39.3, 35.4, 39.8, 29],
    "DiabetesPedigreeFunction": [0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158, 0.232, 0.191, 0.537, 1.441, 0.398, 0.587, 0.484, 0.551, 0.254, 0.183, 0.529, 0.704, 0.388, 0.451, 0.263],
    "Age": [50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 32, 31, 31, 33, 32, 27, 50, 41, 29],
    "Outcome": [1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1]
}

# Turn the column lists into the working DataFrame used by every later cell.
df = pd.DataFrame(data)

# Quick preview of the top rows.
print("First few rows of the dataset:")
print(df.head())

# Aggregation plan, keyed by column name:
#   - each measurement column is summarised with mean / min / max
#   - Outcome (only 0/1 values in this data) gets mean and sum, i.e. the
#     share and the count of rows equal to 1 within each group
range_stats = ['mean', 'min', 'max']
feature_columns = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
agg_spec = {column: range_stats for column in feature_columns}
agg_spec['Outcome'] = ['mean', 'sum']

# One group per distinct Glucose value; reset_index() moves Glucose back from
# the index into a regular column of the result.
glucose_groups = df.groupby('Glucose')
glucose_agg = glucose_groups.agg(agg_spec).reset_index()

# Show the per-Glucose aggregate table.
print("\nAggregated Data Based on Glucose Levels:")
print(glucose_agg)

# Descriptive statistics (count, mean, std, quartiles, ...) for Glucose alone.
glucose_summary = df['Glucose'].describe()
print("\nSummary Statistics for the 'Glucose' Column:")
print(glucose_summary)


First few rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Aggregated Data Based on Glucose Levels:
   Glucose Pregnancies         BloodPressure         SkinThickness          \
                  mean min max          mean min max          mean min max   
0       78         3.0   3   3          50.0  50  50          32.0  32  32

In [2]:
# Task B: detect and remove fully duplicated rows.

# Boolean mask over df: True where a row repeats an earlier row in every
# column (keep='first' is the pandas default, written out for clarity).
duplicates = df.duplicated(keep='first')

# Show only the rows flagged as repeats (empty frame when there are none).
print("Duplicate Rows:")
print(df.loc[duplicates])

# New frame that keeps the first occurrence of each row; df is not modified.
df_cleaned = df.drop_duplicates(keep='first')

# Show the de-duplicated frame.
print("\nDataFrame after Removing Duplicates:")
print(df_cleaned)

Duplicate Rows:
Empty DataFrame
Columns: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]
Index: []

DataFrame after Removing Duplicates:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0             6      148             72             35        0  33.6   
1             1       85             66             29        0  26.6   
2             8      183             64              0        0  23.3   
3             1       89             66             23       94  28.1   
4             0      137             40             35      168  43.1   
5             5      116             74              0        0  25.6   
6             3       78             50             32       88  31.0   
7            10      115              0              0        0  35.3   
8             2      197             70             45      543  30.5   
9             8      125             96              0        0   0.0   
1

# Task C: keep only the rows whose Glucose value is strictly greater than 120.

# Row-wise boolean mask: True where Glucose > 120.
high_glucose_mask = df['Glucose'].gt(120)
filtered_df = df.loc[high_glucose_mask]

# Show the selected rows.
print("Rows with Glucose value > 120:")
print(filtered_df)

In [4]:
# Task D: correlation between the Age and Glucose columns.

# Pull out the two columns being compared.
age_values = df['Age']
glucose_values = df['Glucose']

# Pearson coefficient (the pandas default method, written out for clarity).
correlation = age_values.corr(glucose_values, method='pearson')

# Report the coefficient.
print("Correlation between Age and Glucose Level:")
print(correlation)

Correlation between Age and Glucose Level:
0.5626614086585768
