In [None]:
'''
Instructions:
In this assignment you will experiment on your own. 
Using a health dataset of your choice (check with us if you are not sure),
write code to demonstrate the following Pandas functions:

Melt
Pivot
Aggregation
Iteration
Groupby

Here are some datasets you can use if you don't have one:
https://archive.ics.uci.edu/ml/datasets/Breast+Cancer
https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
https://archive.ics.uci.edu/ml/datasets/Arrhythmia
Each function demonstration will be for 30 points for a total of 150 points. 
Ensure that you include comments within your code and follow the rubric as a guide. 
Submit using your GitHub site. Ask if you have any questions.
'''

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd

In [17]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI "Breast Cancer" dataset (repository id 14).
breast_cancer = fetch_ucirepo(id=14)

# Features and targets are exposed as separate pandas DataFrames.
X = breast_cancer.data.features
y = breast_cancer.data.targets

# Combine them column-wise so each row carries its features and its target.
df = pd.concat([X, y], axis=1)

# Dataset-level metadata.
print(breast_cancer.metadata)

# Per-variable information.
print(breast_cancer.variables)

# Column names of the combined frame.
print(df.columns.tolist())

{'uci_id': 14, 'name': 'Breast Cancer', 'repository_url': 'https://archive.ics.uci.edu/dataset/14/breast+cancer', 'data_url': 'https://archive.ics.uci.edu/static/public/14/data.csv', 'abstract': 'This breast cancer domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 286, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Mar 07 2024', 'dataset_doi': '10.24432/C51P4M', 'creators': ['Matjaz Zwitter', 'Milan Soklic'], 'intro_paper': None, 'additional_info': {'summary': 'Thi

In [57]:
# Preview the first 10 rows of the dataframe
print(df.head(10))


     age menopause tumor-size inv-nodes node-caps  deg-malig breast  \
0  30-39   premeno      30-34       0-2        no          3   left   
1  40-49   premeno      20-24       0-2        no          2  right   
2  40-49   premeno      20-24       0-2        no          2   left   
3  60-69      ge40      15-19       0-2        no          2  right   
4  40-49   premeno        0-4       0-2        no          2  right   
5  60-69      ge40      15-19       0-2        no          2   left   
6  50-59   premeno      25-29       0-2        no          2   left   
7  60-69      ge40      20-24       0-2        no          1   left   
8  40-49   premeno      50-54       0-2        no          2   left   
9  40-49   premeno      20-24       0-2        no          2  right   

  breast-quad irradiat                 Class  large_tumor  
0    left_low       no  no-recurrence-events         True  
1    right_up       no  no-recurrence-events        False  
2    left_low       no  no-recurrence-

In [56]:
#MELT (reshape wide -> long: one row per original row and feature, keyed by Class)
feature_columns = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]
melt_df = df.melt(
    id_vars=['Class'],
    value_vars=feature_columns,
    var_name='Feature',
    value_name='Value',
)
print('MELT:')
print(melt_df.head())

# Alternative: pass value_vars=df.columns.drop('Class') to melt every column
# except the target instead of listing the feature columns by hand.


MELT:
                  Class Feature  Value
0  no-recurrence-events     age  30-39
1  no-recurrence-events     age  40-49
2  no-recurrence-events     age  40-49
3  no-recurrence-events     age  60-69
4  no-recurrence-events     age  40-49


In [55]:
#PIVOT (number of patients by menopause category and Class)
# Counting the non-null 'age' entries in each menopause/Class cell gives the
# patient count; combinations with no rows are filled with 0.
pivot_df = pd.pivot_table(
    df,
    index='menopause',
    columns='Class',
    values='age',
    aggfunc='count',
    fill_value=0,
)

print("PIVOT:")
print(pivot_df)


PIVOT:
Class      no-recurrence-events  recurrence-events
menopause                                         
ge40                         94                 35
lt40                          5                  2
premeno                     102                 48


In [58]:
#AGGREGATION (number of patients in each age group)
# Tally the rows per age bracket, then order the tally by bracket label.
age_tally = df['age'].value_counts()
age_counts = age_tally.sort_index()

print("AGGREGATION:", age_counts, sep="\n")


AGGREGATION:
age
20-29     1
30-39    36
40-49    90
50-59    96
60-69    57
70-79     6
Name: count, dtype: int64


In [59]:
#ITERATE (flag large tumors: tumor-size of 30-34 or greater)
# Size bins treated as "large"; defined once so the loop does not rebuild the
# collection on every row.
LARGE_TUMOR_SIZES = {'30-34', '35-39', '40-44', '45-49', '50-54', '55-59'}

# Walk the rows with iterrows() and collect one flag per row. The flags are
# gathered in a list and assigned in a single step afterwards, so df is never
# modified while it is being iterated over.
large_tumor_flags = []
for _, row in df.iterrows():
    large_tumor_flags.append(row['tumor-size'] in LARGE_TUMOR_SIZES)

df['large_tumor'] = large_tumor_flags

print("ITERATE:")
print(df[['tumor-size', 'large_tumor']].head())


ITERATE:
  tumor-size  large_tumor
0      30-34         True
1      20-24        False
2      20-24        False
3      15-19        False
4        0-4        False


In [61]:
#GROUPBY (number of patients by breast quadrant and class)
# Group on the (breast-quad, Class) pair and count the rows in each group.
quad_class_groups = df.groupby(['breast-quad', 'Class'])
quad_class_counts = quad_class_groups.size()

print("GROUPBY:")
print(quad_class_counts)


GROUPBY:
breast-quad  Class               
central      no-recurrence-events    17
             recurrence-events        4
left_low     no-recurrence-events    75
             recurrence-events       35
left_up      no-recurrence-events    71
             recurrence-events       26
right_low    no-recurrence-events    18
             recurrence-events        6
right_up     no-recurrence-events    20
             recurrence-events       13
dtype: int64
