# Integral Data Exercise for Senior Data Scientist position
Bryan Gonzalez, PhD


In [1]:
import pandas as pd
import numpy as np
import random
import string
from sklearn.cluster import KMeans


In [7]:
# Seed both RNGs used below so the synthetic dataset -- and every later draw
# that falls back on NumPy's global RNG (e.g. DataFrame.sample) -- is the same
# on each Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

n_records = 10000

# Synthetic patient records: one list of length n_records per column.
dat = {
    # Base name plus "_<random lowercase letter>" (e.g. "Alice_y") to add some variability.
    'first_name': [
        random.choice(['John', 'Jane', 'Alice', 'Bob']) + '_' + random.choice(string.ascii_lowercase)
        for _ in range(n_records)
    ],
    'last_name': [
        random.choice(['Doe', 'Smith', 'Johnson', 'Brown']) + '_' + random.choice(string.ascii_lowercase)
        for _ in range(n_records)
    ],
    # np.random.randint excludes the upper bound: zip codes are 10000..99998,
    # ages 18..98 and lab results 60..109.
    'zip_code': np.random.randint(10000, 99999, n_records).tolist(),
    'age': np.random.randint(18, 99, n_records).tolist(),
    'sex': np.random.choice(['M', 'F'], n_records).tolist(),
    'hospital_name': np.random.choice(['General Hospital', 'City Hospital', 'Central Clinic', 'County Hospital'], n_records).tolist(),
    'lab_result': np.random.randint(60, 110, n_records).tolist(),
}
bigdata = pd.DataFrame(dat)

In [3]:
# Scratch cell: stratified sample of `bigdata` over quantile bins of `target`.
target = 'zip_code'
size = .2   # fraction of the rows to draw
nbins = 5   # number of quantile strata requested

# labels=False returns integer bin codes; duplicates='drop' merges repeated
# bin edges, so fewer than `nbins` strata are possible.
strata = pd.qcut(bigdata[target], q=nbins, duplicates='drop', labels=False)

# Total rows to draw. Derived from `size` and from the frame itself rather than
# a literal .2 and the global n_records, so the cell cannot drift out of sync
# with the data it samples.
n_rows = len(bigdata)
sampsize = n_rows * size

# Proportional allocation: each stratum contributes its share of `sampsize`,
# rounded to the nearest integer. reset_index() (without drop) keeps the
# original row labels in an 'index' column.
xx = (
    bigdata
    .groupby(strata, group_keys=False)
    .apply(lambda x: x.sample(int(np.rint(sampsize * len(x) / n_rows))))
    .reset_index()
)
xx

Unnamed: 0,index,first_name,last_name,zip_code,age,sex,hospital_name,lab_result
0,48,Alice_y,Doe_e,16809,65,M,County Hospital,101
1,61,Jane_p,Brown_c,24251,72,F,County Hospital,64
2,42,Jane_e,Johnson_e,23984,77,F,Central Clinic,87
3,87,Bob_b,Brown_z,13190,36,M,City Hospital,64
4,49,Alice_g,Doe_p,28176,76,M,City Hospital,105
5,34,John_s,Doe_c,40159,84,F,General Hospital,73
6,63,John_u,Brown_p,38388,35,F,City Hospital,66
7,26,Alice_k,Smith_h,36306,89,M,Central Clinic,93
8,25,Bob_i,Brown_b,57597,60,F,Central Clinic,71
9,98,Alice_j,Doe_i,52084,76,F,County Hospital,96


In [None]:
# Scratch cell: cluster `bigdata` on one column and compute each cluster's
# share of the rows.
n_clusters = 4
target_col = 'age'
size = .2

# KMeans expects a 2-D feature matrix; selecting with a list keeps a
# one-column DataFrame (a bare Series is 1-D and is rejected by fit).
X = bigdata[[target_col]].dropna()
kmeans = KMeans(n_clusters=n_clusters).fit(X)

# Keep the labels in their own Series instead of writing a 'cluster' column
# into `bigdata`, so later cells still see the unmodified frame.
cluster_labels = pd.Series(kmeans.predict(X), index=X.index, name='cluster')

# Calculate size of each cluster
cluster_sizes = cluster_labels.value_counts().sort_index()

# Calculate proportion of samples to take from each cluster
# (divide by the number of clustered rows; n_records is an int, not a sequence)
proportions = cluster_sizes / len(X)

In [23]:
class ClinicalData:
    '''
    Clinical dataset wrapper with sampling and k-anonymity helpers.

    Exact duplicate rows are dropped on construction. Every sampling method
    overwrites ``sample`` and ``sample_method``; ``k_anonymity`` measures the
    most recent sample, or the full data if nothing has been sampled yet.

    Attributes:
        data: pd.DataFrame, full de-duplicated dataset
        n_records: int, number of rows in ``data``
        sample: pd.DataFrame or None, most recent sample
        sample_method: str or None, label of the method that produced ``sample``
        k_anon: int or None, result of the last ``k_anonymity`` call
    '''
    def __init__(self, data):
        # drop_duplicates returns a new frame, so the caller's object is untouched.
        self.data = data.drop_duplicates()
        self.n_records = len(self.data)
        self.sample = None
        self.sample_method = None
        self.k_anon = None

    @staticmethod
    def _as_rng(random_state):
        """Turn an int seed into one RandomState shared by all draws of a call."""
        if random_state is None or isinstance(random_state, np.random.RandomState):
            return random_state
        return np.random.RandomState(random_state)

    def _sample_by_group(self, labels, size, rng):
        """
        Draw round(size * group size) rows from every group of ``self.data``.

        labels: array-like with one group label per row of ``self.data``
                (matched by position); rows labelled NaN are left out
        size: float, fraction of each group to sample
        rng: np.random.RandomState or None, forwarded to ``DataFrame.sample``

        Returns the sampled groups concatenated in sorted label order, keeping
        the original row labels.
        """
        # Grouping by a plain array (not by a column of the frame) keeps every
        # column in each group, independent of how a given pandas version
        # treats grouping columns in groupby.apply.
        pieces = [
            group.sample(n=int(np.rint(size * len(group))), random_state=rng)
            for _, group in self.data.groupby(np.asarray(labels))
        ]
        if not pieces:
            return self.data.iloc[0:0]
        return pd.concat(pieces)

    def randomSample(self, size=.2, random_state=None):
        """
        Draw a simple random sample of the data.

        size: float, fraction of total data to sample
        random_state: int, np.random.RandomState or None; None uses NumPy's
                      global RNG
        """
        self.sample = self.data.sample(frac=size, random_state=self._as_rng(random_state))
        self.sample_method = 'random'
        print(f'{size *100} % of dataset sampled randomly')

    def stratifiedSample(self, target_col, nbins=2, size=.2, random_state=None):
        """
        Perform stratified sampling based on a target column, drawing the same
        fraction of rows from every stratum.

        target_col: column to use for stratification
        nbins: number of quantile strata for a numeric column (ignored for
               non-numeric columns, whose distinct values are the strata)
        size: float, fraction of total data to sample
        random_state: int, np.random.RandomState or None; None uses NumPy's
                      global RNG
        """
        target = self.data[target_col]
        # Bool columns count as numeric in pandas but cannot be quantile-binned,
        # so they are treated as discrete like strings and categoricals.
        is_continuous = (
            pd.api.types.is_numeric_dtype(target)
            and not pd.api.types.is_bool_dtype(target)
        )
        if is_continuous:  # stratification based on continuous variable
            strata = pd.qcut(target, q=nbins, duplicates='drop', labels=False)
        else:  # stratification based on discrete variable
            strata = target

        # Allocation depends only on this instance's data, never on a
        # notebook-level n_records global.
        sampled = self._sample_by_group(strata, size, self._as_rng(random_state))
        self.sample = sampled.reset_index(drop=True)
        self.sample_method = 'stratified_'+ target_col
        # Report the strata actually used: duplicates='drop' can yield fewer
        # than nbins, and discrete columns do not use nbins at all.
        n_levels = strata.nunique()
        print(f'{size *100} % of dataset sampled with stratification by {target_col} into {n_levels} levels. ')

    def clusterSample(self, n_clusters, target_col, size=.2, random_state=None):
        """
        Perform clustered sampling: KMeans-cluster the rows on a target column,
        then draw the same fraction of rows from every cluster.

        n_clusters: number of clusters to use
        target_col: column to use for clustering
        size: float, fraction of total data to sample
        random_state: int, np.random.RandomState or None; used for both KMeans
                      and the row draws. None uses NumPy's global RNG

        Rows whose target value is missing are not clustered and are never
        sampled.
        """
        rng = self._as_rng(random_state)

        # Fit KMeans clustering model on the rows that have a target value
        has_value = self.data[target_col].notna().to_numpy()
        features = self.data.loc[has_value, [target_col]]
        kmeans = KMeans(n_clusters=n_clusters, random_state=rng).fit(features)

        # Keep cluster labels in a separate array instead of a temporary
        # 'cluster' column, so self.data is never mutated and a real column of
        # that name is not overwritten and dropped.
        labels = np.full(len(self.data), np.nan)
        labels[has_value] = kmeans.predict(features)

        # Sample from each cluster, rounding per-cluster counts the same way
        # stratifiedSample does.
        self.sample = self._sample_by_group(labels, size, rng)
        self.sample_method = 'cluster_'+ target_col
        print(f'{size *100} % of dataset sampled with clustering over {target_col} into {n_clusters} clusters. ')

    def k_anonymity(self, cols=('zip_code', 'age')):
        """
        Compute and print k-anonymity: the size of the smallest group of rows
        sharing the same values in ``cols``.

        Uses the most recent sample if one has been drawn, otherwise the full
        data. The result is stored in ``self.k_anon`` (0 when there are no
        groups); nothing is returned.

        cols: column name or sequence of column names to group on
        """
        # groupby treats a tuple as a single key, so always pass a list.
        cols = [cols] if isinstance(cols, str) else list(cols)
        if self.sample_method is not None:
            frame, n = self.sample, 'sampled'
        else:
            frame, n = self.data, 'full'
        counts = frame.groupby(cols).size()
        self.k_anon = int(counts.min()) if len(counts) else 0
        print(f'K Anonymity of {n} data is :{self.k_anon}')

## 1. K-Anonymity with Random Sampling

In [27]:
# Baseline: simple random 20% sample, then k-anonymity over the default
# columns of k_anonymity.
cd = ClinicalData(bigdata)
cd.randomSample(size=0.2)
cd.k_anonymity()

20.0 % of dataset sampled randomly
K Anonymity of sampled data is :1


## 2. K-Anonymity with Stratified Sampling by Age


In [25]:
# Fresh wrapper, 20% sample stratified over five age bins, then k-anonymity
# over the default columns of k_anonymity.
cd = ClinicalData(bigdata)
cd.stratifiedSample(target_col='age', nbins=5, size=0.2)
cd.k_anonymity()

20.0 % of dataset sampled with stratification by age into 5 levels. 
K Anonymity of sampled data is :1


## 3. K-Anonymity with Cluster Sampling by ZIP Code


In [26]:
# Fresh wrapper, 20% sample drawn from ten KMeans clusters of zip_code, then
# k-anonymity over the default columns of k_anonymity.
cd = ClinicalData(bigdata)
cd.clusterSample(n_clusters=10, target_col='zip_code', size=0.2)
cd.k_anonymity()

20.0 % of dataset sampled with clustering over zip_code into 10 clusters. 
K Anonymity of sampled data is :1
