In [2]:
import pandas as pd
import numpy as np
import math
import pickle
import os

In [146]:
data = pd.read_excel(f'./data/master.xlsx') # Load the Excel dataset
print('Shape of the dataset:', data.shape)
data.head(3)

Shape of the dataset: (27820, 12)


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X


In [3]:
##DataFrame for unit testing
country_name_ls = ['Albania'] *6 + ['Russia']*2 + ['India']*2
year_ls = ['2009', '2009','2010', '2011', '2011', '2012', '2010', '2012', '2009', '2010']
# sex_ls = ['m']*4 + ['f']*2 + ['m'] + ['f']*3
sex_ls = ['m']*5 + ['f']*5
test_df = pd.DataFrame()
test_df['Country'] = country_name_ls
test_df['Year'] = year_ls
test_df['Sex'] = sex_ls
print(test_df.info)
print(test_df.shape)

<bound method DataFrame.info of    Country  Year Sex
0  Albania  2009   m
1  Albania  2009   m
2  Albania  2010   m
3  Albania  2011   m
4  Albania  2011   m
5  Albania  2012   f
6   Russia  2010   f
7   Russia  2012   f
8    India  2009   f
9    India  2010   f>
(10, 3)


In [4]:
class preprocess_df:
  '''
  Class to preprocess DataFrame
  '''
  def encode_attributes(self, input_df, column_indices):
    transformed_dicts_ls = []
    transformed_df = input_df.copy(deep = True)
    column_names = transformed_df.columns.tolist()
    # print(f"{column_names = }")
    for col_iter in column_indices:
      temp_dict = {}
      temp_key = 0
      temp_ls = []
      column_name = column_names[col_iter]
      for col in transformed_df.iloc[:,col_iter].tolist():
        if col not in [*temp_dict.keys()]:
          temp_dict[col] = temp_key
          temp_key += 1
        temp_ls.append(temp_dict[col])
      dict_inv = {v:k for k,v in temp_dict.items()}
      transformed_dicts_ls.append(dict_inv)
      transformed_df[column_name] = temp_ls
    return transformed_df, transformed_dicts_ls

In [40]:
#BUC implementation
class buc_external:
    '''
    Class for implementing BUC
    '''
    def __init__(self, df, column_enc_dicts_ls, minsup):
        self.numDims = df.shape[1]
        self.cardinality = []
        self.minsup = minsup
        self.output_df = None
        self.datacounts = [[]] * df.shape[1]
        self.attribute_ls = ["*"] * df.shape[1]
        self.debug_counter = 0
        self.output_dict = {}
        self.column_enc_dicts_ls = column_enc_dicts_ls
        self.file_path = './dfs/iter_run_'

    def counting_sort(self, array_a, df_idx_ls):
      '''
      Inputs 
      array_a: List to be sorted
      df_idx_ls: Index list corresponding to the array_a. For example: DataFrame indices corresponding to array_a.
      Output
      idx_ls: Order in which df_idx_ls should be arranged so that array_a is in the sorted order.
      '''
      array_c = [0]*(max(array_a) + 1)
      idx_ls = [-1] * (len(array_a))

      # print(f"{array_a = }")
      # print(f"{array_c = }")
      for i in range(0, len(array_a)):
        array_c[array_a[i]] += 1

      for i in range(0, len(array_c) - 1):
        array_c[i+1] = array_c[i] + array_c[i+1]

      for i in range(len(array_a) - 1, -1, -1):
        array_c[array_a[i]] = array_c[array_a[i]] - 1
        idx = array_c[array_a[i]]
        idx_ls[idx] = df_idx_ls[i]

      # idx_ls = [i + min_idx for i in idx_ls]
      # print(f"{array_a = }")
      # print(f"{idx_ls = }")
      return idx_ls


    def partition(self, iteration_num, num_splits, d, bigc):
        '''
        Implements partitioning logic i.e sorts the input dataframe and populates self.datacounts
        Inputs:
        input_df: Input DataFrame
        d: column number based on which sorting is performed
        Output:
        input_df: DataFrame which is sorted according to the specified column
        '''
        #Read the entire dataframe and concatenate everything
        #################################################ONLY FOR TESTING: REPLACE THIS WITH EXTERNAL MERGE SORT########################################
        # print(f"Inside partition")
        # print(f"{iteration_num = }")
        # print(f"{num_splits = }")
        ################################################################################################################################################
        df = pd.DataFrame()
        for i_iter in range(0, num_splits):
            if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
                temp_df = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                df = pd.concat([df, temp_df])
            
        input_df = df
        #Sorting the dataframe
        temp_counter_dict = {}
        sorted_idx = self.counting_sort(input_df.iloc[:,d].tolist(), input_df.index.tolist())
        input_df = input_df.reindex(sorted_idx)
        #Populating self.datacounts
        for attribute in input_df.iloc[:,d].tolist():
            temp_counter_dict[attribute] = temp_counter_dict.get(attribute, 0) + 1
        self.datacounts[d] = [*temp_counter_dict.values()]
        
        #Write the sorted files back to the disk
        # split_df = np.array_split(input_df, num_splits)
        with open(f"{self.file_path}_dict_{iteration_num}.pkl", "rb") as fp:
            index_dict_t = pickle.load(fp)

        # print(f"Inside partition")
        # print(f"{index_dict_t = }")
        # print(f"{input_df = }")
        for df_name, df_idxs in index_dict_t.items():
            split_df = pd.DataFrame(input_df.iloc[df_idxs,:])
            split_df.to_pickle(f"{self.file_path}{iteration_num}{df_name}.pkl")
        return None        
        
    def compute_aggregate(self, iteration_num, num_splits):
        count = 0
        for i_iter in range(0,num_splits):
            if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
                df = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                count += df.shape[0]
        return count

    def find_bigc(self, iteration_num, num_splits, d): 
        '''
        Bigc refers to the cardinality of the dth attribute in the dataframe
        '''
        computed_values = []
        bigc = 0
        for i_iter in range(0, num_splits):
            if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
                df = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                for attribute_name in df.iloc[:,d].unique().tolist():
                    if attribute_name not in computed_values:
                        computed_values.append(attribute_name)
                        bigc += 1
        return bigc

    def split_input(self, slice_range, iteration_num):
        num_splits = 2  
        #Delete previous files of current iteration
        for i_iter in range(0, num_splits):
            if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
                os.remove(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                # print(f"Inside remove function...")
                            
        # print(f"{iteration_num = }, {slice_range =}")
        index_dict_t = {} #Stores the indices that are present in each dictionary
        # print(f"{self.file_path}{iteration_num-1}0.pkl")
        if not os.path.exists(f"{self.file_path}{iteration_num-1}0.pkl"):
            #Dimension = 0
            input_df = transformed_df
            split_df = np.array_split(input_df, num_splits)
            start_idx = 0
            for i_iter, df in enumerate(split_df):
                # print(f"DF after splitting: {df}")
                df.to_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                index_dict_t[i_iter] = [*range(start_idx, start_idx + df.shape[0])]
                start_idx += df.shape[0]   
            # print(f"{index_dict_t = }")
            with open(f"{self.file_path}_dict_{iteration_num}.pkl", "wb") as fp:
                pickle.dump(index_dict_t, fp)
        else:  
            if (slice_range[1] - slice_range[0]) < num_splits:
                df_lengths_t = [slice_range[1] - slice_range[0]]
                df_lengths_t.extend([0]*(num_splits-1))
            else:
                df_lengths_t = [math.floor((slice_range[1] - slice_range[0])/num_splits)]*num_splits
                df_lengths_t[-1] += (slice_range[1] - slice_range[0])%num_splits
            # print(f"{df_lengths_t = }")
            df_ranges_t_ls = []
            start_idx = slice_range[0]
            for df_length in df_lengths_t:
                if df_length != 0:
                    df_ranges_t_ls.append([*range(start_idx,start_idx + df_length)])
                    start_idx += df_length
            # print(f"{df_ranges_t_ls = }")
            del start_idx 
            
            with open(f"{self.file_path}_dict_{iteration_num-1}.pkl", "rb") as fp:
                index_dict_tminus = pickle.load(fp)

            # print(f"{index_dict_tminus = }")
            start_dict_idx = 0
            index_dict_t = {}
            # print(f"{index_dict_tminus = }")
            # print(f"{df_ranges_t_ls = }")
            for i_iter, df_range_t in enumerate(df_ranges_t_ls):
                start_df_idx = 0
                # print(f"{df_range_t = }")
                df = pd.DataFrame()
                for df_tminus_name, df_tminus_range in index_dict_tminus.items():
                    # print(f"{df_tminus_range = }")
                    # print(f"{start_df_idx = }")
                    common_elements = [i for i in df_range_t for j in df_tminus_range if i == j]
                    common_elements = [i-start_df_idx for i in common_elements]
                    # print(f"{df_tminus_name = }")
                    # print(f"{common_elements = }")
                    if len(common_elements) > 0:
                        temp_df = pd.read_pickle(f"{self.file_path}{iteration_num-1}{df_tminus_name}.pkl")
                        df = pd.concat([df, temp_df.iloc[common_elements,:]])
                    start_df_idx += len(df_tminus_range)
                df.to_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
                index_dict_t[i_iter] = [*range(start_dict_idx, start_dict_idx + df.shape[0])]
                start_dict_idx += df.shape[0]
                # print(f"{index_dict_t = }")
                with open(f"{self.file_path}_dict_{iteration_num}.pkl", "wb") as fp:
                    pickle.dump(index_dict_t, fp)
                # print(f"{iteration_num = }, {i_iter = }")
                # print(f"{df.info =}")                
        return num_splits

    def populate_attribute_ls(self, k, d, iteration_num):
        with open(f"{self.file_path}_dict_{iteration_num}.pkl", "rb") as fp:
            index_dict_t = pickle.load(fp)

        k_idx = k
        for df_name, df_idxs in index_dict_t.items():
            if k in df_idxs:
                df = pd.read_pickle(f"{self.file_path}{iteration_num}{df_name}.pkl")
                self.attribute_ls[d] = self.column_enc_dicts_ls[d][df.iloc[k_idx,d]]
                return None
            k_idx -= len(df_idxs)
        
        
    
    def buc_implementation(self, slice_range, dim, iteration_num):
        '''
        Function to implement BUC as indicated in the original paper. 
        Populates self.output_dict which is the output dictionary.
        NOTE:All the variable names are exactly as indicated in the original paper.
        Input
        input: Input DataFrame
        dim: Starting column for performing aggregation
        '''
        # print()
        self.debug_counter += 1
        num_splits = self.split_input(slice_range, iteration_num)
       
        # if self.debug_counter == 10:
            # return {}
        # print(f"{self.output_dict}, {self.attribute_ls}")
        # if tuple(self.attribute_ls) in [*self.output_dict.keys()]:
            # print(f"Error!!")
        # print(f"Before aggregate computation")
        # for i_iter in range(0, num_splits):
        #     if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
        #         df_debug = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
        #         print(f"{df_debug.info}")
        aggregate = self.compute_aggregate(iteration_num, num_splits)
        # print(f"AGGREGATE: {aggregate}")
        # print(f"{self.attribute_ls = }")
        self.output_dict[tuple(self.attribute_ls)] = aggregate
        # if self.debug_counter == 10:
            # return 
        # print(f"{dim = }")
        for d in range(dim, self.numDims,1):
            # bigc = input.iloc[:,d].nunique()
            bigc = self.find_bigc(iteration_num, num_splits, d)
            # print(f"{d = }")
            # print(f"{bigc = }")
            # print(f"{d= }, {bigc=}")
            # print(f"Input before partitioning: {input}")
            # print(f"Before partition")
            # for i_iter in range(0, num_splits):
            #     if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
            #         df_debug = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
            #         print(f"{df_debug.info}")
            self.partition(iteration_num, num_splits, d, bigc)
            # print(f"{iteration_num = }")
            # print(f"After partition")
            # for i_iter in range(0, num_splits):
            #     if os.path.exists(f"{self.file_path}{iteration_num}{i_iter}.pkl"):
            #         df_debug = pd.read_pickle(f"{self.file_path}{iteration_num}{i_iter}.pkl")
            #         print(f"{df_debug.info}")
            # return {}
            # print(f"Input after partitioning on {d}: {input}")
            # print(f"{self.datacounts = }")
            k = 0
            for i in range(0, bigc, 1):
                # print(f"{iteration_num = }, {i = }, {bigc = }")
                # print(f"################Inside i loop######################")
                # print(f"{input = }")
                # print(f"{d = }, {i = }")
                # print(f"{self.datacounts = }")
                smallc = self.datacounts[d][i]
                # print(f"{smallc = }")
                if smallc >= self.minsup:
                    # print(f"**********************Inside if condition***********************")
                    # print(f"k:{k}, d:{d}")
                    # print(f"Attribute: {input.iloc[k,d]}")
                    # print(f"{transformed_dicts[d] = }")
                    # input = transformed_df
                    # input = transformed_df
                    # self.attribute_ls[d] = self.column_enc_dicts_ls[d][input.iloc[k,d]]
                    # del input
                    # print(f"{self.attribute_ls = }")
                    # print(f"smallc: {smallc}")
                    # self.buc_implementation(input.iloc[k:k+smallc,:], dim=d+1, iteration_num=iteration_num+1)
                    self.populate_attribute_ls(k, d, iteration_num)
                    self.buc_implementation(slice_range = [k,k+smallc], dim=d+1, iteration_num=iteration_num+1)
                    # if self.debug_counter == 10:
                        # return {}
                    # print(f"d inside if condition: {d}")
                    # print(f"******************************************************************")
                k += smallc
            # print(f"#################################################################")
            # print(f"ALL:")
            # print(f"k:{k}, d:{d}")
            self.attribute_ls[d] = "*"
            # print(f"ALL")

In [41]:
## Parameter
minsup = 1
input_df = test_df
# print(f"input_df: {input_df}")
preprocess_obj = preprocess_df()
transformed_df, column_enc_dicts_ls = preprocess_obj.encode_attributes(input_df, [*range(0,input_df.shape[1])]) #NOTE: This should be modified as required
print(column_enc_dicts_ls)
buc_obj = buc_external(transformed_df, column_enc_dicts_ls, minsup)
buc_obj.buc_implementation([0,transformed_df.shape[0]], 0, 0)
output_dict = buc_obj.output_dict

[{0: 'Albania', 1: 'Russia', 2: 'India'}, {0: '2009', 1: '2010', 2: '2011', 3: '2012'}, {0: 'm', 1: 'f'}]


In [42]:
output_dict_transformed = {}
columns_ls = input_df.columns.tolist()
for column in columns_ls:
    output_dict_transformed[column] = []
    output_dict_transformed['count'] = []
for tuple_key, value in output_dict.items():
    output_dict_transformed['count'].append(value)
    for tuple_key_iter in range(0,len(tuple_key)):
        output_dict_transformed[columns_ls[tuple_key_iter]].append(tuple_key[tuple_key_iter])
# print(f"{output_dict = }")
# print(f"{output_dict_transformed = }")
output_df = pd.DataFrame.from_dict(output_dict_transformed)
# columns_order = ['country', 'year', 'sex', 'age', 'generation', 'suicides_range', 'population_range', 'gdp_per_year_income_range', 'count']
columns_order = ['Country', 'Year', 'Sex', 'count']
output_df = output_df.reindex(columns = columns_order)
output_df

Unnamed: 0,Country,Year,Sex,count
0,*,*,*,10
1,Albania,*,*,6
2,Albania,2009,*,2
3,Albania,2009,m,2
4,Albania,2010,*,1
5,Albania,2010,m,1
6,Albania,2011,*,2
7,Albania,2011,m,2
8,Albania,2012,*,1
9,Albania,2012,f,1
