# Bottom-Up Cube (BUC) Algorithm

In [1]:
%pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from itertools import combinations
from memory_profiler import profile

In [3]:
data = pd.read_csv('Electric_Vehicle_Data.csv')
data.head(5)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,WAUTPBFF4H,King,Seattle,WA,98126.0,2017,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,16,0,34.0,235085336,POINT (-122.374105 47.54468),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0
1,WAUUPBFF2J,Thurston,Olympia,WA,98502.0,2018,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,16,0,22.0,237896795,POINT (-122.943445 47.059252),PUGET SOUND ENERGY INC,53067010000.0
2,5YJSA1E22H,Thurston,Lacey,WA,98516.0,2017,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,210,0,22.0,154498865,POINT (-122.78083 47.083975),PUGET SOUND ENERGY INC,53067010000.0
3,1C4JJXP62M,Thurston,Tenino,WA,98589.0,2021,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,25,0,20.0,154525493,POINT (-122.85403 46.856085),PUGET SOUND ENERGY INC,53067010000.0
4,5YJ3E1EC9L,Yakima,Yakima,WA,98902.0,2020,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,308,0,14.0,225996361,POINT (-120.524012 46.5973939),PACIFICORP,53077000000.0


In [4]:
print(len(data))

181458


In [5]:
class Generalizer:
  def __init__(self, df) -> None:
    self.data = df.copy(deep=True)

  def fill_zero_with_nan(self, target_column):
    self.data.loc[self.data[target_column] == 0, target_column] = np.nan

  def generalise_continous_to_categorical(self, target_column, no_of_categories, labels=None):
    bins = pd.qcut(self.data[target_column], no_of_categories, retbins=True)[1]

    if labels == None:
      labels = [f"{int(bins[i])}-{int(bins[i+1])}" for i in range(len(bins)-1)]
    else:
      labels = [f"{labels[i]} ({int(bins[i])}-{int(bins[i+1])})" for i in range(len(bins)-1)]

    self.data[target_column] = pd.cut(self.data[target_column], bins=bins, include_lowest=True, labels=labels)

  def generalize_year(self):
    self.generalise_continous_to_categorical('Model Year', no_of_categories=3)

  def generalize_range(self):
    self.fill_zero_with_nan('Electric Range')
    self.generalise_continous_to_categorical('Electric Range', no_of_categories=3, labels=['Short Range', 'Medium Range', 'Long Range'] )

  def generalize_Electric_Utility(self):
    self.data.dropna(subset=['Electric Utility'], inplace=True)
    self.data['Electric Utility'] = self.data['Electric Utility'].apply(lambda x: x.split('||')[0])

  def generalize_ev_type(self):
    mp = {
      'Battery Electric Vehicle (BEV)': 'BEV',
      'Plug-in Hybrid Electric Vehicle (PHEV)': 'PHEV'
    }
    self.data['Electric Vehicle Type'] = self.data['Electric Vehicle Type'].apply(lambda x: mp[x])

  def generalize_cafv_eligibility(self):
    mp = {
      'Not eligible due to low battery range': 'NO',
      'Eligibility unknown as battery range has not been researched': 'NO',
      'Clean Alternative Fuel Vehicle Eligible': 'YES'
    }
    self.data['Clean Alternative Fuel Vehicle (CAFV) Eligibility'] = self.data['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].apply(lambda x: mp[x])

  def remove_unused_fields(self):
    unnecessary_fields = ['VIN (1-10)', 'City', 'State', 'Model', 'Postal Code', 'Base MSRP', 'DOL Vehicle ID', 'Vehicle Location', 'Legislative District', '2020 Census Tract']
    self.data.drop(labels=unnecessary_fields, inplace=True, axis=1)

  def generalize(self):
    self.generalize_year()
    self.generalize_range()
    self.generalize_Electric_Utility()
    self.generalize_ev_type()
    self.generalize_cafv_eligibility()

  def get_generalised(self):
    return self.data

In [6]:
def get_pivot_table(df, index_col, other_cols):
  pivot_table = pd.pivot_table(df, index=index_col, columns=other_cols, aggfunc='size', fill_value=0)
  pivot_table['Total'] = pivot_table.sum(axis=1)
  pivot_table.loc['Total'] = pivot_table.sum(axis=0)
  return pivot_table

In [7]:
gen = Generalizer(data)

# print(gen.get_generalised()['Base MSRP'].unique())

gen.remove_unused_fields()
gen.generalize()
temp = gen.get_generalised()

In [8]:
evType_vs_range = get_pivot_table(temp, index_col='Electric Vehicle Type', other_cols='Electric Range')

evType_vs_range

Electric Range,Short Range (6-37),Medium Range (37-208),Long Range (208-337),Total
Electric Vehicle Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BEV,9,19149,28082,47240
PHEV,29143,10342,0,39485
Total,29152,29491,28082,86725


In [9]:
evType_vs_utility = get_pivot_table(temp, index_col='Electric Vehicle Type', other_cols='Electric Utility')

evType_vs_utility

Electric Utility,AVISTA CORP,BONNEVILLE POWER ADMINISTRATION,CITY OF BLAINE - (WA),CITY OF CHENEY - (WA),CITY OF CHEWELAH,CITY OF SEATTLE - (WA),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),CITY OF SUMAS - (WA),CITY OF TACOMA - (WA),MODERN ELECTRIC WATER COMPANY,...,PACIFICORP,PORTLAND GENERAL ELECTRIC CO,PUD NO 1 OF CHELAN COUNTY,PUD NO 1 OF DOUGLAS COUNTY,PUD NO 1 OF OKANOGAN COUNTY,PUD NO 1 OF PEND OREILLE COUNTY,PUD NO 1 OF WHATCOM COUNTY,PUD NO 2 OF GRANT COUNTY,PUGET SOUND ENERGY INC,Total
Electric Vehicle Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BEV,269,25350,153,35,5,1,25633,4,232,827,...,930,4,856,280,68,36,25,415,86303,141970
PHEV,140,9653,40,10,4,0,6379,3,48,327,...,411,1,222,87,37,20,16,170,21672,39485
Total,409,35003,193,45,9,1,32012,7,280,1154,...,1341,5,1078,367,105,56,41,585,107975,181455


In [10]:
evType_vs_cafv = get_pivot_table(temp, index_col='Electric Vehicle Type', other_cols='Clean Alternative Fuel Vehicle (CAFV) Eligibility')

evType_vs_cafv

Clean Alternative Fuel Vehicle (CAFV) Eligibility,NO,YES,Total
Electric Vehicle Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BEV,94739,47231,141970
PHEV,19903,19582,39485
Total,114642,66813,181455


In [11]:
def clean_df(df):
  remove_columns = ['VIN (1-10)','State','Postal Code','Electric Range','Base MSRP', 'Legislative District', 'DOL Vehicle ID','Vehicle Location', '2020 Census Tract']
  df = df.drop(columns=remove_columns)
  df = df.dropna()
  df = df.reset_index(drop=True)
  return df

In [12]:
columns = data.columns
print('Original columns:',columns)
data = clean_df(data)
print('New columns:', data.columns)

Original columns: Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')
New columns: Index(['County', 'City', 'Model Year', 'Make', 'Model',
       'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility',
       'Electric Utility'],
      dtype='object')


In [13]:
print(data.head(10))

      County               City  Model Year       Make     Model  \
0       King            Seattle        2017       AUDI        A3   
1   Thurston            Olympia        2018       AUDI        A3   
2   Thurston              Lacey        2017      TESLA   MODEL S   
3   Thurston             Tenino        2021       JEEP  WRANGLER   
4     Yakima             Yakima        2020      TESLA   MODEL 3   
5   Thurston            Olympia        2023       JEEP  WRANGLER   
6     Kitsap            Keyport        2017  CHEVROLET      VOLT   
7  Snohomish  Mountlake Terrace        2020      TESLA   MODEL 3   
8       King            Seattle        2022       AUDI        Q5   
9   Thurston            Olympia        2017  CHEVROLET      VOLT   

                    Electric Vehicle Type  \
0  Plug-in Hybrid Electric Vehicle (PHEV)   
1  Plug-in Hybrid Electric Vehicle (PHEV)   
2          Battery Electric Vehicle (BEV)   
3  Plug-in Hybrid Electric Vehicle (PHEV)   
4          Battery Electri

## BUC Implementation - In Memory

In [14]:
def buc(data, dimensions, min_support=0, prefix=()):
    if len(dimensions) == 0:
        count = len(data)
        if count >= min_support:
            result = {tuple(prefix): count}
        else:
            result = {}
    else:
        dim = dimensions[0]
        rest_dims = dimensions[1:]
        dim_values = data[dim].unique()

        result = {}
        for value in dim_values:
            subset = data[data[dim] == value]
            new_prefix = prefix + ((dim, value),)
            subresult = buc(subset, rest_dims, min_support, new_prefix)
            result.update(subresult)

        if len(dim_values) > 1:
            all_prefix = prefix + ((dim, 'ALL'),)
            all_result = buc(data, rest_dims, min_support, all_prefix)
            result.update(all_result)

    return result

In [15]:
dimensions = [column for column in data.columns]
minsup=10000
measures = ['Count']
df=pd.DataFrame(data)
result = buc(df, dimensions, minsup)

In [16]:
result_list = []
for key, value in result.items():
    row_data = {dim: dim_value for dim, dim_value in key}
    row_data[measures[0]] = value
    result_list.append(row_data)

result_df = pd.DataFrame(result_list, columns=dimensions+measures)
result_df.to_csv('buc_without_optimization.csv', encoding='utf-8', index=False, header=True)
result_df.head(10)

Unnamed: 0,County,City,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Utility,Count
0,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),11347
1,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,ALL,11900
2,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
3,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,ALL,15254
4,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),22829
5,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,ALL,23961
6,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),10955
7,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,ALL,11543
8,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
9,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,ALL,15254


## BUC Implementation - Out-of-Memory

In [22]:
def buc_withpaging(data, dimensions, min_support=0, chunk_size=None):
    def process(data, dimensions, prefix=()):
      if len(dimensions) == 0:
        count = len(data)
        result = {tuple(prefix): count}
      else:
          dim = dimensions[0]
          rest_dims = dimensions[1:]
          dim_values = data[dim].unique()

          result = {}
          for value in dim_values:
              subset = data[data[dim] == value]
              new_prefix = prefix + ((dim, value),)
              subresult = process(subset, rest_dims, new_prefix)
              result.update(subresult)

          if len(dim_values) > 1:
              all_prefix = prefix + ((dim, 'ALL'),)
              all_result = process(data, rest_dims, all_prefix)
              result.update(all_result)

      return result

    result = {}
    i = 0
    for chunk in pd.read_csv('Electric_Vehicle_Data.csv', chunksize=chunk_size):
        print(i)
        i = i+1

        chunk_result = process(chunk, dimensions)

        for key, value in chunk_result.items():
          if key in result:
              result[key] += value
          else:
              result[key] = value

    result = {k: v for k, v in result.items() if v >= min_support}

    return result

In [23]:
dimensions = [column for column in data.columns]
minsup=10000
measures = ['Count']
prefix = ()
chunk_size = 20000
df=pd.DataFrame(data)
result = buc_withpaging(df, dimensions, minsup, chunk_size)

0
1
2
3
4
5
6
7
8
9


In [24]:
result_list = []
for key, value in result.items():
    row_data = {dim: dim_value for dim, dim_value in key}
    row_data[measures[0]] = value
    result_list.append(row_data)

result_df = pd.DataFrame(result_list, columns=dimensions+measures)
result_df.to_csv('buc_with_pagination.csv', encoding='utf-8', index=False, header=True)
result_df.head(10)

Unnamed: 0,County,City,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Utility,Count
0,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),11347
1,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,ALL,11900
2,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
3,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,ALL,15254
4,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),22829
5,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,ALL,23961
6,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),10955
7,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,ALL,11543
8,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
9,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,ALL,15254


## Performance Analysis

### Plot of minsup vs runtime, keeping allotted memory fixed

In [None]:
minsup_list = [1000, 5000, 10000, 20000, 50000]
time_list = []

for minsup in minsup_list:
    start_time = time.time()
    prefix = ()
    chunk_size = 1000
    result = buc_withpaging(df, dimensions, minsup, prefix, chunk_size)
    end_time = time.time()
    time_list.append(end_time - start_time)

plt.plot(minsup_list, time_list)
plt.xlabel('minsup')
plt.ylabel('time')
plt.title('Time vs minsup')
plt.grid(True)
plt.show()

KeyboardInterrupt: 

### Plot of allotted memory vs. runtime, keeping minsup fixed

In [None]:
chunk_size_list = range(1000,3000,200)
time_list = []

for chunk_size in chunk_size_list:
    start_time = time.time()
    prefix = ()
    minsup = 10000
    result = buc_withpaging(df, dimensions, minsup, prefix, chunk_size)
    end_time = time.time()
    time_list.append(end_time - start_time)

plt.plot(chunk_size_list, time_list)
plt.xlabel('chunk_size')
plt.ylabel('time')
plt.title('Time vs chunk_size')
plt.grid(True)
plt.show()

## Optimization Technique

## BUC - Apriori Pruning

In [17]:
def buc_pruning(data, dimensions, min_support=0, prefix=()):
    if len(dimensions) == 0:
        count = len(data)
        if count >= min_support:
            result = {tuple(prefix): count}
        else:
            result = {}
    else:
        dim = dimensions[0]
        rest_dims = dimensions[1:]
        dim_values = data[dim].unique()

        result = {}
        for value in dim_values:
            subset = data[data[dim] == value]
            count = len(subset)
            # Prune branches where the count is already less than min_support
            if count >= min_support:
                new_prefix = prefix + ((dim, value),)
                subresult = buc_pruning(subset, rest_dims, min_support, new_prefix)
                result.update(subresult)

        # Prune branches where the count is already less than min_support
        if len(dim_values) > 1:
            all_prefix = prefix + ((dim, 'ALL'),)
            all_count = len(data)
            if all_count >= min_support:
                all_result = buc_pruning(data, rest_dims, min_support, all_prefix)
                result.update(all_result)

    return result

In [18]:
dimensions = [column for column in data.columns]
minsup=10000
measures = ['Count']
df=pd.DataFrame(data)
result = buc_pruning(df, dimensions, minsup)

In [19]:
result_list = []
for key, value in result.items():
    row_data = {dim: dim_value for dim, dim_value in key}
    row_data[measures[0]] = value
    result_list.append(row_data)

result_df = pd.DataFrame(result_list, columns=dimensions+measures)
# result_df = result_df.sort_values(by=measures[0], ascending=False)
result_df.to_csv('buc_with_pruning.csv', encoding='utf-8', index=False, header=True)
result_df.head(10)

Unnamed: 0,County,City,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Utility,Count
0,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),11347
1,King,Seattle,ALL,TESLA,ALL,Battery Electric Vehicle (BEV),ALL,ALL,11900
2,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
3,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,ALL,15254
4,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),22829
5,King,Seattle,ALL,ALL,ALL,Battery Electric Vehicle (BEV),ALL,ALL,23961
6,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),10955
7,King,Seattle,ALL,ALL,ALL,ALL,Clean Alternative Fuel Vehicle Eligible,ALL,11543
8,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),14556
9,King,Seattle,ALL,ALL,ALL,ALL,Eligibility unknown as battery range has not b...,ALL,15254
