# CS 3110 Final Project
### Adelaide Bonner

In [45]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

sets = pd.read_csv('https://github.com/A-Bonner/CS3110finalproject/raw/main/sets.csv')
sets = sets.dropna()

def pick_b_up(bs, col, epsilon, default):
    last_result = 0

    for b in bs:
        clipped_sum = sets[col].clip(upper=b).sum()
        result = laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon)
        if result < last_result:
            return b
        else:
            last_result = result
    print('No good clipping parameter found')
    return default

def pick_b_down(bs, col, epsilon, default):
    last_result = 0

    for b in bs:
        clipped_sum = sets[col].clip(lower=b).sum()
        result = laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon)
        if result < last_result:
            return b
        else:
            last_result = result
    print('No good clipping parameter found')
    return default


The goal of this project is to evaluate where and why clipping is and is not useful.  The first area of interest here is the 'num_parts' column, which has a wide range of part numbers from single digits to multiple thousands, but numbers in the high thousands are significantly rarer.  Presumably this makes the column an excellent candidate for clipping, as those higher numbers are outliers.  This is especially true because the dataset only includes sets released up to 2017, which is before LEGO began focusing more heavily on adult-aimed display sets that are typically very large (most of the largest sets ever released have come in the 2020s).

In [46]:
epsilon = 0.1
# first a mean and sum with no clipping
# sum
# using 6000 as sensitivity because no lego set with more than 6000 parts was released before 2017, but two sets came very close
noisy_sum = laplace_mech(sets['num_parts'].sum(), 6000, epsilon)
# mean
noisy_count = laplace_mech(len(sets), 1, epsilon)
noisy_mean = noisy_sum/noisy_count

# then, those same queries with clipping for three different clipping parameters
# the majority of sets can be assumed to be under 1000 parts, so try 1000
noisy_sum_1000 = laplace_mech(sets['num_parts'].clip(upper=1000).sum(), 1000, epsilon)
noisy_count_1000 = laplace_mech(len(sets['num_parts'].clip(upper=1000)), 1, epsilon)
noisy_mean_1000 = noisy_sum_1000/noisy_count_1000

# move higher, because there are still a sizable amount of sets with part numbers in the low thousands, try 3500
noisy_sum_3500 = laplace_mech(sets['num_parts'].clip(upper=3500).sum(), 3500, epsilon)
noisy_count_3500 = laplace_mech(len(sets['num_parts'].clip(upper=3500)), 1, epsilon)
noisy_mean_3500 = noisy_sum_3500/noisy_count_3500

# now use an algorithm to calculate the ideal clipping parameter
bs = range(1000, 6000, 100)
# using 6000 as the default value for the same reason it is used as the sensitivity
b = pick_b_up(bs, 'num_parts', epsilon, 6000)
noisy_sum_b = laplace_mech(sets['num_parts'].clip(upper=b).sum(), b, epsilon)
noisy_count_b = laplace_mech(len(sets['num_parts'].clip(upper=b)), 1, epsilon)
noisy_mean_b = noisy_sum_b/noisy_count_b

# lastly, a comparison of the error for the clipped and unclipped query results
no_noise_sum = sets['num_parts'].sum()
no_noise_mean = no_noise_sum/len(sets)

#no clipping error
no_clip_sum_err = pct_error(no_noise_sum, noisy_sum)
no_clip_mean_err = pct_error(no_noise_mean, noisy_mean)
print('sum without clipping error:', no_clip_sum_err)
print('mean without clipping error:', no_clip_mean_err)

#first param error
sum_err_1000 = pct_error(no_noise_sum, noisy_sum_1000)
mean_err_1000 = pct_error(no_noise_mean, noisy_mean_1000)
print('sum with clipping at 1000 error:', sum_err_1000)
print('mean with clipping at 1000 error:', mean_err_1000)

#second param error
sum_err_3500 = pct_error(no_noise_sum, noisy_sum_3500)
mean_err_3500 = pct_error(no_noise_mean, noisy_mean_3500)
print('sum with clipping at 3500 error:', sum_err_3500)
print('mean with clipping at 3500 error:', mean_err_3500)

#algorithmic param error
sum_err_b = pct_error(no_noise_sum, noisy_sum_b)
mean_err_b = pct_error(no_noise_mean, noisy_mean_b)
print('sum with alogrithmically picked clipping parameter error:', sum_err_b)
print('mean with alogrithmically picked clipping parameter error:', mean_err_b)

sum without clipping error: 1.524469122450864
mean without clipping error: 1.4593268390368328
sum with clipping at 1000 error: 9.800109537882937
mean with clipping at 1000 error: 9.813758907245422
sum with clipping at 3500 error: 1.7280947495292471
mean with clipping at 3500 error: 1.7305521876937537
sum with alogrithmically picked clipping parameter error: 6.502094384997327
mean with alogrithmically picked clipping parameter error: 6.367756682052747


Next, using the same queries as above, the usefulness of clipping for the release year of the sets will be tested.  Notably, however, the clipping parameters tested will be lower bounds, as LEGO's growth as a company has led to more sets being released year after year.  As well, none of the years can be considered outliers in the way that certain sets can be by piece count

In [47]:
# no clipping
# sum
noisy_sum = laplace_mech(sets['year'].sum(), 2017, epsilon)
# mean
noisy_count = laplace_mech(len(sets), 1, epsilon)
noisy_mean = noisy_sum/noisy_count

# trying 1999, the year LEGO first licensed Star Wars
noisy_sum_1999 = laplace_mech(sets['year'].clip(lower=1999).sum(), 18, epsilon)
noisy_count_1999 = laplace_mech(len(sets['year'].clip(lower=1999)), 1, epsilon)
noisy_mean_1999 = noisy_sum_1999/noisy_count_1999

# trying 1978, the year the minifigure was introduced
noisy_sum_1978 = laplace_mech(sets['year'].clip(lower=1978).sum(), 39, epsilon)
noisy_count_1978 = laplace_mech(len(sets['year'].clip(lower=1978)), 1, epsilon)
noisy_mean_1978 = noisy_sum_1978/noisy_count_1978

# using an algorithm
# 1950 used as a default if no good parameter can be found because it is before LEGO sets began existing in a similar fashion to how they are today
bs = range(2017, 1950, 1)
b = pick_b_down(bs, 'year', epsilon, 1950)
noisy_sum_b = laplace_mech(sets['year'].clip(upper=b).sum(), 2017-b, epsilon)
noisy_count_b = laplace_mech(len(sets['year'].clip(upper=b)), 1, epsilon)
noisy_mean_b = noisy_sum_b/noisy_count_b

# comparing the results
no_noise_sum = sets['year'].sum()
no_noise_mean = no_noise_sum/len(sets)

#no clipping error
no_clip_sum_err = pct_error(no_noise_sum, noisy_sum)
no_clip_mean_err = pct_error(no_noise_mean, noisy_mean)
print('sum without clipping error:', no_clip_sum_err)
print('mean without clipping error:', no_clip_mean_err)

#first param error
sum_err_1999 = pct_error(no_noise_sum, noisy_sum_1999)
mean_err_1999 = pct_error(no_noise_mean, noisy_mean_1999)
print('sum with clipping at 1999 error:', sum_err_1999)
print('mean with clipping at 1999 error:', mean_err_1999)

#second param error
sum_err_1978 = pct_error(no_noise_sum, noisy_sum_1978)
mean_err_1978 = pct_error(no_noise_mean, noisy_mean_1978)
print('sum with clipping at 1978 error:', sum_err_1978)
print('mean with clipping at 1978 error:', mean_err_1978)

#algorithmic param error
sum_err_b = pct_error(no_noise_sum, noisy_sum_b)
mean_err_b = pct_error(no_noise_mean, noisy_mean_b)
print('sum with alogrithmically picked clipping parameter error:', sum_err_b)
print('mean with alogrithmically picked clipping parameter error:', mean_err_b)

No good clipping parameter found
sum without clipping error: 0.2145208709936475
mean without clipping error: 0.39411206102600216
sum with clipping at 1999 error: 0.20950669507173186
mean with clipping at 1999 error: 0.03794005912748005
sum with clipping at 1978 error: 0.03083156857608581
mean with clipping at 1978 error: 0.13834319151016491
sum with alogrithmically picked clipping parameter error: 2.5906518710740967
mean with alogrithmically picked clipping parameter error: 2.576550316258654
