<a href="https://colab.research.google.com/github/ChahraBena/Security-For-Big-Data/blob/main/GRR_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#We will use data of different regions, containing gender, names, etc. anonymize them and extract useful statistics as a proof of concept
#importing libraries
import csv 
import pandas as pd
import random
import numpy as np
import math

In [None]:
#Let's get the data
data = pd.read_csv("FIMU_2017_Vhs.csv")
data

Unnamed: 0,Person ID,Name,Gender,Age,Geolife,Visitor category,Region,Sleeping area
0,7645,Andrea Berry,NR,NR,NR,Foreign tourist,United Kingdom,City of Belfort
1,26771,Elizabeth Palmer,NR,NR,NR,Foreign tourist,Germany,NR
2,38302,Jacob Smith,NR,NR,NR,Foreign tourist,Netherlands,City of Belfort
3,28106,Francisco Blankenship,NR,NR,NR,Foreign tourist,United Kingdom,NR
4,72234,Stephanie Bell DDS,NR,NR,NR,Foreign tourist,United Kingdom,NR
...,...,...,...,...,...,...,...,...
88930,6998,Anastasie Begue,F,55-64,NR,French tourist,Rhône-Alpes,NR
88931,10589,Arthur Maillet5778,M,<18,rural worker,French tourist,Rhône-Alpes,Rest of Doubs
88932,10275,Arnaude du Leleu,F,>65,middle-class urban,French tourist,Rhône-Alpes,NR
88933,19080,Christiane Roy,F,55-64,middle-class urban,French tourist,Rhône-Alpes,City of Belfort


In [None]:
#selecting region 
selected_attribute = data.Region
selected_attribute

0        United Kingdom
1               Germany
2           Netherlands
3        United Kingdom
4        United Kingdom
              ...      
88930       Rhône-Alpes
88931       Rhône-Alpes
88932       Rhône-Alpes
88933       Rhône-Alpes
88934       Rhône-Alpes
Name: Region, Length: 88935, dtype: object

In [None]:
#this function is the one that applies GRR to data given eps and data
def GRR(data_to_estimate,eps):
  selected_attribute = data_to_estimate
  #get regions in total 
  regions = selected_attribute.unique()
  #get number of regions
  nb_regions = len(regions)
  d = nb_regions
  p = (math.exp(eps)/(math.exp(eps)+d-1)) #we define p and q according to the formula
  q = 1/(math.exp(eps)+d-1)
  p_dataset = [] #we will generate the estimated frequency of each category
  def select_element(x):
    coin = random.random() #we generate random coins
    if coin<=p:
      p_dataset.append(x) #if <=p then we add the true value
    else:
      other_regions = [r for r in regions if r != x] #we generate a list that excludes that region
      p_dataset.append(random.choice(other_regions)) # we choose randomly from regions excluding the current one
  selected_attribute.apply(lambda x: select_element(x))
  results = pd.DataFrame(p_dataset)
  results.columns=['Frequency']
  return results,p,q

In [None]:
#this function calculates rmse between real values and estimated ones
def rmse(estimation, real):
    return np.sqrt(((estimation - real) ** 2).mean())

In [None]:
#let's generate firstly the true frequency of each region 
true_frequency = pd.DataFrame(selected_attribute.value_counts())
true_frequency = true_frequency.sort_index()
true_frequency.columns=['Frequency']
true_frequency

Unnamed: 0,Frequency
AUTRE 97,45
Alsace,6290
Aquitaine,279
Asia Oceania,52
Auvergne,156
Basse-Normandie,702
Belgium + Luxembourg,36
Bourgogne,1395
Bretagne,231
Centre,422


In [None]:
#lets test our function 
#1st with eps = 0.1
eps1 = 0.1
results_eps1,p,q = GRR(selected_attribute,eps1)
print("p= {} and q= {}".format(p,q))
frequency_eps1=pd.DataFrame(results_eps1.value_counts())
frequency_eps1 = frequency_eps1.sort_index()
frequency_eps1

p= 0.029784822188684966 and q= 0.02695042160586986


Unnamed: 0_level_0,0
Frequency,Unnamed: 1_level_1
AUTRE 97,2392
Alsace,2359
Aquitaine,2460
Asia Oceania,2434
Auvergne,2321
Basse-Normandie,2379
Belgium + Luxembourg,2427
Bourgogne,2349
Bretagne,2336
Centre,2387


In [None]:
#lets calculate rmse 
estimations = frequency_eps1.to_numpy()
true = true_frequency.to_numpy()
rmse_eps1 = rmse(estimations, true)
print(rmse_eps1)

10190.544186195888


In [None]:
#2nd with eps = 1
eps2 = 1
results_eps2,p,q = GRR(selected_attribute,eps2)
print("p= {} and q= {}".format(p,q))
frequency_eps2=pd.DataFrame(results_eps2.value_counts())
frequency_eps2 = frequency_eps2.sort_index()
frequency_eps2

p= 0.07020667498889452 and q= 0.025827592361419596


Unnamed: 0_level_0,0
Frequency,Unnamed: 1_level_1
AUTRE 97,2232
Alsace,2596
Aquitaine,2348
Asia Oceania,2231
Auvergne,2319
Basse-Normandie,2371
Belgium + Luxembourg,2218
Bourgogne,2380
Bretagne,2358
Centre,2331


In [None]:
#lets calculate rmse 
estimations2 = frequency_eps2.to_numpy()
rmse_eps2 = rmse(estimations2, true)
print(rmse_eps2)

9773.759604717665


In [None]:
#2nd with eps = 10
eps3 = 10
results_eps3,p,q = GRR(selected_attribute,eps3)
print("p= {} and q= {}".format(p,q))
frequency_eps3=pd.DataFrame(results_eps3.value_counts())
frequency_eps3 = frequency_eps3.sort_index()
frequency_eps3

p= 0.9983682694248767 and q= 4.532584930898295e-05


Unnamed: 0_level_0,0
Frequency,Unnamed: 1_level_1
AUTRE 97,48
Alsace,6288
Aquitaine,282
Asia Oceania,58
Auvergne,163
Basse-Normandie,704
Belgium + Luxembourg,38
Bourgogne,1399
Bretagne,235
Centre,425


In [None]:
#lets calculate rmse 
estimations3 = frequency_eps3.to_numpy()
rmse_eps3 = rmse(estimations3, true)
print(rmse_eps3)

15.190679930692923
