# Data Anonymization Tools 
## Group 8 

In [99]:
!pip install python-dp
import pydp as dp # by convention our package is to be imported as dp (dp for Differential Privacy!)
from pydp.algorithms.laplacian import BoundedSum, BoundedMean, Count, Max
import statistics 
import matplotlib.pyplot as plt
!pip install diffprivlib



In [100]:
import pandas as pd
import numpy as np 
# NOTE(review): this rebinds the alias `dp`, which the first cell already bound
# via `import pydp as dp`. From this cell on, `dp` refers to diffprivlib.models,
# so `dp.GaussianNB` further down is diffprivlib's estimator.
import diffprivlib.models as dp
import numpy as np  # repeated import of numpy (same alias as two lines above)
from sklearn.naive_bayes import GaussianNB

In [101]:
# Read the Facebook user table from the working directory and preview its
# first five rows.
df = pd.read_csv(filepath_or_buffer='facebook.csv')
df.head(n=5)

Unnamed: 0,userid,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
0,2094382,14,19,1999,11,male,266.0,0,0,0,0,0,0,0,0
1,1192601,14,2,1999,11,female,6.0,0,0,0,0,0,0,0,0
2,2083884,14,16,1999,11,male,13.0,0,0,0,0,0,0,0,0
3,1203168,14,25,1999,12,female,93.0,0,0,0,0,0,0,0,0
4,1733186,14,4,1999,12,male,82.0,0,0,0,0,0,0,0,0


In [102]:
# Preview the last five rows of the raw table.
df.tail(n=5)

Unnamed: 0,userid,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
98998,1268299,68,4,1945,4,female,541.0,2118,341,3996,18089,3505,11887,491,6202
98999,1256153,18,12,1995,3,female,21.0,1968,1720,4401,13412,4399,10592,2,2820
99000,1195943,15,10,1998,5,female,111.0,2002,1524,11959,12554,11959,11462,0,1092
99001,1468023,23,11,1990,4,female,416.0,2560,185,4506,6516,4506,5760,0,756
99002,1397896,39,15,1974,5,female,397.0,2049,768,9410,12443,9410,9530,0,2913


In [103]:
# Distinct values of the `age` column, in order of first appearance.
df.loc[:, 'age'].unique()

array([ 14,  13,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  50,  49,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  63,  62,  64,
        65,  67,  66,  68,  69,  70,  71,  73,  72,  74,  76,  78,  79,
        84,  88,  90,  92,  94,  95,  97, 100, 107, 108,  93,  81, 102,
        77,  80,  75,  82, 101, 103,  85,  98, 106,  83,  87,  89,  91,
        96,  86, 112, 113, 104, 105,  99, 111, 109, 110])

In [104]:
# Distinct values of the `dob_year` column, in order of first appearance.
df.loc[:, 'dob_year'].unique()

array([1999, 2000, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990,
       1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979,
       1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968,
       1967, 1966, 1965, 1963, 1964, 1962, 1961, 1960, 1959, 1958, 1957,
       1956, 1955, 1954, 1953, 1952, 1950, 1951, 1949, 1948, 1946, 1947,
       1945, 1944, 1943, 1942, 1940, 1941, 1939, 1937, 1935, 1934, 1929,
       1925, 1923, 1921, 1919, 1918, 1916, 1913, 1906, 1905, 1920, 1932,
       1911, 1936, 1933, 1938, 1931, 1912, 1910, 1928, 1915, 1907, 1930,
       1926, 1924, 1922, 1917, 1927, 1901, 1900, 1909, 1908, 1914, 1902,
       1904, 1903])

In [105]:
# Count missing values per column before any cleaning.
df.isnull().sum(axis=0)

userid                     0
age                        0
dob_day                    0
dob_year                   0
dob_month                  0
gender                   175
tenure                     2
friend_count               0
friendships_initiated      0
likes                      0
likes_received             0
mobile_likes               0
mobile_likes_received      0
www_likes                  0
www_likes_received         0
dtype: int64

In [106]:
# Remove every row that has at least one missing value. The frame is modified
# in place, and the surviving rows keep their original index labels.
df.dropna(axis=0, how='any', inplace=True)

In [107]:
# Re-count missing values per column to confirm that none remain.
df.isnull().sum(axis=0)

userid                   0
age                      0
dob_day                  0
dob_year                 0
dob_month                0
gender                   0
tenure                   0
friend_count             0
friendships_initiated    0
likes                    0
likes_received           0
mobile_likes             0
mobile_likes_received    0
www_likes                0
www_likes_received       0
dtype: int64

## Encoding Categorical Data: Gender

In [108]:
# Replace the textual gender labels with their integer category codes
# (the codes follow the sorted order of the category labels).
df['gender'] = df['gender'].astype('category').cat.codes

In [109]:
# Preview the first five rows after encoding `gender`.
df.head(n=5)

Unnamed: 0,userid,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
0,2094382,14,19,1999,11,1,266.0,0,0,0,0,0,0,0,0
1,1192601,14,2,1999,11,0,6.0,0,0,0,0,0,0,0,0
2,2083884,14,16,1999,11,1,13.0,0,0,0,0,0,0,0,0
3,1203168,14,25,1999,12,0,93.0,0,0,0,0,0,0,0,0
4,1733186,14,4,1999,12,1,82.0,0,0,0,0,0,0,0,0


In [121]:
# (rows, columns) of the cleaned frame, i.e. after rows with missing values were dropped.
df.shape

(98826, 15)

In [110]:
# Column labels of the cleaned frame; the prediction loop below iterates over these.
df.columns

Index(['userid', 'age', 'dob_day', 'dob_year', 'dob_month', 'gender', 'tenure',
       'friend_count', 'friendships_initiated', 'likes', 'likes_received',
       'mobile_likes', 'mobile_likes_received', 'www_likes',
       'www_likes_received'],
      dtype='object')

In [111]:
# Differentially private Gaussian naive Bayes estimator. `dp` here is
# diffprivlib.models (the most recent binding of that alias), not pydp.
# NOTE(review): no `bounds` argument is supplied. Per the diffprivlib docs the
# bounds are then taken from the training data at fit time, which emits a
# PrivacyLeakWarning — confirm and pass explicit bounds if a formal guarantee
# is required.
df_clf = dp.GaussianNB(
    epsilon=0.2,
)

In [112]:
# Accumulator for the per-column prediction arrays produced by the loop below.
private_df = list()

In [113]:
# Build one model-generated replacement column per attribute: every column
# except `userid` is, in turn, treated as the class label and predicted from
# all remaining columns with the DP Gaussian naive Bayes estimator `df_clf`.
#
# The result list is (re)created here so that re-running this cell starts from
# an empty list. Otherwise a second run would append a duplicate set of arrays,
# or call `.append` on the DataFrame that a later cell binds to `private_df`.
private_df = []

for i in df.columns:
    # The identifier column is never used as a prediction target.
    if i == 'userid':
        continue
    # NOTE(review): `userid` still remains among the features in X for every
    # other target — confirm that this is intended.
    X = df.drop(labels=[i], axis=1)
    Y = df[i]
    # The same estimator object is fit again for each target column.
    # NOTE(review): `df_clf` was created with epsilon=0.2 and is fit once per
    # column, so the overall privacy cost is presumably higher than 0.2 —
    # confirm against the intended privacy budget.
    df_clf.fit(X, Y)
    y_pred = df_clf.predict(X)
    print(y_pred)
    private_df.append(y_pred)



[20 20 20 ... 25 18 25]
[ 1  1  1 ... 15  3  3]
[1912 1912 1912 ... 1912 1912 1912]
[6 6 6 ... 6 6 6]
[1 1 1 ... 1 1 1]
[2341. 2341. 2341. ... 2341. 2341. 2341.]
[128 128 128 ... 128 128 128]
[1365 1365 1365 ... 1365 1365 1365]
[111 111 111 ... 111 111 111]
[869 869 869 ... 869 869 869]
[427 427 427 ... 427 427 427]
[856 856 856 ... 856 856 856]
[725 725 725 ... 725 725 725]
[108 108 108 ... 108 108 108]


In [114]:
# Display the list of per-column prediction arrays collected by the loop above.
private_df

[array([20, 20, 20, ..., 25, 18, 25]),
 array([ 1,  1,  1, ..., 15,  3,  3]),
 array([1912, 1912, 1912, ..., 1912, 1912, 1912]),
 array([6, 6, 6, ..., 6, 6, 6]),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int8),
 array([2341., 2341., 2341., ..., 2341., 2341., 2341.]),
 array([128, 128, 128, ..., 128, 128, 128]),
 array([1365, 1365, 1365, ..., 1365, 1365, 1365]),
 array([111, 111, 111, ..., 111, 111, 111]),
 array([869, 869, 869, ..., 869, 869, 869]),
 array([427, 427, 427, ..., 427, 427, 427]),
 array([856, 856, 856, ..., 856, 856, 856]),
 array([725, 725, 725, ..., 725, 725, 725]),
 array([108, 108, 108, ..., 108, 108, 108])]

In [115]:
# Turn the list of prediction arrays into a DataFrame. Each array becomes one
# row, so the frame has one row per predicted attribute and one column per
# record. The name `private_df` is rebound from the list to this frame.
private_df = pd.DataFrame(data=private_df)

In [116]:
# Preview the first five rows (i.e. the first five predicted attributes).
private_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98816,98817,98818,98819,98820,98821,98822,98823,98824,98825
0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,18.0,23.0,18.0,18.0,18.0,19.0,18.0,25.0,18.0,25.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,13.0,19.0,3.0,3.0,7.0,18.0,25.0,15.0,3.0,3.0
2,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,...,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0,1912.0
3,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [117]:
# Flip rows and columns so that each row is a record and each column an attribute.
t2 = private_df.T

In [119]:
# (rows, columns) of the transposed frame: one row per record, one column per predicted attribute.
t2.shape

(98826, 14)

In [122]:
# Label the generated columns with the attribute each one was predicted for.
# The names are derived from `df.columns` (minus the skipped `userid`) instead
# of being retyped by hand, so they follow the same order the prediction loop
# iterated in and cannot drift from the data.
t2.columns = [name for name in df.columns if name != 'userid']

In [125]:
# Display the labelled frame of model-predicted values (pandas truncates the middle rows).
t2

Unnamed: 0,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
0,20.0,1.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
1,20.0,1.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
2,20.0,1.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
3,20.0,1.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
4,20.0,1.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98821,19.0,18.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,236.0,427.0,856.0,725.0,108.0
98822,18.0,25.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,122.0
98823,25.0,15.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
98824,18.0,3.0,1912.0,6.0,1.0,2341.0,128.0,1365.0,111.0,869.0,427.0,856.0,725.0,108.0
