#### 1. Loading the dataset

In [47]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [48]:

def load_dataset(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    dataset = pd.read_csv(file_path)
    return dataset


# Load the raw records and preview the first five rows.
df = load_dataset('dataset.csv')
df.head(5)

Unnamed: 0,area,postcode,dob,gender,ethnic_group,phone_number,marital_status,qualifications,occupation,income,home_ownership,distance_to_work_km
0,Nottingham,NG9 5SS,1989-04-18,Female,"White: English, Welsh, Scottish, Northern Iris...",447546351661,married,Level 4 (BA+) or above,9. Elementary occupations,29445,1,4.593169
1,Shropshire,SY6 0LT,1989-04-19,Female,"White: English, Welsh, Scottish, Northern Iris...",447979438189,divorced,Level 4 (BA+) or above,"1. Managers, directors and senior officials",110432,1,0.0
2,Wakefield,LS26 0UG,1982-11-04,Female,White: Roma,447950956938,never_married,Other,"8. Process, plant and machine operatives",22891,0,6.285012
3,Tunbridge Wells,TN5 2QQ,2007-12-10,Female,"White: English, Welsh, Scottish, Northern Iris...",447970878589,never_married,Level 1 (School),No occupation,4469,0,0.0
4,Walsall,WS3 1GO,1950-01-15,Male,"White: English, Welsh, Scottish, Northern Iris...",447659064207,married,Level 1 (School),4. Administrative and secretarial occupations,26406,0,17.225231


#### 2. Remove unnecessary columns ####

In [49]:
def remove_unnecessary_columns(df):
    """Drop every column that is not on the keep-list.

    The frame passed in is modified in place and the same object is returned.
    """
    # NOTE(review): 'phone_number' is retained unchanged here; confirm that
    # keeping it is intended for the downstream steps.
    keep = [
        'phone_number',
        'area',
        'postcode',
        'dob',
        'ethnic_group',
        'income',
        'home_ownership',
        'distance_to_work_km',
    ]

    surplus = df.columns[~df.columns.isin(keep)]
    df.drop(columns=surplus, inplace=True)
    return df

# Keep only the columns the later steps use, then preview five rows.
df = remove_unnecessary_columns(df)
df.head(5)

Unnamed: 0,area,postcode,dob,ethnic_group,phone_number,income,home_ownership,distance_to_work_km
0,Nottingham,NG9 5SS,1989-04-18,"White: English, Welsh, Scottish, Northern Iris...",447546351661,29445,1,4.593169
1,Shropshire,SY6 0LT,1989-04-19,"White: English, Welsh, Scottish, Northern Iris...",447979438189,110432,1,0.0
2,Wakefield,LS26 0UG,1982-11-04,White: Roma,447950956938,22891,0,6.285012
3,Tunbridge Wells,TN5 2QQ,2007-12-10,"White: English, Welsh, Scottish, Northern Iris...",447970878589,4469,0,0.0
4,Walsall,WS3 1GO,1950-01-15,"White: English, Welsh, Scottish, Northern Iris...",447659064207,26406,0,17.225231


#### 3. Age and distance to work: group into bands

In [50]:
def group_age(df, age_range, reference_date=None):
    """Replace the ``dob`` column with an ``age_group`` band label.

    Age is counted in completed years as of ``reference_date``: a person
    whose birthday has not yet occurred in the reference year is one year
    younger than the plain difference of calendar years. Each age is then
    mapped to a label ``"<lower>-<upper>"`` covering ``age_range`` years,
    e.g. ``"20-29"`` for ``age_range=10``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dob`` column parseable by ``pd.to_datetime``. It is
        modified in place: ``age_group`` is added and ``dob`` is dropped.
    age_range : int
        Width of each age band in years; must be positive.
    reference_date : date-like, optional
        Date on which ages are evaluated. Defaults to the current date and
        time; pass a fixed value to make the banding reproducible.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If ``age_range`` is not positive.
    """
    if age_range <= 0:
        raise ValueError("age_range must be a positive number of years")

    reference = pd.Timestamp(datetime.now() if reference_date is None else reference_date)

    def _band(birth):
        # A missing date of birth stays missing instead of becoming "nan-nan".
        if pd.isna(birth):
            return None
        # True (counted as 1) while this year's birthday is still ahead.
        birthday_pending = (reference.month, reference.day) < (birth.month, birth.day)
        age = reference.year - birth.year - birthday_pending
        lower = (age // age_range) * age_range
        return f"{lower}-{lower + age_range - 1}"

    df['age_group'] = pd.to_datetime(df['dob']).map(_band)
    df.drop(columns='dob', inplace=True)
    return df

def group_distance(df, distance_intervals, distance_labels):
    """Replace ``distance_to_work_km`` with a labelled ``distance_group`` band.

    ``distance_intervals`` are the bin edges handed to ``pd.cut`` and
    ``distance_labels`` the label for each resulting bin. The frame is
    modified in place and the same object is returned.
    """
    distances = df['distance_to_work_km']
    banded = pd.cut(distances, bins=distance_intervals, labels=distance_labels)
    df['distance_group'] = banded
    # The exact distance is no longer needed once the band is stored.
    del df['distance_to_work_km']
    return df

def _age_band_lower_bound(label):
    """Return the numeric lower bound of an age label such as ``'20-29'``."""
    text = str(label)
    sign = -1 if text.startswith('-') else 1
    digits = text.lstrip('-').split('-', 1)[0]
    # Labels that do not start with a number are pushed to the end.
    return sign * int(digits) if digits.isdigit() else float('inf')


def visualize_distributions(df, output_file='distribution_analysis.png'):
    """Save side-by-side bar charts of the age and distance group counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``age_group`` and ``distance_group`` columns.
    output_file : str
        Path the figure is written to.

    Age bands are ordered by their numeric lower bound. Sorting the label
    strings directly would place ``'100-109'`` before ``'20-29'``.
    """
    age_counts = df['age_group'].value_counts()
    age_counts = age_counts.reindex(sorted(age_counts.index, key=_age_band_lower_bound))
    distance_counts = df['distance_group'].value_counts().sort_index()

    fig, (age_ax, distance_ax) = plt.subplots(1, 2, figsize=(15, 6))
    try:
        # age
        age_counts.plot(kind='bar', ax=age_ax)
        age_ax.set_title('Age Group Distribution')
        age_ax.tick_params(axis='x', labelrotation=45)

        # distance
        distance_counts.plot(kind='bar', ax=distance_ax)
        distance_ax.set_title('Distance Group Distribution')
        distance_ax.tick_params(axis='x', labelrotation=45)

        fig.tight_layout()
        fig.savefig(output_file)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)




In [51]:

# Banding configuration: width of each age band in years, and the bin edges
# and labels handed to pd.cut for the commute distance.
age_range = 10
distance_intervals = [-float('inf'), 0, 10, 30, float('inf')]
distance_labels = ['Work at home(0km)', '0-10km', '10-30km', '30km+']

df = group_age(df, age_range)
df = group_distance(df, distance_intervals, distance_labels)

visualize_distributions(df)
print("\nDistribution visualization saved successfully!")


Distribution visualization saved successfully!'


#### 4. Area and postcode: generalize

In [52]:
def generalize_postcode(df, level):
    """Mask every postcode character after the first ``level`` with ``*``.

    The masked value keeps the length of the original postcode; postcodes
    with ``level`` characters or fewer are left unchanged. Missing postcodes
    stay missing instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``postcode`` column; the column is overwritten in place.
    level : int
        Number of leading characters to keep; must not be negative.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If ``level`` is negative.
    """
    if level < 0:
        raise ValueError("level must not be negative")

    def _mask(postcode):
        if pd.isna(postcode):
            return postcode
        text = str(postcode)
        return text[:level] + '*' * max(len(text) - level, 0)

    df['postcode'] = df['postcode'].map(_mask)
    return df


def analyze_postcode_distribution(df, column='postcode'):
    """Print a frequency summary of ``column`` and return its value counts.

    The summary lists the number of distinct values, the three most frequent
    values with their share of all rows, and up to three of the values that
    occur in fewer than 5 records.
    """
    counts = df[column].value_counts()
    total_rows = len(df)

    print(f"Total unique areas: {len(counts)}")
    print("\nTop 3 frequent areas :")
    for value, n_records in counts.iloc[:3].items():
        share = (n_records / total_rows) * 100
        print(f"{value}: {n_records} records ({share:.1f}%)")
    print("\n")

    # Values shared by fewer than 5 records pose a potential privacy risk.
    rare = counts[counts < 5]
    if len(rare) > 0:
        print(f"Found {len(rare)} areas with less than 5 records")
        print("Showing first 3:")
        for value, n_records in rare.iloc[:3].items():
            print(f"{value}: {n_records} records")

    return counts


In [53]:
# Snapshot the frame so the postcodes can be analysed before generalization.
original_df = df.copy()
print("\n------- original data analysis -------")
original_distribution = analyze_postcode_distribution(original_df, column='postcode')

# Keep the first three characters of every postcode and mask the rest.
df = generalize_postcode(df, level=3)

print("\n------- generalized data analysis -------")
generalized_distribution = analyze_postcode_distribution(df, column='postcode')


------- original data analysis -------
Total unique areas: 99519

Top 3 frequent areas :
LE5 6CO: 3 records (0.0%)
CF83 2HH: 3 records (0.0%)
SL4 3QO: 2 records (0.0%)


Found 99519 areas with less than 5 records
Showing first 3:
LE5 6CO: 3 records
CF83 2HH: 3 records
SL4 3QO: 2 records

------- generalized data analysis -------
Total unique areas: 1221

Top 3 frequent areas :
SW1*****: 976 records (1.0%)
SE1*****: 830 records (0.8%)
NG1*****: 789 records (0.8%)


Found 2 areas with less than 5 records
Showing first 3:
LD8****: 4 records
WS6****: 4 records


#### 5. Income: differential privacy (open question: how to choose the sensitivity)

In [54]:
def apply_differential_privacy(df, epsilon, column, sensitivity=None, rng=None):
    """Perturb ``column`` in place with Laplace noise and clip the result.

    Each value receives independent Laplace noise with location 0 and scale
    ``sensitivity / epsilon``. The noisy values are then clipped to the range
    ``[0, mean + 3 * std]``, where mean and std are taken from the column
    before noise is added.

    NOTE(review): when ``sensitivity`` is omitted it falls back to
    ``0.1 * std`` of the column. That value is derived from the data itself
    and is not a bound on how much one record can change the output, so the
    default should be treated as a heuristic rather than a formal
    epsilon-differential-privacy guarantee. For a formal guarantee pass a
    ``sensitivity`` fixed from domain knowledge (for example the width of the
    allowed value range).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``; the column is overwritten in place.
    epsilon : float
        Privacy parameter; must be positive. Smaller values add more noise.
    column : str
        Name of the numeric column to perturb.
    sensitivity : float, optional
        Sensitivity used for the noise scale. Defaults to ``0.1 * std``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Defaults to NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    ValueError
        If ``epsilon`` is not positive.
    """
    # epsilon == 0 would otherwise give an infinite noise scale and silently
    # replace the whole column with the clipping bounds.
    if epsilon <= 0:
        raise ValueError("epsilon must be positive")

    values = df[column]
    column_mean = values.mean()
    column_std = values.std()

    if sensitivity is None:
        sensitivity = column_std * 0.1
    scale = sensitivity / epsilon

    sampler = np.random if rng is None else rng
    noise = sampler.laplace(0, scale, size=len(df))

    # limit the upper bound
    upper_bound = column_mean + 3 * column_std
    df[column] = np.clip(values + noise, a_min=0, a_max=upper_bound)

    return df


In [55]:
dp_epsilon = 1.0
dp_column = 'income'
# Capture the statistics before the column is overwritten with noisy values.
original_mean = df[dp_column].mean()
original_std = df[dp_column].std()
df = apply_differential_privacy(df, epsilon=dp_epsilon, column=dp_column)

# Compare the statistics of the configured column before and after the noise,
# so the report follows dp_column rather than a hard-coded name.
print(f"\n{dp_column.capitalize()}:")
print(f"Original - Mean: {original_mean:.2f}, Std: {original_std:.2f}")
print(f"After DP - Mean: {df[dp_column].mean():.2f}, Std: {df[dp_column].std():.2f}")


Income:
Original - Mean: 47553.12, Std: 34431.57
After DP - Mean: 47504.58, Std: 33944.42
