In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the filtered household survey extract that the rest of the notebook transforms
input_csv_path = './data/Ghana-2014-DHS-Household-Filtered.csv'
df = pd.read_csv(input_csv_path)

df

In [None]:
# Build a per-row location key of the form '<lat>_<long>' from the string
# renderings of the two coordinate columns
lat_text = df['lat'].astype(str)
long_text = df['long'].astype(str)
df['lat_long'] = lat_text + '_' + long_text

df

In [None]:
# Number the distinct 'lat_long' values 1, 2, 3, ... in order of first appearance
unique_locations = df['lat_long'].unique()
cluster_id_mapping = dict(zip(unique_locations, range(1, len(unique_locations) + 1)))

# Translate every row's 'lat_long' key into its integer cluster ID
df['cluster_id'] = df['lat_long'].map(cluster_id_mapping)
df

In [None]:
# Create a combined key of cluster ID and original household ID.
# (This helper column is dropped again by a later cell.)
df['cluster_household_key'] = df['cluster_id'].astype(str) + '-' + df['household_id'].astype(str)

# Keep only the first row of every distinct key, preserving order of first appearance
first_household_rows = df.drop_duplicates(subset='cluster_household_key')

# Number the households 1, 2, 3, ... separately inside each cluster.
# Enumerating the unique keys of the whole frame would give a single global counter
# that never restarts, which contradicts the "within each cluster" intent.
household_number = first_household_rows.groupby('cluster_id').cumcount() + 1

# Map each unique 'cluster_household_key' to its within-cluster household number
household_mapping = dict(zip(first_household_rows['cluster_household_key'], household_number))
df['new_household_id'] = df['cluster_household_key'].map(household_mapping)

# Final ID has the form '<cluster_id>-<household number within that cluster>'
df['new_household_id'] = df['cluster_id'].astype(str) + '-' + df['new_household_id'].astype(str)

# Display the dataframe to verify the new 'new_household_id' values
df.head(30)

In [None]:
# Create a combined key of new household ID and original mother ID.
# (This helper column is dropped again by a later cell.)
df['household_mother_key'] = df['new_household_id'].astype(str) + '-' + df['mother_id'].astype(str)

# Keep only the first row of every distinct key, preserving order of first appearance
first_mother_rows = df.drop_duplicates(subset='household_mother_key')

# Number the mothers 1, 2, 3, ... separately inside each new household.
# Enumerating the unique keys of the whole frame would give a single global counter
# that never restarts, which contradicts the "within each new household" intent.
mother_number = first_mother_rows.groupby('new_household_id').cumcount() + 1

# Map each unique 'household_mother_key' to its within-household mother number
mother_mapping = dict(zip(first_mother_rows['household_mother_key'], mother_number))
df['new_mother_id'] = df['household_mother_key'].map(mother_mapping)

# Final ID has the form '<new_household_id>-<mother number within that household>'
df['new_mother_id'] = df['new_household_id'].astype(str) + '-' + df['new_mother_id'].astype(str)

df.head(30)

In [None]:
# The intermediate key columns have served their purpose; delete them from df in place
helper_columns = ['cluster_household_key', 'household_mother_key', 'lat_long']
df.drop(columns=helper_columns, inplace=True)

In [None]:
# Display the DataFrame as the cell output to inspect its current columns
df

In [None]:
# Each original ID column (key) is overwritten with the values of its
# regenerated counterpart (value)
id_replacements = {
    'mother_id': 'new_mother_id',
    'household_id': 'new_household_id',
}
for original_name, replacement_name in id_replacements.items():
    df[original_name] = df[replacement_name]

# The regenerated columns are now duplicates of the originals, so remove them in place
df.drop(columns=list(id_replacements.values()), inplace=True)

df

In [None]:
# The three identifier columns, in the order they should appear
desired_order = ['cluster_id', 'household_id', 'mother_id']

# Every other column, keeping its current relative order
other_columns = [name for name in df.columns if name not in desired_order]

# Place the identifiers at index positions 2, 3 and 4 (third, fourth, fifth columns):
# the first two remaining columns stay in front, the rest follow the identifiers
columns = other_columns[:2] + desired_order + other_columns[2:]

# Reorder the DataFrame columns
df = df[columns]

df

In [None]:

# Order the rows by cluster, then household, then mother, and renumber the
# index 0..n-1 in the same in-place step
sort_keys = ['cluster_id', 'household_id', 'mother_id']
df.sort_values(by=sort_keys, inplace=True, ignore_index=True)

In [None]:
# Boolean masks selecting the urban and the rural rows
is_urban = df['rural_urban'] == 'urban'
is_rural = df['rural_urban'] == 'rural'

# Draw 500 rows from each group with a fixed seed
urban_sample = df.loc[is_urban].sample(n=500, random_state=1)
rural_sample = df.loc[is_rural].sample(n=500, random_state=1)

# Stack the two samples, shuffle them so rural and urban rows are mixed,
# and renumber the index
sampled_df = (
    pd.concat([urban_sample, rural_sample])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

sampled_df

In [None]:
# Maximum coordinate shift in degrees: 2.5 / 111 is approximately 0.0225 degrees
# (presumably 2.5 km at ~111 km per degree of latitude -- confirm the intended units)
latitude_shift = 2.5 / 111

# Use a seeded generator so the jitter is reproducible on every run; the unseeded
# global np.random state would produce a different exported dataset each time.
# The seed value matches the random_state=1 used for the row sampling.
rng = np.random.default_rng(1)
lat_offsets = rng.uniform(-latitude_shift, latitude_shift, size=len(sampled_df))
long_offsets = rng.uniform(-latitude_shift, latitude_shift, size=len(sampled_df))

# Apply random shift to latitude in the sampled dataset
sampled_df['lat'] += lat_offsets

# Apply random shift to longitude in the sampled dataset, divided by the cosine
# of the (already shifted) latitude
sampled_df['long'] += long_offsets / np.cos(np.radians(sampled_df['lat']))

# Round the latitude and longitude to 6 decimal places
sampled_df['lat'] = sampled_df['lat'].round(6)
sampled_df['long'] = sampled_df['long'].round(6)

sampled_df

In [None]:
# Append the sampled, coordinate-adjusted rows to the full dataset, order the
# result by cluster / household / mother, and renumber the index 0..n-1
df = (
    pd.concat([df, sampled_df])
    .sort_values(by=['cluster_id', 'household_id', 'mother_id'])
    .reset_index(drop=True)
)

In [None]:
# Display the combined DataFrame as the cell output for a final visual check
df

In [None]:
# Write the final dataset to CSV without the index column.
# NOTE(review): this path is relative to the working directory, whereas the input
# was read from './data/' -- confirm the output location is intended.
output_csv_path = 'Ghana-2014-DHS-Household-Filtered.csv'
df.to_csv(output_csv_path, index=False)