# Coding Task

## Table of Contents
- Generating the Dataset
    - Generating the demographic information
    - Generating the attitudes towards vaccination
    - Generating the underlying social network
    - Generating the treatment status
    - Generating the outcome variable
- Conducting the Analysis

### Importing Needed Libraries
This notebook assumes that the working directory is the root of the project folder. Otherwise, change the directory using `os.chdir()`.

In [3]:
# Importing libraries
import numpy as np
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt


# Set random seed for reproducibility
np.random.seed(111)

### Generating Demographic Information

In [None]:
# Size of the synthetic population
n = 5000 # number of samples to generate

# Build the demographic table in one go. Dict entries are evaluated top to
# bottom, so the global NumPy random stream is consumed in the order
# age -> income -> education -> group.
# NOTE(review): rounding a continuous uniform draw makes the two endpoint
# values (21/80 for age, 0/3 for education) half as likely as interior values.
df = pd.DataFrame(
    {
        'demographic_age': np.random.uniform(21, 80, n).round().astype('int'),
        'demographic_income': 5000000 * (1 - np.random.power(2.5, n)),
        'demographic_education': np.random.uniform(0, 3, n).round().astype('int'),
        'demographic_unobs_grp': np.random.choice(['A', 'B', 'C', 'D', 'E'], n),
    },
    index=range(n),
)

# Column groups consumed by later cells: numeric columns (a list) and the
# single categorical column (a plain string, not a list).
demographic_cat_vars = 'demographic_unobs_grp'
demographic_cont_vars = [
    'demographic_age',
    'demographic_income',
    'demographic_education',
]

### Generating Attitudes Towards Vaccination

In [None]:
def generate_attitudes(df):
    """Add four integer attitude scores in the range 1-10 to ``df``.

    Every attitude starts as a random integer baseline from {4, 5, 6} plus
    standard-normal noise. Each column of a normalized/dummy-encoded copy of
    ``df`` then shifts the score by ``coefficient * column``, where the
    coefficient is a single draw from U(-0.25, 0.25) per (attitude, column)
    pair. The result is clipped to [1, 10] and rounded to an integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by the module-level
        ``demographic_cont_vars`` (list) and ``demographic_cat_vars`` (str).
        The attitude columns are written into this frame in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``att_covid``,
        ``att_vaccine``, ``att_safety`` and ``att_unobserved`` added.

    Notes
    -----
    Sets the module-level name ``attitudes`` to the list of attitude column
    names and draws from the global NumPy random state.
    """
    global attitudes
    attitudes = ['att_covid','att_vaccine','att_safety','att_unobserved']

    # Size the random draws from the frame itself rather than from a
    # module-level sample count, so any frame length works.
    n_rows = len(df)

    # Work on a copy so the normalized / dummy-encoded values never leak
    # back into the caller's demographic columns.
    this_df = df.copy(deep=True)
    # Scale each continuous variable by its own maximum
    this_df[demographic_cont_vars] = this_df[demographic_cont_vars].apply(lambda x: x/x.max(), axis=0)
    # One indicator column per category level (no reference level dropped)
    this_df = pd.get_dummies(this_df, columns=[demographic_cat_vars], drop_first=False)

    # NOTE(review): every column of the copy is treated as a demographic, so
    # calling this on a frame that already holds att_* columns feeds those
    # (unnormalized) scores back in as influences.
    demographics = list(this_df.columns)

    for attitude in attitudes:
        # Baseline level plus individual noise
        baseline = np.random.choice(range(4, 7), n_rows)
        noise = np.random.normal(0, 1, n_rows)
        score = pd.Series(baseline + noise, index=df.index)

        # Add influence of demographics: one random coefficient per column
        for demographic in demographics:
            score = score + np.random.uniform(-0.25, 0.25) * this_df[demographic]

        # Keep scores inside the 1-10 scale, then store them as integers
        df[attitude] = np.round(score.clip(1, 10)).astype('int')

    return df

# Add the attitude columns; the function writes into the frame it is given
# and returns that same frame, so df keeps pointing at the enriched dataset.
df = generate_attitudes(df)
print(df.shape)
# Trailing expression: the notebook renders the first row as the cell output
df.head(1)

(5000, 8)


Unnamed: 0,demographic_age,demographic_income,demographic_education,demographic_unobs_grp,att_covid,att_vaccine,att_safety,att_unobserved
0,77,1136473.0,2,A,8,6,6,5


### Generating Network Structure

In [None]:
# Implementing homophily



def bb_network(N, m):
    """Grow a random graph by fitness-weighted preferential attachment.

    The graph starts as a complete graph on ``m + 1`` nodes. Nodes
    ``m + 1 .. N - 1`` are then added one at a time; each new node is linked
    to ``m`` distinct existing nodes sampled with probability proportional to
    ``fitness * degree``, where every node's fitness is drawn once from
    U[0, 1).

    Parameters
    ----------
    N : int
        Total number of nodes in the returned graph. Must be at least
        ``m + 1``, the size of the seed clique.
    m : int
        Number of edges attached to each newly added node.

    Returns
    -------
    networkx.Graph
        Undirected graph whose nodes are labelled ``0 .. N - 1``.

    Raises
    ------
    ValueError
        If ``N < m + 1``; the seed clique alone would already contain more
        than ``N`` nodes.

    Notes
    -----
    Draws from the global NumPy random state.
    """
    if N < m + 1:
        raise ValueError(
            f"N must be at least m + 1 (the seed clique size); got N={N}, m={m}"
        )

    # 1. Start with a clique of m+1 nodes
    G = nx.complete_graph(m + 1)

    # One fitness value per node, indexed by node label
    all_fitness = np.random.random(N)

    for i in range(G.number_of_nodes(), N):
        # 2. Select m different nodes at random, weighted by fitness * degree.
        possible_neighbors = list(G.nodes)
        weights = np.array(
            [all_fitness[node] * G.degree(node) for node in possible_neighbors]
        )
        p = weights / weights.sum()
        new_neighbors = np.random.choice(possible_neighbors, size=m, replace=False, p=p)

        # 3. Add a new node i and link it with the m nodes from the previous step.
        G.add_edges_from((i, j) for j in new_neighbors)

    return G

# Build the network with one node per generated sample (n) and 5 links
# attached per newly added node, so the graph size follows the dataset size
# instead of repeating the literal 5000.
G = bb_network(n, 5)