In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')

from src.d01_data.student_data_api import StudentDataApi, _block_features, _census_block_column, \
_diversity_index_features, _studentno

In [None]:
# Periods requested from the API (presumably school years 2014-15 through
# 2019-20 -- TODO confirm the encoding against StudentDataApi).
periods_list = ["1415", "1516", "1617", "1718", "1819", "1920"]
student_data_api = StudentDataApi()

df_students = student_data_api.get_data(periods_list)

# Remove, in place, every student whose census block is the literal string 'NaN'.
# NOTE(review): this compares against the *string* 'NaN', not a float NaN --
# confirm that this is how the API encodes a missing block.
mask = df_students[_census_block_column].eq('NaN')
rows_without_block = df_students.index[mask]
df_students.drop(index=rows_without_block, inplace=True)

In [None]:
# Per-student diversity index: row-wise mean of the diversity-index feature columns.
diversity_index_col = 'Diversity Index'
df_students[diversity_index_col] = df_students[_diversity_index_features].mean(axis=1)

# Block-level weights: exponentiate each block's median diversity index and
# normalise so that the weights sum to one over all blocks.
block_prob = np.exp(df_students.groupby(_census_block_column)[diversity_index_col].median())
block_prob = block_prob / block_prob.sum()

# Broadcast each block's weight back onto its students. `Series.map` performs the
# lookup in a single vectorised pass instead of one `.loc` call per row.
df_students['prob_exp'] = df_students[_census_block_column].map(block_prob)

In [None]:
# Restrict the frame to the student id, the two derived columns, the year,
# the census block and the block-level feature columns.
keep_columns = [_studentno, diversity_index_col, 'prob_exp', 'year', _census_block_column] + _block_features
df_students = df_students[keep_columns]
df_students.head()

Let's assume that there are two types of blocks $b$: (i) blocks with a high probability of containing focal students and (ii) blocks with a low probability of containing focal students. In particular, let's assume that if the `diversity_index` of the block is above the `focal_block_threshold`, then a student coming from that block has probability `p` of being a focal student. Otherwise, the block has a low probability of containing focal students, and a student coming from that block has probability `1-p` of being a focal student.

In [None]:
# Work on an independent copy of the three columns used for the per-year summary,
# so that columns added to df_year are not written to df_students.
df_year = df_students[['year', _census_block_column, diversity_index_col]].copy()

# Parameters of the two-type block model: blocks whose diversity index is strictly
# above the threshold are treated as "focal" blocks.
focal_block_threshold = 0.85
p = 0.99


def prob_focal_given_block(diversity_index):
    """Return the probability that a student is focal, given a diversity index.

    :param diversity_index: scalar diversity index value
    :return: `p` when the index is strictly greater than `focal_block_threshold`,
        otherwise `1 - p` (this includes an index equal to the threshold and NaN,
        for which the comparison is false)
    """
    is_focal_block = diversity_index > focal_block_threshold
    return p if is_focal_block else (1. - p)
    
# Expected number and share of focal students per year under the two-type block model.
# The probabilities are computed from df_year's own diversity-index column rather than
# from df_students, so the assignment does not rely on implicit index alignment.
df_year['prob'] = df_year[diversity_index_col].apply(prob_focal_given_block)

# 'sum' of the per-student probabilities is the expected number of focal students;
# 'count' is the number of students (prob is never null, so every row is counted).
df_year = df_year.groupby(['year'])['prob'].agg(['sum', 'count'])
df_year.columns = ['nFocalStudents', 'nTotalStudents']
# Column division is element-wise, so no row-by-row apply is needed.
df_year['pctFocalStudents'] = df_year['nFocalStudents'] / df_year['nTotalStudents']
df_year

In [None]:
# Seed NumPy's global random generator so the labels sampled later in this cell
# are reproducible.
np.random.seed(20210704)

# Per-student probability of being a focal student, derived from the diversity index.
df_students['prob'] = df_students[diversity_index_col].apply(prob_focal_given_block)

def generate_labels(prob):
    """Sample a 0/1 label that equals 1 with probability `prob`.

    Consumes exactly one draw from NumPy's global random generator.

    :param prob: probability of returning 1
    :return: 1 if the uniform draw on [0, 1) is <= `prob`, otherwise 0
    """
    return int(np.random.rand() <= prob)
    
# One independent draw per student, taken in row order.
df_students['focal'] = df_students['prob'].map(generate_labels)

# Realised share of sampled focal students in each year.
df_students.groupby('year')['focal'].mean()