In [1]:
import inspect

import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
def load_data_agg(path):
    """Read the aggregated answers CSV, indexed by ('noc', 'workshop.number').

    The frame is ordered by its index, the share label 'remain constant' is
    shortened to 'constant', and every row containing a missing value is
    dropped.

    Parameters
    ----------
    path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
    """
    index_levels = ['noc', 'workshop.number']
    answers = pd.read_csv(path, index_col=index_levels).sort_index()
    # Harmonise the label first, then discard incomplete rows.
    needs_rename = answers['share'] == 'remain constant'
    answers.loc[needs_rename, 'share'] = 'constant'
    return answers.dropna()

In [3]:
def get_x_agg(data):
    """Build the continuous and discretised feature frames from aggregated data.

    The answer/bookkeeping columns ('absolute', 'share', 'Unnamed: 0',
    'noc_code') are removed, the second index level is copied into a
    'work_num' column, and duplicated rows are dropped.

    Returns
    -------
    (x_cont, x_disc) : tuple of pd.DataFrame
        ``x_disc`` is ``x_cont`` rounded to the nearest integer.
    """
    non_feature_columns = ['absolute', 'share', 'Unnamed: 0', 'noc_code']
    features = data.drop(columns=non_feature_columns)
    # Keep the workshop number as a regular variable in addition to the index level.
    features['work_num'] = features.index.get_level_values(1)
    features = features.drop_duplicates()
    # Assumption to be aware of: rounding makes the features discrete.
    rounded = np.round(features).astype(int)
    return features, rounded

In [4]:
def get_y_abs_agg(data):
    """Aggregate the 'absolute' answers per ('noc', 'workshop.number') group.

    Parameters
    ----------
    data : pd.DataFrame
        Must expose 'noc' and 'workshop.number' (as index levels or columns)
        and an 'absolute' column holding the answer labels.

    Returns
    -------
    pd.DataFrame
        One row per group with, in this order: the proportion of each answer
        label ('fewer', 'more', 'same', plus any other label present in the
        data), 'sum' (number of answers in the group), 'not_increase'
        (proportion of 'fewer' + 'same'), 'y' (most frequent of the three
        labels) and 'binned_y' (the larger of 'more' / 'not_increase').
    """
    expected = ['fewer', 'more', 'same']
    counts = (data.groupby(['noc', 'workshop.number', 'absolute'])
                  .size()
                  .unstack('absolute', fill_value=0))
    # A label that nobody chose would otherwise be missing as a column and
    # raise KeyError below; unexpected labels are kept so totals stay true
    # to the data.
    counts = counts.reindex(columns=counts.columns.union(expected),
                            fill_value=0).astype(float)

    totals = counts.sum(axis=1)
    # Build the proportions in a new float frame rather than writing floats
    # into (possibly integer) count columns in place.
    y_abs = counts.divide(totals, axis=0)
    y_abs['sum'] = totals
    y_abs['not_increase'] = (counts['fewer'] + counts['same']).divide(totals)
    y_abs['y'] = y_abs[['fewer', 'more', 'same']].idxmax(axis=1)
    y_abs['binned_y'] = y_abs[['more', 'not_increase']].idxmax(axis=1)
    return y_abs

In [5]:
def get_y_share_agg(data):
    """Aggregate the 'share' answers per ('noc', 'workshop.number') group.

    Parameters
    ----------
    data : pd.DataFrame
        Must expose 'noc' and 'workshop.number' (as index levels or columns)
        and a 'share' column holding the answer labels.

    Returns
    -------
    pd.DataFrame
        One row per group with, in this order: the proportion of each answer
        label ('constant', 'decrease', 'increase', plus any other label
        present in the data), 'sum' (number of answers in the group),
        'not_increase' (proportion of 'decrease' + 'constant'), 'y' (most
        frequent of the three labels) and 'binned_y' (the larger of
        'increase' / 'not_increase').
    """
    expected = ['constant', 'decrease', 'increase']
    counts = (data.groupby(['noc', 'workshop.number', 'share'])
                  .size()
                  .unstack('share', fill_value=0))
    # A label that nobody chose would otherwise be missing as a column and
    # raise KeyError below; unexpected labels are kept so totals stay true
    # to the data.
    counts = counts.reindex(columns=counts.columns.union(expected),
                            fill_value=0).astype(float)

    totals = counts.sum(axis=1)
    # Build the proportions in a new float frame rather than writing floats
    # into (possibly integer) count columns in place.
    y_share = counts.divide(totals, axis=0)
    y_share['sum'] = totals
    y_share['not_increase'] = (counts['decrease'] + counts['constant']).divide(totals)
    y_share['y'] = y_share[['constant', 'decrease', 'increase']].idxmax(axis=1)
    y_share['binned_y'] = y_share[['increase', 'not_increase']].idxmax(axis=1)
    return y_share

In [6]:
def load_data_ind(path):
    """Read the per-participant answers CSV.

    The share label 'remain constant' is shortened to 'constant' and every
    row containing a missing value is dropped. The default integer index
    produced by ``pd.read_csv`` is kept.

    Parameters
    ----------
    path : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
    """
    answers = pd.read_csv(path)
    needs_rename = answers['share'] == 'remain constant'
    answers.loc[needs_rename, 'share'] = 'constant'
    return answers.dropna()

In [7]:
def get_nocs(data):
    """Return the values of the 'noc' column of ``data`` as an array."""
    noc_column = data['noc']
    return noc_column.values

In [27]:
def get_x_ind(data):
    """Build the continuous and discretised feature frames from individual data.

    The answer/bookkeeping columns ('absolute', 'share', 'Unnamed: 0',
    'noc_code') are removed. The 'noc' column is kept as is in both outputs;
    every other column is rounded to the nearest integer in ``x_disc``.

    Returns
    -------
    (x_cont, x_disc) : tuple of pd.DataFrame
        Both frames have the same columns in the same order.
    """
    x_cont = data.drop(['absolute', 'share', 'Unnamed: 0', 'noc_code'], axis=1)
    # Remember where 'noc' sits so it is re-inserted at the same place
    # whatever the column order of the input file (it used to be hard-coded
    # to position 1).
    noc_position = x_cont.columns.get_loc('noc')
    # Assumption to be aware of: rounding makes the features discrete.
    x_disc = np.round(x_cont.drop(['noc'], axis=1)).astype(int)
    x_disc.insert(loc=noc_position, column='noc', value=data['noc'])
    return x_cont, x_disc

In [9]:
def get_y_ind(data):
    """Extract the individual-level targets as arrays.

    Returns
    -------
    (y_abs, y_abs_bin, y_share, y_share_bin) : tuple of arrays
        The raw 'absolute' and 'share' answers, each followed by a binned
        version in which 'same'/'fewer' (respectively 'decrease'/'constant')
        are merged into 'not_increase'.
    """
    def merge_labels(column, merged):
        # Every label listed in ``merged`` becomes 'not_increase'.
        return data[column].replace(merged, ['not_increase'] * len(merged)).values

    return (
        data['absolute'].values,
        merge_labels('absolute', ['same', 'fewer']),
        data['share'].values,
        merge_labels('share', ['decrease', 'constant']),
    )

In [10]:
def run_random_search(estimator, params_grid, x, y, n_iter=100, cv=5, random_state=None):
    """Fit a ``RandomizedSearchCV`` over ``params_grid`` and return it.

    Parameters
    ----------
    estimator : estimator object handed to ``RandomizedSearchCV``.
    params_grid : passed as ``param_distributions``.
    x, y : training data passed to ``fit``.
    n_iter : int, default 100
        Number of parameter settings sampled.
    cv : default 5
        Cross-validation setting handed to ``RandomizedSearchCV``.
    random_state : default None
        Pass an int to make the sampled parameter settings reproducible.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` instance.
    """
    search_kwargs = dict(estimator=estimator,
                         param_distributions=params_grid,
                         n_iter=n_iter,
                         cv=cv,
                         random_state=random_state,
                         n_jobs=-1,
                         verbose=1)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Only forward it to versions that still
    # accept it, so the unweighted fold averaging is kept everywhere.
    if 'iid' in inspect.signature(RandomizedSearchCV).parameters:
        search_kwargs['iid'] = False
    random_search = RandomizedSearchCV(**search_kwargs)
    random_search.fit(x, y)
    return random_search

In [11]:
def run_grid_search(estimator, params_grid, x, y, cv=5):
    """Fit a ``GridSearchCV`` over ``params_grid`` and return it.

    Parameters
    ----------
    estimator : estimator object handed to ``GridSearchCV``.
    params_grid : passed as ``param_grid``.
    x, y : training data passed to ``fit``.
    cv : default 5
        Cross-validation setting handed to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` instance.
    """
    search_kwargs = dict(estimator=estimator,
                         param_grid=params_grid,
                         cv=cv,
                         n_jobs=-1,
                         verbose=1)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Only forward it to versions that still
    # accept it, so the unweighted fold averaging is kept everywhere.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False
    grid_search = GridSearchCV(**search_kwargs)
    grid_search.fit(x, y)
    return grid_search

In [12]:
def create_noc_dict(x_cont=None, x_disc=None, y_share=None):
    """Map each feature row to its occupation and aggregated share answers.

    Parameters
    ----------
    x_cont, x_disc : pd.DataFrame, optional
        Continuous and discretised feature frames whose index labels are
        (noc, workshop number) pairs. Default to the notebook-level
        ``x_cont_agg`` / ``x_disc_agg``.
    y_share : pd.DataFrame, optional
        Frame with the same index labels and the columns 'increase',
        'constant', 'decrease' and 'not_increase'. Defaults to the
        notebook-level ``y_share_agg``.

    Returns
    -------
    dict
        ``{'cont': {...}, 'disc': {...}}``. Each inner dict is keyed by the
        tuple of feature values of a row and holds ``'noc'``, ``'share'``
        (increase, constant, decrease) and ``'share_binned'`` (increase,
        not_increase). Rows with identical feature values overwrite each
        other, the last one winning. The 'absolute' answers are not included.

    Raises
    ------
    KeyError
        If a feature row's index label is missing from ``y_share``.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if x_cont is None:
        x_cont = x_cont_agg
    if x_disc is None:
        x_disc = x_disc_agg
    if y_share is None:
        y_share = y_share_agg

    noc_dict = {}
    for dist, xs in (('cont', x_cont), ('disc', x_disc)):
        entries = {}
        for label, x_row in xs.iterrows():
            # Pair features and targets by index label, not by row position:
            # the two frames are built independently and need not share the
            # same row order or row count.
            y_row = y_share.loc[label, :]
            entries[tuple(x_row)] = {'noc':          label[0],
                                     'share':        (y_row['increase'],
                                                      y_row['constant'],
                                                      y_row['decrease']),
                                     'share_binned': (y_row['increase'],
                                                      y_row['not_increase'])}
        noc_dict[dist] = entries
    return noc_dict

In [28]:
# Aggregated data: one feature row per (noc, workshop.number) group and the
# matching share-answer proportions.

data = load_data_agg('../tables/noc_answers.csv')
x_cont_agg, x_disc_agg = get_x_agg(data)
# The aggregated 'absolute' targets are currently not built:
# y_abs_agg = get_y_abs_agg(data)
y_share_agg = get_y_share_agg(data)

# Individual data: one row per participant answer.
# NOTE: `data` is rebound here, so the aggregated frame is no longer reachable under that name.
data = load_data_ind('../tables/noc_answers_participants.csv')
nocs = get_nocs(data)  
x_cont_ind, x_disc_ind = get_x_ind(data)
y_abs_ind, y_abs_bin_ind, y_share_ind, y_share_bin_ind = get_y_ind(data)

# Build the lookup from feature rows to occupation and share answers
# (create_noc_dict reads x_cont_agg, x_disc_agg and y_share_agg defined above).
noc_dict = create_noc_dict()

## Scratchpad

In [30]:
# Scratchpad: preview the first rows of the individual-level continuous features.
x_cont_ind.head()

Unnamed: 0,workshop.number,noc,participant.id,value.Reading Comprehension,value.Active Listening,value.Writing,value.Speaking,value.Mathematics Skill,value.Science,value.Critical Thinking,...,value.English Language,value.Foreign Language,value.Fine Arts,value.History and Archeology,value.Philosophy and Theology,value.Public Safety and Security,value.Law and Government,value.Telecommunications,value.Communications and Media,value.Transportation
0,6,"Senior managers - financial, communications an...",1,4.0,4.0,3.71,4.126667,2.793333,1.836667,4.086667,...,3.863333,1.526667,1.323333,1.496667,1.686667,2.933333,3.25,1.946667,2.71,2.276667
1,6,"Senior managers - financial, communications an...",3,4.0,4.0,3.71,4.126667,2.793333,1.836667,4.086667,...,3.863333,1.526667,1.323333,1.496667,1.686667,2.933333,3.25,1.946667,2.71,2.276667
2,6,"Senior managers - financial, communications an...",4,4.0,4.0,3.71,4.126667,2.793333,1.836667,4.086667,...,3.863333,1.526667,1.323333,1.496667,1.686667,2.933333,3.25,1.946667,2.71,2.276667
3,6,"Senior managers - financial, communications an...",5,4.0,4.0,3.71,4.126667,2.793333,1.836667,4.086667,...,3.863333,1.526667,1.323333,1.496667,1.686667,2.933333,3.25,1.946667,2.71,2.276667
4,6,"Senior managers - financial, communications an...",6,4.0,4.0,3.71,4.126667,2.793333,1.836667,4.086667,...,3.863333,1.526667,1.323333,1.496667,1.686667,2.933333,3.25,1.946667,2.71,2.276667


In [25]:
# Scratchpad: first five individual-level 'absolute' answers.
y_abs_ind[:5]

array(['same', 'same', 'same', 'fewer', 'fewer'], dtype=object)