In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
def load_data_agg(path):
    """Load the answers table indexed by (noc, workshop.number).

    The CSV at ``path`` is read with ``noc`` and ``workshop.number`` as a
    two-level index and sorted by that index. The share label
    'remain constant' is renamed to 'constant', and every row containing a
    missing value is dropped.

    Returns the cleaned DataFrame.
    """
    frame = pd.read_csv(path, index_col=['noc', 'workshop.number']).sort_index()
    # Normalise the one share label that is spelled differently in the raw file.
    is_remain_constant = frame['share'] == 'remain constant'
    frame.loc[is_remain_constant, 'share'] = 'constant'
    return frame.dropna()

In [3]:
def get_x_agg(data):
    """Build the aggregated feature frames from the indexed answers table.

    The answer and bookkeeping columns ('absolute', 'share', 'Unnamed: 0',
    'noc_code') are removed, the second index level is copied into a
    'work_num' column so the workshop number is a feature as well as an
    index, and duplicated rows are dropped.

    Returns ``(x_cont, x_disc)``: the features as-is, and the same features
    rounded and cast to int.
    """
    non_feature_cols = ['absolute', 'share', 'Unnamed: 0', 'noc_code']
    features = data.drop(columns=non_feature_cols)
    # Level 1 of the index is the workshop number; expose it as a column too.
    features = features.assign(work_num=features.index.get_level_values(1))
    x_cont = features.drop_duplicates()
    # Assumption to be aware of: discretising by rounding to the nearest integer.
    x_disc = np.round(x_cont).astype(int)
    return x_cont, x_disc

In [4]:
def get_y_abs_agg(data):
    """Aggregate the 'absolute' answers per (noc, workshop.number) group.

    Parameters
    ----------
    data : DataFrame
        Answers table in which 'noc' and 'workshop.number' are available as
        index levels (or columns) and 'absolute' holds one answer per row.

    Returns
    -------
    DataFrame indexed by (noc, workshop.number) with, in order:
      * one column per answer label holding the proportion of answers,
      * 'sum'          - number of answers in the group,
      * 'not_increase' - proportion of 'fewer' plus 'same' answers,
      * 'y'            - label with the highest proportion among
                         'fewer', 'more', 'same',
      * 'binned_y'     - 'more' or 'not_increase', whichever is larger.

    A label among 'fewer', 'more', 'same' that never occurs in ``data`` gets
    an all-zero column instead of raising KeyError.
    """
    expected_labels = ['fewer', 'more', 'same']

    # Count answers per group and label explicitly.
    counts = (data.groupby(['noc', 'workshop.number', 'absolute'])
                  .size()
                  .unstack('absolute', fill_value=0))
    # Guarantee the three expected labels exist (sorted union keeps the
    # alphabetical column order and any additional labels present in the data).
    all_labels = counts.columns.union(expected_labels)
    counts = counts.reindex(columns=all_labels, fill_value=0).astype(float)

    totals = counts.sum(axis=1)
    y_abs = counts.divide(totals, axis=0)
    y_abs['sum'] = totals
    # Computed from the raw counts so the value is (fewer + same) / sum exactly.
    y_abs['not_increase'] = (counts['fewer'] + counts['same']).divide(totals)
    y_abs['y'] = y_abs[expected_labels].idxmax(axis=1)
    y_abs['binned_y'] = y_abs[['more', 'not_increase']].idxmax(axis=1)
    return y_abs

In [5]:
def get_y_share_agg(data):
    """Aggregate the 'share' answers per (noc, workshop.number) group.

    Parameters
    ----------
    data : DataFrame
        Answers table in which 'noc' and 'workshop.number' are available as
        index levels (or columns) and 'share' holds one answer per row.

    Returns
    -------
    DataFrame indexed by (noc, workshop.number) with, in order:
      * one column per answer label holding the proportion of answers,
      * 'sum'          - number of answers in the group,
      * 'not_increase' - proportion of 'decrease' plus 'constant' answers,
      * 'y'            - label with the highest proportion among
                         'constant', 'decrease', 'increase',
      * 'binned_y'     - 'increase' or 'not_increase', whichever is larger.

    A label among 'constant', 'decrease', 'increase' that never occurs in
    ``data`` gets an all-zero column instead of raising KeyError.
    """
    expected_labels = ['constant', 'decrease', 'increase']

    # Count answers per group and label explicitly.
    counts = (data.groupby(['noc', 'workshop.number', 'share'])
                  .size()
                  .unstack('share', fill_value=0))
    # Guarantee the three expected labels exist (sorted union keeps the
    # alphabetical column order and any additional labels present in the data).
    all_labels = counts.columns.union(expected_labels)
    counts = counts.reindex(columns=all_labels, fill_value=0).astype(float)

    totals = counts.sum(axis=1)
    y_share = counts.divide(totals, axis=0)
    y_share['sum'] = totals
    # Computed from the raw counts so the value is (decrease + constant) / sum exactly.
    y_share['not_increase'] = (counts['decrease'] + counts['constant']).divide(totals)
    y_share['y'] = y_share[expected_labels].idxmax(axis=1)
    y_share['binned_y'] = y_share[['increase', 'not_increase']].idxmax(axis=1)
    return y_share

In [6]:
def load_data_ind(path):
    """Load the answers table with one row per individual answer.

    Reads the CSV at ``path`` with the default integer index, renames the
    share label 'remain constant' to 'constant', and drops every row that
    contains a missing value.

    Returns the cleaned DataFrame.
    """
    answers = pd.read_csv(path)
    answers['share'] = answers['share'].replace('remain constant', 'constant')
    return answers.dropna()

In [7]:
def get_nocs(data):
    """Return the values of the 'noc' column of ``data`` as an array."""
    noc_column = data['noc']
    return noc_column.values

In [8]:
def get_x_ind(data):
    """Build the individual-level feature frames.

    Drops the answer and bookkeeping columns ('absolute', 'share',
    'Unnamed: 0', 'noc_code', 'noc') and keeps everything else as features.

    Returns ``(x_cont, x_disc)``: the features as-is, and the same features
    rounded and cast to int.
    """
    non_feature_cols = ['absolute', 'share', 'Unnamed: 0', 'noc_code', 'noc']
    x_cont = data.drop(columns=non_feature_cols)
    # Assumption to be aware of: discretising by rounding to the nearest integer.
    rounded = np.round(x_cont)
    x_disc = rounded.astype(int)
    return x_cont, x_disc

In [9]:
def get_y_ind(data):
    """Extract the individual-level target arrays.

    Returns a 4-tuple of arrays taken from ``data``:
      * the raw 'absolute' answers,
      * 'absolute' with 'same' and 'fewer' both mapped to 'not_increase',
      * the raw 'share' answers,
      * 'share' with 'decrease' and 'constant' both mapped to 'not_increase'.
    """
    absolute = data['absolute']
    share = data['share']

    absolute_binned = absolute.replace({'same': 'not_increase',
                                        'fewer': 'not_increase'})
    share_binned = share.replace({'decrease': 'not_increase',
                                  'constant': 'not_increase'})

    return absolute.values, absolute_binned.values, share.values, share_binned.values

In [10]:
def run_random_search(estimator, params_grid, x, y, random_state=None):
    """Fit a 5-fold RandomizedSearchCV with 100 sampled candidates.

    Parameters
    ----------
    estimator : estimator object passed through to RandomizedSearchCV.
    params_grid : parameter distributions to sample from.
    x, y : training features and targets passed to ``fit``.
    random_state : optional seed forwarded to RandomizedSearchCV so the
        sampled candidates can be reproduced. ``None`` (the default) keeps
        the unseeded behaviour.

    Returns
    -------
    The fitted RandomizedSearchCV instance.
    """
    import inspect

    search_kwargs = dict(estimator=estimator,
                         param_distributions=params_grid,
                         n_iter=100,
                         cv=5,
                         n_jobs=-1,
                         verbose=1,
                         random_state=random_state)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; passing it
    # to a newer release raises TypeError. Forward `iid=False` only where the
    # parameter still exists, so older releases keep the original behaviour.
    if 'iid' in inspect.signature(RandomizedSearchCV).parameters:
        search_kwargs['iid'] = False

    random_search = RandomizedSearchCV(**search_kwargs)
    random_search.fit(x, y)
    return random_search

In [11]:
def run_grid_search(estimator, params_grid, x, y):
    """Fit an exhaustive 5-fold GridSearchCV over ``params_grid``.

    Parameters
    ----------
    estimator : estimator object passed through to GridSearchCV.
    params_grid : parameter grid to search exhaustively.
    x, y : training features and targets passed to ``fit``.

    Returns
    -------
    The fitted GridSearchCV instance.
    """
    import inspect

    search_kwargs = dict(estimator=estimator,
                         param_grid=params_grid,
                         cv=5,
                         n_jobs=-1,
                         verbose=1)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; passing it
    # to a newer release raises TypeError. Forward `iid=False` only where the
    # parameter still exists, so older releases keep the original behaviour.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    grid_search = GridSearchCV(**search_kwargs)
    grid_search.fit(x, y)
    return grid_search

In [35]:
def create_noc_dict(x_cont=None, x_disc=None, y_share=None):
    """Map each aggregated feature vector to its NOC and share proportions.

    Parameters
    ----------
    x_cont, x_disc : DataFrame, optional
        Continuous and discretised feature frames indexed by
        (noc, workshop.number). Default to the notebook-level
        ``x_cont_agg`` and ``x_disc_agg``.
    y_share : DataFrame, optional
        Frame with the same index holding the 'increase', 'constant',
        'decrease' and 'not_increase' columns. Defaults to the
        notebook-level ``y_share_agg``.

    Returns
    -------
    dict with keys 'cont' and 'disc'. Each maps ``tuple(feature_row)`` to a
    dict with:
      * 'noc'          - first level of the row's index label,
      * 'share'        - (increase, constant, decrease),
      * 'share_binned' - (increase, not_increase).

    Rows are matched to ``y_share`` by their index label rather than by
    position, so the pairing stays correct even if the two frames differ in
    row order or row count.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if x_cont is None:
        x_cont = x_cont_agg
    if x_disc is None:
        x_disc = x_disc_agg
    if y_share is None:
        y_share = y_share_agg

    noc_dict = {'cont': {}, 'disc': {}}

    for dist, xs in (('cont', x_cont), ('disc', x_disc)):
        for label, features in xs.iterrows():
            # `label` is the (noc, workshop.number) tuple of this feature row.
            y_row = y_share.loc[label]
            # The entries for the 'absolute' answers ('abs': more/same/fewer and
            # 'abs_binned': more/not_increase) are intentionally not built here.
            noc_dict[dist][tuple(features)] = {
                'noc':          label[0],
                'share':        (y_row['increase'],
                                 y_row['constant'],
                                 y_row['decrease']),
                'share_binned': (y_row['increase'],
                                 y_row['not_increase']),
            }
    return noc_dict

In [36]:
# get aggregated data
# Features and answer proportions indexed by (noc, workshop.number).

data = load_data_agg('../tables/noc_answers.csv')
x_cont_agg, x_disc_agg = get_x_agg(data)
y_abs_agg = get_y_abs_agg(data)
y_share_agg = get_y_share_agg(data)

# get individual data
# NOTE: `data` is rebound here; from this point on it holds the per-answer
# frame from the same CSV, not the aggregated one loaded above.
data = load_data_ind('../tables/noc_answers.csv')
nocs = get_nocs(data)  
x_cont_ind, x_disc_ind = get_x_ind(data)
y_abs_ind, y_abs_bin_ind, y_share_ind, y_share_bin_ind = get_y_ind(data)

# create noc dict
# create_noc_dict reads the globals x_cont_agg, x_disc_agg and y_share_agg,
# so it must run after the aggregated frames above have been built.
noc_dict = create_noc_dict()

In [55]:
# Print the number of values in the first row of each discretised feature
# frame (individual-level, then aggregated) to compare their widths.
print(len(next(x_disc_ind.iterrows())[1]))
print(len(next(x_disc_agg.iterrows())[1]))

121
121


In [56]:
# Column labels of the individual-level discretised features; in the recorded
# output the workshop number appears first, as 'workshop.number'.
x_disc_ind.columns

Index(['workshop.number', 'value.Reading Comprehension',
       'value.Active Listening', 'value.Writing', 'value.Speaking',
       'value.Mathematics Skill', 'value.Science', 'value.Critical Thinking',
       'value.Active Learning', 'value.Learning Strategies',
       ...
       'value.English Language', 'value.Foreign Language', 'value.Fine Arts',
       'value.History and Archeology', 'value.Philosophy and Theology',
       'value.Public Safety and Security', 'value.Law and Government',
       'value.Telecommunications', 'value.Communications and Media',
       'value.Transportation'],
      dtype='object', length=121)

In [58]:
# Column labels of the aggregated discretised features; get_x_agg appends the
# workshop number last, as 'work_num', so the column order differs from the
# individual-level frame even though both have the same number of columns.
x_disc_agg.columns

Index(['value.Reading Comprehension', 'value.Active Listening',
       'value.Writing', 'value.Speaking', 'value.Mathematics Skill',
       'value.Science', 'value.Critical Thinking', 'value.Active Learning',
       'value.Learning Strategies', 'value.Monitoring',
       ...
       'value.Foreign Language', 'value.Fine Arts',
       'value.History and Archeology', 'value.Philosophy and Theology',
       'value.Public Safety and Security', 'value.Law and Government',
       'value.Telecommunications', 'value.Communications and Media',
       'value.Transportation', 'work_num'],
      dtype='object', length=121)

In [17]:
# Show every row of the raw CSV that has at least one missing value; these are
# the rows that the load_data_* helpers remove with dropna().
df = pd.read_csv('../tables/noc_answers.csv')
# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
nan_rows = df[df.isnull().any(axis=1)]
nan_rows

Unnamed: 0.1,Unnamed: 0,noc_code,workshop.number,noc,absolute,share,value.Reading Comprehension,value.Active Listening,value.Writing,value.Speaking,...,value.English Language,value.Foreign Language,value.Fine Arts,value.History and Archeology,value.Philosophy and Theology,value.Public Safety and Security,value.Law and Government,value.Telecommunications,value.Communications and Media,value.Transportation
1293,1294,6322,5,Cooks,,increase,3.12,3.0,2.75,3.12,...,2.8,1.45,1.11,1.12,1.51,2.19,1.39,1.72,1.72,1.36


In [18]:
# Preview the aggregated 'absolute' answers: per-label proportions, the group
# size ('sum'), and the derived 'y' / 'binned_y' labels.
y_abs_agg.head()

Unnamed: 0_level_0,absolute,fewer,more,same,sum,not_increase,y,binned_y
noc,workshop.number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accommodation service managers,4,0.157895,0.631579,0.210526,19.0,0.368421,more,more
Accounting technicians and bookkeepers,4,0.578947,0.263158,0.157895,19.0,0.736842,fewer,not_increase
"Air pilots, flight engineers and flying instructors",3,0.1,0.65,0.25,20.0,0.35,more,more
Air transport ramp attendants,3,0.35,0.4,0.25,20.0,0.6,more,not_increase
Airline ticket and service agents,3,0.75,0.1,0.15,20.0,0.9,fewer,not_increase


In [19]:
# Preview the aggregated 'share' answers: per-label proportions, the group
# size ('sum'), and the derived 'y' / 'binned_y' labels.
y_share_agg.head()

Unnamed: 0_level_0,share,constant,decrease,increase,sum,not_increase,y,binned_y
noc,workshop.number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accommodation service managers,4,0.473684,0.105263,0.421053,19.0,0.578947,constant,not_increase
Accounting technicians and bookkeepers,4,0.368421,0.473684,0.157895,19.0,0.842105,decrease,not_increase
"Air pilots, flight engineers and flying instructors",3,0.5,0.1,0.4,20.0,0.6,constant,not_increase
Air transport ramp attendants,3,0.25,0.45,0.3,20.0,0.7,decrease,not_increase
Airline ticket and service agents,3,0.25,0.7,0.05,20.0,0.95,decrease,not_increase


In [20]:
# Preview the first five individual-level 'absolute' answers.
y_abs_ind[:5]

array(['same', 'same', 'same', 'fewer', 'fewer'], dtype=object)