In [18]:
def tree_paths(tree):
    
    children_left = tree.children_left
    children_right = tree.children_right
    values = tree.value
    
    tree_paths = []
    tree_probs = []
    path = []
    path_probs = []
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()    
               
        while len(path)>parent_depth+1:
            path.pop()
            path_probs.pop()
        path.append(node_id)
        true = values[node_id][0][0].copy()
        false = values[node_id][0][1].copy()
        path_probs.append(round(true/(true+false),3))
        
        # If we have a test node
        if (children_left[node_id] != children_right[node_id]):
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            tree_paths.append(path.copy())
            tree_probs.append(path_probs.copy())
            
    return tree_paths, tree_probs

In [19]:
def forest_paths(model):    
    forest_attributes = []
    baseline_means = []
    
    for treeEst in model.estimators_:
        path_thresholds = []
        path_features = []
        path_probs = []
        
        children_left = treeEst.tree_.children_left
        children_right = treeEst.tree_.children_right
        all_thresholds = treeEst.tree_.threshold
        all_features = treeEst.tree_.feature

        paths, path_probs = tree_paths(treeEst.tree_)
        
        leaf_probs = []
        for prob_list in path_probs:
                leaf_probs.append(prob_list[len(prob_list)-1])
        baseline_means.append(np.mean(leaf_probs))

        for i in range(len(paths)):
            thresholds = []
            features = []
            for j in range(len(paths[i])-1):
                if paths[i][j+1] == children_right[paths[i][j]]:
                    thresholds.append(all_thresholds[paths[i][j]])
                else:
                    thresholds.append(-all_thresholds[paths[i][j]])
                features.append(all_features[paths[i][j]])
            path_thresholds.append(thresholds.copy())
            path_features.append(features.copy())
    
        tree_atributes = pd.DataFrame([path_features, path_thresholds, path_probs]).T
        tree_atributes.columns = ['features','thresholds','path_probs']
        forest_attributes.append(tree_atributes.copy())
    return forest_attributes, np.mean(baseline_means)

In [20]:
def init_influence_list(features,conditionals,product):
    if product:
        combo_index = pd.MultiIndex.from_product([features,conditionals])
    else:
        combo_index = pd.MultiIndex.from_arrays([features,conditionals])
    
    infl_lists = []
    for i in range(combo_index.to_series().shape[0]):
        infl_lists.append([])
    
    influences = pd.Series(infl_lists,index=combo_index)
    
    return influences        

In [21]:
def get_influences(feature_combos,model):
    forest_attributes, baseline = forest_paths(model)
    
    for tree_frame in forest_attributes:
        for index, path in tree_frame.iterrows():
            previous = ['blank']
            for step in range(len(path['features'])-1):
                #what features we have seen and the feature we are on
                current_feature = path['features'][step]
                direction = np.sign(path['thresholds'][step])
                if current_feature in feature_combos.index.get_level_values(0):
                    #make relevant calculations
                    current_prob = path['path_probs'][step]
                    next_prob = path['path_probs'][step+1]
                    pct_change = (next_prob-current_prob)/current_prob
                    influence = direction*pct_change
                    #add to all relevant combos
                    for combo in feature_combos.loc[current_feature,previous]:
                        combo.append(influence)
                previous.append(current_feature*direction)
                
    influences_df = pd.DataFrame(index=feature_combos.index)
    
    for index, influences in feature_combos.iteritems():
        if influences:
            influences = np.asarray(influences)
            influences_df.loc[index,'pos_influence'] = np.mean(influences[influences>0])
            influences_df.loc[index,'neg_influence'] = np.mean(influences[influences<=0])
            influences_df.loc[index,'pct_pos'] = np.sum(influences>0)/len(influences)
            influences_df.loc[index,'occurance count'] = len(influences)
    return influences_df

In [22]:
def feature_name_index(table,features):
    
    conditional_labels = []#I know there is a better way to do this
    for condition in table.index.get_level_values(1):
        if condition == 'blank':
            conditional_labels.append(condition)
        elif condition >=0:
            conditional_labels.append("high "+features[condition])
        else:
            conditional_labels.append("low "+features[-1*condition])
    
    table_index = pd.MultiIndex.from_arrays([
        features[table.index.get_level_values(0)],conditional_labels])

    table.index = table_index
    
    return table

In [23]:
%run ../utils_rf

In [24]:
#read training data
file = "../../../tables/model_input/noc_answers.csv"
x, x_agg, y, y_agg, x_noclvl, y_noclvl = data_proccess(file,True)
x.drop(['work_num_1','work_num_2','work_num_3','work_num_4','work_num_5','work_num_6'],axis=1,inplace=True)

#grab just the noc codes to cut out test nocs from main noc table
train_nocs = pd.read_csv(file,usecols=['noc_code']).drop_duplicates()

In [8]:
rf = RandomForestClassifier(**init_params('cat'))
rf.fit(x,y['increase'])

#rf_dec = RandomForestClassifier(**init_params('cat'))
#rf_dec.fit(x,y['decrease'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
rf_features = pd.Series(rf.feature_importances_,index=x.columns)
selected_features = rf_features.sort_values(ascending=False).iloc[0:20]
selected_features_ix = np.where(np.isin(x.columns,selected_features.index))[0]
selected_features = x.columns[selected_features_ix]

In [9]:
#analsysis of non SFFS increase and decrease models
#making combo sets to check
conditionals = list(range(120))+list(range(-119,0))+['blank']
all_combos = init_influence_list(range(120),conditionals,True)

#running the analysis
influences = feature_name_index(get_influences(all_combos,rf),x.columns)
# dec_influences = feature_name_index(get_influences(all_combos,rf_dec),x.columns)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-f045fe9170b2>", line 7, in <module>
    influences = feature_name_index(get_influences(all_combos,rf),x.columns)
  File "<ipython-input-4-48b757171b47>", line 11, in get_influences
    if current_feature in feature_combos.index.get_level_values(0):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\numeric.py", line 180, in __contains__
    return key in self._engine
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2033, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above ex

KeyboardInterrupt: 

In [None]:
#SFFS structual analsysis
sffs_conditionals = list(range(0,13))+list(range(-13,0))+['blank']
sffs_combos = init_influence_list(range(0,13),sffs_conditionals,True)
sffs_influences = feature_name_index(get_influences(sffs_combos,rf_sffs),x_sffs.columns)

In [18]:
non_cond_inf = influences.xs('blank',level=1)
non_cond_inf.loc[non_cond_inf['pct_pos']>0.95]

Unnamed: 0,pos_influence,neg_influence,pct_pos,occurance count
value.Persuasion,0.379083,-0.153903,0.96193,1865.0
value.Instructing,0.319096,-0.14594,0.962264,901.0
value.Service Orientation,0.298379,-0.160304,0.960057,1402.0
value.Systems Evaluation,0.341401,-0.116726,0.970628,1464.0
value.Fluency of Ideas,0.406025,-0.128103,0.98021,1617.0
value.Originality,0.3477,-0.156877,0.95612,1299.0
value.Memorization,0.303741,-0.139704,0.989091,1375.0
value.Fine Arts,0.149403,-0.064652,0.959016,366.0


In [13]:
non_cond_inf_dec = dec_influences.xs('blank',level=1)
non_cond_inf_dec.loc[non_cond_inf_dec['pct_pos']>0.8]

NameError: name 'dec_influences' is not defined

In [19]:
#getting portions instead of path counts
influences = pd.merge(influences.reset_index(),
      non_cond_inf['occurance count'].reset_index(),
      left_on=['level_0'],
      right_on=['index'],
      how='inner').set_index(['level_0','level_1']).drop('index',axis=1)

influences['occurance pct'] = influences['occurance count_x']/influences['occurance count_y']

Ok we need to check for consistency here

In [11]:
top10set = []
for i in range(10):
    rf = RandomForestClassifier(**init_params('cat')).fit(x,y['increase'])
    conditionals = list(range(120))+list(range(-119,0))+['blank']
    all_combos = init_influence_list(range(120),conditionals,True)
    influences = feature_name_index(get_influences(all_combos,rf),x.columns)
    non_cond_info = influences.xs('blank',level=1)
    top10set.append(non_cond_info.loc[non_cond_info['pct_pos']>0.95].copy())

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [148]:
top10Decset = []
for i in range(10):
    rf_dec = RandomForestClassifier(**init_params('cat')).fit(x,y['decrease'])
    conditionals = list(range(120))+list(range(-119,0))+['blank']
    all_combos = init_influence_list(range(120),conditionals,True)
    influences = feature_name_index(get_influences(all_combos,rf_dec),x.columns)
    non_cond_inf = influences.xs('blank',level=1)
    top10Decset.append(non_cond_inf.loc[non_cond_inf['pct_pos']>0.95].copy())

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [16]:
fullset = []
for bestset in top10set:
    fullset.append(bestset.index.get_values())

In [151]:
full_dec_set = []
for bestset in top10Decset:
    full_dec_set.append(bestset.index.get_values())

In [17]:
cons_set = np.unique(np.concatenate(fullset),return_counts =True)

In [153]:
cons_dec_set = np.unique(np.concatenate(full_dec_set),return_counts =True)

In [18]:
pd.DataFrame({'SAK':cons_dec_set[0],'count':cons_dec_set[1]})

NameError: name 'cons_dec_set' is not defined

In [23]:
over09 = pd.DataFrame({'SAK':cons_set[0],'count':cons_set[1]})
over09.loc[over09['count']>=5]

Unnamed: 0,SAK,count
0,value.Active Listening,8
1,value.Customer and Personal Service,8
6,value.Fine Arts,6
7,value.Fluency of Ideas,10
8,value.Installation,8
9,value.Instructing,10
12,value.Memorization,10
13,value.Number Facility,7
14,value.Originality,10
15,value.Persuasion,10


analysis of thresholds

In [14]:
paths = pd.concat(forest_paths(rf)[0])

In [15]:
thresholds = np.empty([2,0])

for index, path in paths.iterrows():
    thresholds = np.append(
        thresholds,
        np.vstack([path['features'],path['thresholds']]),
        axis=1
    )
thresholds = pd.DataFrame({'feature':thresholds[0],'threshold':thresholds[1]})

In [56]:
np.where(x.columns == 'value.Technology Design')

(array([18]),)

In [57]:
thresh_portion = pd.DataFrame(
    np.unique(abs(thresholds.loc[thresholds['feature']==18]['threshold']),return_counts = True)
).T
thresh_portion['pct']=thresh_portion[1]/np.sum(thresh_portion[1])
thresh_portion.sort_values('pct',ascending=False)

Unnamed: 0,0,1,pct
2,2.5,685.0,0.610517
0,1.5,435.0,0.387701
1,2.0,2.0,0.001783


Conditionals

In [25]:
# again we need shit to be consitent so we run 10 times and take averages
for i in range(10):
    rf = RandomForestClassifier(**init_params('cat')).fit(x,y['increase'])
    conditionals = list(range(120))+list(range(-119,0))+['blank']
    all_combos = init_influence_list(range(120),conditionals,True)
    current_infl = feature_name_index(get_influences(all_combos,rf),x.columns).fillna(0)

    pairs = current_infl[
    np.logical_not(
        np.in1d(current_infl.index.get_level_values(1), 'blank')
    )].copy()
    
    current_sig_pairs = pairs.loc[np.logical_or(pairs['pct_pos']>0.95,pairs['pct_pos']<0.05)].copy()
    current_sig_pairs['count']=1
    if i==0:
        sig_pairs = current_sig_pairs
    else:
        common_idx = sig_pairs.index.intersection(current_sig_pairs.index) #grab the pair we have seen
        for_update = sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()#make lists of old and new info
        update_with = current_sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()
        new_pairs = current_sig_pairs.loc[~current_sig_pairs.index.isin(common_idx)].copy()#new rows
        counts = sig_pairs.loc[common_idx,'count'].copy()
        updated  = (for_update.mul(counts,axis=0).add(update_with,axis=0)).div(counts+1,axis=0)
        
        sig_pairs.loc[common_idx,sig_pairs.columns != 'count'] = updated.copy()
        sig_pairs.loc[common_idx,'count'] = sig_pairs.loc[common_idx,'count']+1
        sig_pairs = pd.concat([sig_pairs,new_pairs])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [26]:
#define signficance markers based on how many times we ran everything
sig_pairs['significance'] = 'not sig'
sig_pairs.loc[sig_pairs['count']>i/4,'significance'] = '*'
sig_pairs.loc[sig_pairs['count']>2*i/4,'significance'] = '**'
sig_pairs.loc[sig_pairs['count']>3*i/4,'significance'] = '***'
sig_pairs = sig_pairs.loc[sig_pairs['significance']!='not sig']

In [49]:
law_pairs = sig_pairs.loc[sig_pairs.index.get_level_values(0).str.contains('Law')]
neg_law_pairs = law_pairs.loc[law_pairs['pct_pos']<0.05]
neg_law_pairs = neg_law_pairs.loc[neg_law_pairs.index.get_level_values(1).str.contains('low')]
neg_law_pairs.loc[np.logical_or(neg_law_pairs['significance']=='***',
                                neg_law_pairs['significance']=='**')].sort_values('neg_influence')

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,conditional,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
value.Law and Government,low value.Arm-Hand Steadiness,0.0,-0.243251,0.0,4.0,9,***
value.Law and Government,low value.Education and Training,0.0,-0.19144,0.0,11.166667,6,**
value.Law and Government,low value.Technology Design,0.224463,-0.183386,0.02695,100.0,6,**
value.Law and Government,low value.Dynamic Strength,0.0,-0.174063,0.0,21.2,5,**
value.Law and Government,low value.Auditory Attention,0.0,-0.171301,0.0,18.8,5,**
value.Law and Government,low value.Administration and Management,0.0,-0.166898,0.0,16.111111,9,***
value.Law and Government,low value.Rate Control,0.070652,-0.161239,0.006667,19.833333,6,**
value.Law and Government,low value.Clerical,0.005042,-0.154846,0.009091,34.4,5,**
value.Law and Government,low value.Operation Monitoring,0.008258,-0.15042,0.004938,17.333333,9,***
value.Law and Government,low value.Operation and Control,0.0,-0.140369,0.0,15.333333,6,**


In [47]:
comp_pairs = sig_pairs.loc[sig_pairs.index.get_level_values(0).str.contains('Computers')]
neg_comp_pairs = comp_pairs.loc[comp_pairs['pct_pos']<0.05]
neg_comp_pairs = neg_comp_pairs.loc[neg_comp_pairs.index.get_level_values(1).str.contains('low')]
neg_comp_pairs.loc[np.logical_or(neg_comp_pairs['significance']=='***',
                                neg_comp_pairs['significance']=='**')].sort_values('neg_influence')

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,conditional,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
value.Computers and Electronics,low value.Speech Clarity,0.0,-0.442977,0.0,18.8,5,**
value.Computers and Electronics,low value.Originality,0.011905,-0.429434,0.007843,55.2,5,**
value.Computers and Electronics,low value.Systems Evaluation,0.035805,-0.390503,0.014378,39.166667,6,**
value.Computers and Electronics,low value.Active Listening,0.038627,-0.389307,0.007576,23.0,6,**
value.Computers and Electronics,low value.History and Archeology,0.0,-0.200634,0.0,12.875,8,***
value.Computers and Electronics,low value.Communications and Media,0.0,-0.179919,0.0,5.5,6,**
value.Computers and Electronics,low value.Speed of Limb Movement,0.0,-0.16155,0.0,4.777778,9,***
value.Computers and Electronics,low value.Oral Comprehension,0.0,-0.129044,0.0,1.9,10,***
value.Computers and Electronics,low value.Science,0.0,-0.127272,0.0,7.75,8,***
value.Computers and Electronics,low value.Equipment Selection,0.0,-0.072148,0.0,3.142857,7,***


In [48]:
chem_pairs = sig_pairs.loc[sig_pairs.index.get_level_values(0).str.contains('Chemistry')]
neg_chem_pairs = chem_pairs.loc[chem_pairs['pct_pos']<0.05]
neg_chem_pairs = neg_chem_pairs.loc[neg_chem_pairs.index.get_level_values(1).str.contains('low')]
neg_chem_pairs.loc[np.logical_or(neg_chem_pairs['significance']=='***',
                                neg_chem_pairs['significance']=='**')].sort_values('neg_influence')

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,conditional,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
value.Chemistry,low value.Number Facility,0.015057,-0.286982,0.005319,9.875,8,***
value.Chemistry,low value.Memorization,0.043679,-0.216699,0.005871,41.0,7,***
value.Chemistry,low value.Psychology,0.0,-0.214948,0.0,9.166667,6,**
value.Chemistry,low value.Speech Clarity,0.0,-0.088822,0.0,4.222222,9,***
value.Chemistry,low value.Deductive Reasoning,0.0,-0.070255,0.0,5.222222,9,***
value.Chemistry,low value.Written Comprehension,0.0,-0.059364,0.0,0.4,10,***
value.Chemistry,low value.Mathematical Reasoning,0.0,-0.055765,0.0,4.125,8,***
value.Chemistry,low value.Speaking,0.0,-0.051684,0.0,0.7,10,***
value.Chemistry,low value.Speech Recognition,0.0,-0.04786,0.0,2.333333,9,***
value.Chemistry,low value.Oral Comprehension,0.0,-0.032945,0.0,1.2,10,***


In [27]:
#name index levels
sig_pairs.index.set_names(['main','conditional'],inplace=True)

In [17]:
foundational = ['value.Fluency of Ideas', 'value.Memorization', 'value.Instructing', 
                'value.Persuasion','value.Service Orientation']

In [28]:
sig_pairs.to_csv('../../../tables/model_output/sig_pairs.csv')

In [6]:
sig_pairs = pd.read_csv('../../../tables/model_output/sig_pairs.csv',index_col=['main','conditional'])

Individual Pairings for in body examples

In [26]:
sig_pairs.xs('high value.Administration and Management',level=1).sort_values(['significance','pos_influence'],ascending=False)

Unnamed: 0_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
value.Memorization,0.256990,0.000000,1.000000,18.133333,15,***
value.Service Orientation,0.234750,-0.004219,0.996825,23.333333,15,***
value.Psychology,0.177055,0.000000,1.000000,13.937500,16,***
value.Fine Arts,0.120420,0.000000,1.000000,12.944444,18,***
value.Social Perceptiveness,0.197264,0.000000,1.000000,9.769231,13,**
value.Customer and Personal Service,0.195893,0.000000,1.000000,22.916667,12,**
value.Chemistry,0.188286,-0.007407,0.995385,26.500000,10,**
value.Management of Material Resources,0.171340,0.000000,1.000000,4.900000,10,**
value.Computers and Electronics,0.164732,0.000000,1.000000,24.800000,10,**
value.Visual Color Discrimination,0.138677,0.000000,1.000000,6.400000,10,**


In [28]:
sig_pairs.xs('high value.Problem Sensitivity',level=1).sort_values(['significance','pos_influence'],ascending=False)

Unnamed: 0_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
value.Computers and Electronics,0.249367,0.000000,1.000000,23.866667,15,***
value.Technology Design,0.123466,0.000000,1.000000,27.400000,15,***
value.Memorization,0.267263,0.000000,1.000000,35.071429,14,**
value.Service Orientation,0.248153,0.000000,1.000000,26.428571,14,**
value.Explosive Strength,0.242702,0.000000,1.000000,14.166667,12,**
value.Visualization,0.230645,0.000000,1.000000,7.700000,10,**
value.Equipment Selection,0.218067,0.000000,1.000000,17.600000,10,**
value.Instructing,0.194762,0.000000,1.000000,10.142857,14,**
value.Arm-Hand Steadiness,0.190443,0.000000,1.000000,12.916667,12,**
value.Psychology,0.189002,-0.001419,0.997222,18.166667,12,**


In [15]:
sig_pairs.xs('high value.Active Listening',level=1).sort_values(['significance','pos_influence'],ascending=False)

Unnamed: 0_level_0,pos_influence,neg_influence,pct_pos,occurance count,count,significance
main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
value.Education and Training,0.22899,0.0,1.0,11.4,15,***
value.Memorization,0.283647,-0.021542,0.997658,27.214286,14,**
value.Mechanical,0.247791,0.0,1.0,15.1,10,**
value.Service Orientation,0.245321,0.0,1.0,14.6,10,**
value.Visualization,0.236069,0.0,1.0,24.7,10,**
value.Explosive Strength,0.230288,0.0,1.0,14.272727,11,**
value.Design,0.220413,0.0,1.0,20.5,12,**
value.Chemistry,0.206293,0.0,1.0,19.769231,13,**
value.Technology Design,0.180733,0.0,1.0,22.583333,12,**
value.Auditory Attention,0.180149,0.0,1.0,12.916667,12,**


What makes knowledges more useful

In [14]:
knowledges = np.unique(pd.read_excel('../../../raw_data/ONET_data/knowledge.xlsx')['Element Name'])
knowledges = 'value.' + pd.Series(knowledges).astype(str)
knowledges = knowledges.append(pd.Series(['value.Mathematics Knowledge']))

In [18]:
#filter out non knowledges, left direction pairings
knowledge_pairs = sig_pairs.loc[np.isin(sig_pairs.index.get_level_values(0),knowledges)]
right_dirs = [i for i,item in 
              enumerate(knowledge_pairs.index.get_level_values(1)) 
              if "high" in item]
knowledge_pairs = knowledge_pairs.iloc[right_dirs]
knowledge_pairs = knowledge_pairs.loc[
    np.logical_not(np.isin(knowledge_pairs.index.get_level_values(1),
                           ('high ' + pd.Series(foundational).astype(str))))
]

In [21]:
#sort by pos_influence. grab the top 3 for each knowledge-signficance combo
knowledge_pairs = knowledge_pairs.sort_values('pos_influence',ascending=False)
knowledge_pairs.groupby(
    [knowledge_pairs.index.get_level_values(0),'significance']
).head(3).sort_values(['main','significance'],
                      ascending = False).to_csv('../../../tables/model_output/knowledge_pairs.csv')

If I have a SKA, what else should I get

In [27]:
main_skas = pd.read_csv("../../../tables/processed_dem_tables/major_group_main_SKAs.csv")
main_skas['SKA'] = ['high '+ s for s in main_skas['SKA']]
helpful_pairs = sig_pairs.loc[np.isin(sig_pairs.index.get_level_values(1),main_skas['SKA'])]
helpful_pairs = helpful_pairs.loc[
    np.logical_not(np.isin(helpful_pairs.index.get_level_values(0),foundational))
]

temp = pd.DataFrame()
for occ_group in np.unique(main_skas['dig1']):
    base_skas = main_skas.loc[main_skas['dig1']==occ_group]["SKA"]
    occ_pairs = helpful_pairs.loc[
        np.isin(
            helpful_pairs.index.get_level_values(1),base_skas
        )].copy()
    occ_pairs['occ_group'] = occ_group
    occ_pairs.reset_index(inplace=True)
    occ_pairs.set_index(['main','conditional','occ_group'],inplace=True)

    temp = temp.append(occ_pairs)

helpful_pairs = temp.loc[temp['significance']!='not sig'].sort_values('pos_influence',ascending=False).copy()
helpful_pairs.groupby(
    [helpful_pairs.index.get_level_values(2),'significance']
).head(4).sort_values(['occ_group','significance'],
                      ascending=False).to_csv('../../../tables/model_output/occ_pairs.csv')

In [15]:
# again we need shit to be consitent so we run 10 times and take averages
for i in range(10):
    rf = RandomForestClassifier(**init_params('cat')).fit(x,y['increase'])
    conditionals = list(range(120))+list(range(-119,0))+['blank']
    all_combos = init_influence_list(range(120),conditionals,True)
    current_infl = feature_name_index(get_influences(all_combos,rf),x.columns).fillna(0)

#     pairs = current_infl[
#     np.logical_not(
#         np.in1d(current_infl.index.get_level_values(1), 'blank')
#     )].copy()
    
#     current_sig_pairs = pairs.loc[pairs['pct_pos']>0.95].copy()
    current_sig_pairs = current_infl
    current_sig_pairs['count']=1
    if i==0:
        sig_pairs = current_sig_pairs
    else:
        common_idx = sig_pairs.index.intersection(current_sig_pairs.index) #grab the pair we have seen
        for_update = sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()#make lists of old and new info
        update_with = current_sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()
        new_pairs = current_sig_pairs.loc[~current_sig_pairs.index.isin(common_idx)].copy()#new rows
        counts = sig_pairs.loc[common_idx,'count'].copy()
        updated  = (for_update.mul(counts,axis=0).add(update_with,axis=0)).div(counts+1,axis=0)
        
        sig_pairs.loc[common_idx,sig_pairs.columns != 'count'] = updated.copy()
        sig_pairs.loc[common_idx,'count'] = sig_pairs.loc[common_idx,'count']+1
        sig_pairs = pd.concat([sig_pairs,new_pairs])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [16]:
sig_traits = ['value.Active Listening','value.Customer and Personal Service','value.Fine Arts',
              'value.Fluency of Ideas','value.Installation','value.Instructing',
              'value.Memorization','value.Number Facility','value.Originality',
              'value.Persuasion','value.Philosophy and Theology','value.Service Orientation',
              'value.Systems Analysis','value.Systems Evaluation','value.Technology Design',
              'value.Visualization']

In [25]:
sig_pairs.loc[np.isin(sig_pairs.index.get_level_values(0),sig_traits)].xs('blank',level=1)

Unnamed: 0_level_0,pos_influence,neg_influence,pct_pos,occurance count,count
main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
value.Active Listening,0.254047,-0.154189,0.874591,311.5,10
value.Persuasion,0.37635,-0.167471,0.975152,1623.0,10
value.Instructing,0.32498,-0.131678,0.968777,1093.3,10
value.Service Orientation,0.304465,-0.144257,0.966862,1562.6,10
value.Technology Design,0.184293,-0.192784,0.957534,1311.1,10
value.Installation,0.150923,-0.15932,0.884502,356.8,10
value.Systems Analysis,0.340178,-0.133671,0.928313,874.1,10
value.Systems Evaluation,0.341443,-0.149077,0.959226,1098.3,10
value.Fluency of Ideas,0.393847,-0.126934,0.962507,1618.9,10
value.Originality,0.344119,-0.117609,0.942734,1043.2,10


In [29]:
sig_pairs.loc[sig_pairs.index.get_level_values(0)=='value.Deductive Reasoning'].xs('blank',level=1)

Unnamed: 0_level_0,pos_influence,neg_influence,pct_pos,occurance count,count
main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
value.Deductive Reasoning,0.248708,-0.180842,0.909868,344.3,10
