In [247]:
import pandas as pd
import numpy as np
import networkx as nx

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFwe, f_classif # f score

In [2]:
atom = pd.read_pickle('atom.pkl')
bond = pd.read_pickle('bond.pkl')
molecule = pd.read_pickle('molecule.pkl')

In [3]:
atom.head()

Unnamed: 0,atom_id,charge,element,molecule_id,type
0,d100_1,-0.128,c,d100,22
1,d100_10,0.132,h,d100,3
2,d100_11,0.002,c,d100,29
3,d100_12,-0.128,c,d100,22
4,d100_13,-0.128,c,d100,22


In [4]:
bond.head()

Unnamed: 0,atom1_id,atom2_id,type
0,d100_1,d100_2,7
1,d100_1,d100_7,1
2,d100_11,d100_12,7
3,d100_12,d100_13,7
4,d100_12,d100_17,1


In [5]:
molecule.head()

Unnamed: 0,ind1,inda,logp,lumo,molecule_id,mutagenic
0,1,0,4.23,-1.246,d1,yes
1,1,0,4.62,-1.387,d10,yes
2,0,0,2.68,-1.034,d100,no
3,1,0,6.26,-1.598,d101,yes
4,1,0,2.4,-3.172,d102,yes


Aim of this dataset is to determine if molecules are mutagenic.

Outline of procedure to apply RELAGG is to:
1. Determine all the pairwise id relationships used to join
2. Determine the star schema which we wish to aggregate
3. Produce a flattened dataset as needed

---

To produce the correct dataset I propose that we:

*  produce pair wise relationship `molecule_id -> atom_id -> [atom1_id, atom2_id] -> atom_id`

In [26]:
m_atom = atom[['molecule_id', 'atom_id']].to_dict(orient='records')
atom_a1 = bond[['atom1_id', 'atom2_id']].rename(columns={'atom2_id':'atom_id'}).to_dict(orient='records')
atom_a2 = bond[['atom1_id', 'atom2_id']].rename(columns={'atom1_id':'atom_id'}).to_dict(orient='records')

# produce a flatten structure with all data

In [27]:
m_atom_pairs = [("molecule_id:{}".format(x['molecule_id']), "atom_id:{}".format(x['atom_id']))
  for x in m_atom
]
atom_a2_pairs = [("atom_id:{}".format(x['atom_id']), "atom_id:{}".format(x['atom2_id']))
  for x in atom_a2
]
atom_a1_pairs = [("atom_id:{}".format(x['atom_id']), "atom_id:{}".format(x['atom1_id']))
  for x in atom_a1
]


In [174]:
all_molecule_id = list(set([x[0] for x in m_atom_pairs]))

G = nx.Graph() # we have cycles in this representation
G.add_nodes_from(all_molecule_id) # orient from entity level of molecule
G.add_nodes_from([x[1] for x in m_atom_pairs]) # add all atoms
G.add_edges_from(m_atom_pairs)
G.add_edges_from(atom_a2_pairs)
G.add_edges_from(atom_a1_pairs)

In [175]:
# To get all atoms connected to molecule we would do this:
list(nx.single_source_shortest_path_length(G, all_molecule_id[0]).keys())

['molecule_id:d88',
 'atom_id:d88_1',
 'atom_id:d88_10',
 'atom_id:d88_11',
 'atom_id:d88_12',
 'atom_id:d88_13',
 'atom_id:d88_14',
 'atom_id:d88_15',
 'atom_id:d88_16',
 'atom_id:d88_17',
 'atom_id:d88_18',
 'atom_id:d88_19',
 'atom_id:d88_2',
 'atom_id:d88_3',
 'atom_id:d88_4',
 'atom_id:d88_5',
 'atom_id:d88_6',
 'atom_id:d88_7',
 'atom_id:d88_8',
 'atom_id:d88_9']

In [176]:
def clean_molecule_atom_rship(ls):
    atoms = [x.split(':')[1] for x in ls if not x.startswith("molecule")]
    molecule = [x.split(':')[1] for x in ls if x.startswith("molecule")][0]
    return (molecule, atoms)

In [177]:
# get all relationships of interest at molecule level
molecule_level_rship = [
    clean_molecule_atom_rship(list(nx.single_source_shortest_path_length(G, x).keys())) for x in 
    all_molecule_id
]
molecule_level_rship[0]

('d88',
 ['d88_1',
  'd88_10',
  'd88_11',
  'd88_12',
  'd88_13',
  'd88_14',
  'd88_15',
  'd88_16',
  'd88_17',
  'd88_18',
  'd88_19',
  'd88_2',
  'd88_3',
  'd88_4',
  'd88_5',
  'd88_6',
  'd88_7',
  'd88_8',
  'd88_9'])

From here we will need to aggregate all this data to a single record. 

Assume columns:
*  `type` are categorical

We will only use te `atom` table as the flat entity table.

In [178]:
def dataframe_info(df, prefix='num'):
    """
    returns a dataframe with a single row, 
    where the column names as `col_{statistic}`
    """
    #df = pd.DataFrame({col: srs})
    def percentile(n):
        def percentile_(x):
            return np.percentile(x, n)
        percentile_.__name__ = 'percentile_%s' % n
        return percentile_
    df_desc = df.groupby([True]*len(df)).agg([np.sum, np.mean, np.std, np.median,
                    np.var, np.min, np.max, percentile(5), percentile(25), 
                    percentile(50), percentile(75), percentile(95)])
    df_desc.reset_index(drop=True, inplace=True)
    df_desc.columns = [('_{}_'.format(prefix)).join(x).strip() for x in df_desc.columns.values]
    return df_desc

In [214]:
def dataset_info(df, entity_name, entity_id, numeric=None, factor=None, factor_num = 50):
    """
    will flatten a dataset based on numeric and factor variables
    factor variables will change to one-hot encoding.
    """
    df_info = []
    if numeric is None:
        newdf = dataframe_info(df.select_dtypes(include=[np.number]))
    else:
        newdf = dataframe_info(df[numeric])
        
    # factor information will one hot if unique number is less than 50
    if factor is None:
        factor = []
        for col in df.columns:
            if len(df[col].unique()) <= factor_num and not col.startswith(entity_name):
                factor.append(col)
    
    newfactor = dataframe_info(pd.get_dummies(df[factor]), 'factor')
    flatten_df = pd.concat([pd.DataFrame({entity_name:[entity_id]}), 
                            newdf, newfactor], axis=1)
    return flatten_df


In [215]:
list(dataset_info(atom[atom['atom_id'].isin(molecule_level_rship[0][1])].drop('atom_id', axis=1), 
             'molecule_id', 
            molecule_level_rship[0][0]).columns)

['molecule_id',
 'charge_num_sum',
 'charge_num_mean',
 'charge_num_std',
 'charge_num_median',
 'charge_num_var',
 'charge_num_amin',
 'charge_num_amax',
 'charge_num_percentile_5',
 'charge_num_percentile_25',
 'charge_num_percentile_50',
 'charge_num_percentile_75',
 'charge_num_percentile_95',
 'type_num_sum',
 'type_num_mean',
 'type_num_std',
 'type_num_median',
 'type_num_var',
 'type_num_amin',
 'type_num_amax',
 'type_num_percentile_5',
 'type_num_percentile_25',
 'type_num_percentile_50',
 'type_num_percentile_75',
 'type_num_percentile_95',
 'charge_factor_sum',
 'charge_factor_mean',
 'charge_factor_std',
 'charge_factor_median',
 'charge_factor_var',
 'charge_factor_amin',
 'charge_factor_amax',
 'charge_factor_percentile_5',
 'charge_factor_percentile_25',
 'charge_factor_percentile_50',
 'charge_factor_percentile_75',
 'charge_factor_percentile_95',
 'type_factor_sum',
 'type_factor_mean',
 'type_factor_std',
 'type_factor_median',
 'type_factor_var',
 'type_factor_amin'

In [216]:
# construct this for all columns...
molecule_atom_flat = pd.concat([
    dataset_info(atom[atom['atom_id'].isin(x[1])].drop('atom_id', axis=1), 
             'molecule_id', 
            x[0])
    for x in molecule_level_rship
])

In [217]:
molecule_atom_flat.head()

Unnamed: 0,charge_factor_amax,charge_factor_amin,charge_factor_mean,charge_factor_median,charge_factor_percentile_25,charge_factor_percentile_5,charge_factor_percentile_50,charge_factor_percentile_75,charge_factor_percentile_95,charge_factor_std,...,type_num_mean,type_num_median,type_num_percentile_25,type_num_percentile_5,type_num_percentile_50,type_num_percentile_75,type_num_percentile_95,type_num_std,type_num_sum,type_num_var
0,0.8,-0.58,1.1686560000000002e-17,0.001,-0.129,-0.418,0.001,0.13,0.575,0.312473,...,17.736842,22.0,3.0,3.0,22.0,22.0,40.2,14.324967,337,205.204678
0,0.81,-0.39,1.1895250000000001e-17,-0.121,-0.2485,-0.39,-0.121,0.139,0.485,0.332562,...,31.642857,22.0,22.0,3.0,22.0,39.5,92.0,28.399911,443,806.554945
0,0.806,-0.394,4.625929e-18,0.005,-0.12425,-0.35365,0.005,0.07525,0.136,0.226129,...,16.458333,22.0,3.0,3.0,22.0,22.0,39.7,12.775585,395,163.21558
0,0.804,-0.396,2.9738120000000002e-18,0.005,-0.125,-0.30115,0.005,0.134,0.134,0.214389,...,16.035714,22.0,3.0,3.0,22.0,22.0,39.3,12.568279,449,157.96164
0,0.817,-0.384,-2.312965e-18,-0.114,-0.18125,-0.384,-0.114,0.146,0.817,0.366888,...,24.958333,22.0,22.0,3.0,22.0,38.5,40.0,13.763465,599,189.432971


In [238]:
modelling_dataset = molecule.merge(molecule_atom_flat)

In [240]:
X = modelling_dataset.drop(['molecule_id', 'mutagenic'], axis=1)
# fill all na with mean - there will be nas if certain classes weren't in the dataset...
#X = X.apply(lambda x: x.fillna(x.mean()),axis=0) # for mean
X = X.fillna(0)

y = modelling_dataset['mutagenic'].tolist()

In [241]:
X_m = X.as_matrix()

In [249]:
model = make_pipeline(SelectFwe(f_classif, alpha=0.05), SGDClassifier())
model = SGDClassifier(penalty='l1')
kfold = KFold(n_splits=10, shuffle=False, random_state=42)
results = cross_val_score(model, X, y, cv=kfold)
print("Accuracy: {}".format(results.mean()))

Accuracy: 0.6403508771929824
