In [113]:
# Math Libraries
import scipy
import numpy as np
import pandas as pd

# Visalisation
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
# sb.set_style('whitegrid')

# I/O
import json
import xlrd

# Machine Learning
import sklearn
import sklearn.decomposition

# Utility
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Paths

In [114]:
path_to_transformed_data = "../../../data/transformed/"
path_to_data = "../../../data/"
molecular_descriptors_file = 'data/molecular_descriptors_data.txt'

## Load Training Data

In [115]:
# Load transformed perceptual descriptors
zero_imputation = pd.read_pickle(path_to_transformed_data + "zero_imputation.zip")
mean_imputation = pd.read_pickle(path_to_transformed_data + "mean_imputation.zip")
median_imputation = pd.read_pickle(path_to_transformed_data + "median_imputation.zip")
mean_median_average_imputation = pd.read_pickle(path_to_transformed_data + "mean_median_average_imputation.zip")

In [116]:
## Difference between the mean and median impuation (based on population responses to molecules)
(mean_imputation.describe() - median_imputation.describe())

# We see below that mean for mean impuation is consistently 4 - 6 points higher than for
# the median imputation method. Note that the standard deviation is  roughly 1pt
# larger for median imputation.

Unnamed: 0,Subject # (this study),SUBJECT,Age,VIAL #,INTENSITY,PLEASANTNESS,FAMILIARITY,EDIBLE,BAKERY,SWEET,...,ACID,WARM,MUSKY,SWEATY,URINOUS,DECAYED,WOOD,GRASS,FLOWER,CHEMICAL
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,0.0,0.0,0.0,0.0,0.114232,0.12501,-0.117469,4.428274,4.576739,3.944327,...,4.702461,5.86305,5.350347,5.447488,6.777017,6.997933,5.615787,5.668768,5.927218,4.076724
std,0.0,0.0,0.0,0.0,-1.236577,-0.094378,-0.327658,-3.243828,-2.166862,-1.194934,...,-1.230766,-1.561782,-1.302912,-0.843064,-1.241255,-0.925916,-1.59415,-1.913777,-1.540004,-1.43022
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,5.259259,0.480769,2.0,9.666667,7.166667,6.5,...,6.777778,8.285714,7.565217,6.6,9.692308,8.3,7.25,8.0,8.777778,7.931034
50%,0.0,0.0,0.0,0.0,-3.717105,0.459459,-2.34127,8.0,7.8,5.375,...,6.394737,8.033333,7.375,7.454545,9.2,9.5,7.0,7.818182,9.125,5.684211
75%,0.0,0.0,0.0,0.0,-0.5,-0.452055,-1.0,-0.121212,3.928571,3.785714,...,3.916667,4.777778,5.0,5.352941,7.384615,6.777778,5.75,5.333333,6.0,2.333333
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
# We COULD start by classifying any response equal to or above the median
# as 1 and anything less as 0 - this would severely affect dataset
median_imputation.iloc[:, 15:38]

Unnamed: 0,INTENSITY,PLEASANTNESS,FAMILIARITY,EDIBLE,BAKERY,SWEET,FRUIT,FISH,GARLIC,SPICES,...,ACID,WARM,MUSKY,SWEATY,URINOUS,DECAYED,WOOD,GRASS,FLOWER,CHEMICAL
0,59.0,64.0,66.0,14.0,11.0,16.0,1.0,14.0,12.0,17.0,...,16.5,9.0,25.0,28.0,22.0,23.0,13.0,7.0,4.0,33.5
1,38.0,60.0,44.0,14.0,11.0,57.0,1.0,14.0,12.0,17.0,...,16.5,9.0,25.0,28.0,22.0,23.0,13.0,7.0,4.0,33.5
2,58.0,34.0,16.0,25.0,4.0,34.0,18.0,17.0,26.5,20.0,...,22.0,11.5,22.0,15.5,11.0,63.0,8.0,25.0,3.0,17.0
3,2.0,0.0,0.0,25.0,4.0,34.0,18.0,17.0,26.5,20.0,...,22.0,11.5,22.0,15.5,11.0,10.0,8.0,25.0,3.0,17.0
4,0.0,48.0,0.0,28.0,23.0,21.0,27.0,36.0,24.5,7.0,...,16.0,14.0,26.0,19.0,21.0,25.0,5.0,14.5,24.0,35.0
5,100.0,0.0,78.0,28.0,23.0,21.0,27.0,36.0,24.5,29.0,...,16.0,14.0,73.0,19.0,21.0,25.0,5.0,14.5,24.0,35.0
6,80.0,11.0,5.0,4.0,1.0,31.0,45.0,5.0,5.5,6.0,...,15.0,16.5,93.0,18.5,17.0,23.5,16.0,14.0,14.5,23.0
7,89.0,5.0,1.0,4.0,1.0,31.0,45.0,5.0,5.5,6.0,...,15.0,16.5,92.0,89.0,17.0,23.5,16.0,14.0,14.5,23.0
8,0.0,47.0,0.0,6.0,12.5,25.0,19.5,0.0,3.0,14.0,...,25.0,4.0,27.0,35.0,33.0,23.0,46.0,50.0,14.0,23.0
9,0.0,52.0,0.0,6.0,12.5,25.0,19.5,0.0,3.0,7.0,...,25.0,4.0,27.0,35.0,33.0,23.0,46.0,50.0,14.0,23.0


In [118]:
# Using the zero imputation dataset all response greater than 0 are 
# transformed to 1 and remaing values are all 0 - creating a 
# dichotomous matrix for our multi label classification approach.
perceptual_responses  = pd.DataFrame(zero_imputation.copy())
perceptual_responses.iloc[:, 18:38] = zero_imputation.iloc[:, 18:38].applymap(lambda x: 
                                                                              1 if x > 0 else 0)
perceptual_responses.shape

(42238, 38)

## Load Molecular Data
---
This is our X matrix (descriptive features/independant variables). We will train on and try to predict multiple target features from the above Y matrix (target features/dependant variables) as a classification task (yes/no - 1/0). 

In [119]:
molecular_descriptors = pd.read_pickle(path_to_transformed_data + "MOL_min_max_dropna.zip")

In [120]:
molecular_descriptors.shape
molecular_descriptors.head()

(476, 1009)

Unnamed: 0,CID,complexity from pubmed,MW,AMW,Sv,Se,Sp,Si,Mv,Me,...,Depressant-50,Psychotic-80,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
0,126,0.181128,0.270753,0.030587,0.262264,0.219126,0.253846,0.214989,0.216981,0.425532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,176,0.060311,0.109331,0.025411,0.096943,0.105579,0.09094,0.107335,0.125214,0.659574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,177,0.020039,0.067721,0.015501,0.075556,0.083688,0.078074,0.089782,0.106346,0.382979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,180,0.051167,0.104208,0.011542,0.121231,0.131248,0.127898,0.139362,0.099485,0.269504,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,196,0.22179,0.333247,0.023779,0.306622,0.308572,0.294339,0.305729,0.138079,0.539007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Combine Datasets on CID

In [121]:
print("Molecular Descriptors: (rows, cols) => " + molecular_descriptors.shape.__str__())
print("Perceptual Descriptors: (rows, cols) => " + perceptual_responses.shape.__str__())

Molecular Descriptors: (rows, cols) => (476, 1009)
Perceptual Descriptors: (rows, cols) => (42238, 38)


In [122]:
print("Before: ")
type(molecular_descriptors['CID'][0])
type(perceptual_responses['CID'][0])

# convert from numpy.int64 & int to string so we can sucesfully merge dataframes 
molecular_descriptors.CID = molecular_descriptors.CID.apply(str)
perceptual_responses.CID = perceptual_responses.CID.apply(str)

print("After: ")
type(molecular_descriptors['CID'][0])
type(perceptual_responses['CID'][0])

Before: 


numpy.int64

int

After: 


str

str

In [123]:
# Assign the molecular descriptors to each subject observation based on corresponding CID number
combined_descriptors = pd.merge(perceptual_responses, molecular_descriptors, on='CID')

In [180]:
# combined dataframe contains 
combined_descriptors.shape

(41650, 1046)

In [181]:
# Independant variables are our molecular descriptors (~1000  descriptors)
X_train = combined_descriptors.iloc[:, 38:].values
# Dependant variables are our perceptual descriptors (~20 smell categories)
Y_train = combined_descriptors.iloc[:, 15:18].values

    When there is no correlation between the outputs, a very simple way to solve this kind of problem is to build n independent models, i.e. one for each output, and then to use those models to independently predict each one of the n outputs. Training an inductive classifier or regression model can be a time consuming task — particularly so when training data sets are very large. When multiple models need to be trained using the same input data — but with different output data — time consumption can quickly get out of hand...
    
    More importantly, when the prediction tasks are related (i.e., there is a correlation or covariance between output values), traning a coherent multi-output model can potentially bring benefits in the form of increased predictive performance compared to training multiple disjoint models (Evgeniouand Pontil, 2004).

[cite - http://bada.hb.se/bitstream/2320/12407/1/2013MAGI04.pdf]. 

Multiplie single output models was the approach that the winning DREAM challenge teams had taken. At least one team attempted multi output regression [cite].


## Training

In [183]:
# Perform multi output random forrest classification - training on our training data

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=200)




In [184]:
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [198]:
# Most important descriptive feature value
print(clf.feature_importances_.max())
X_train.shape
Y_train.shape

0.005495577069020914


(41650, 1008)

(41650, 3)

## Load in our test data

In [186]:
test_zero_imputation = pd.read_pickle(path_to_transformed_data + "test_zero_imputation.zip")

# Convert to dichotomous matrix for multi label classifcation
test_zero_imputation.iloc[:, 15:38] = test_zero_imputation.iloc[:, 15:38].applymap(lambda x: 
                                                                              1 if x > 0 else 0)

In [187]:
print("Molecular Descriptors: (rows, cols) => " + molecular_descriptors.shape.__str__())
print("Test Perceptual Descriptors: (rows, cols) => " + test_zero_imputation.shape.__str__())

Molecular Descriptors: (rows, cols) => (476, 1009)
Test Perceptual Descriptors: (rows, cols) => (6762, 38)


In [188]:
type(test_zero_imputation['CID'][0])
# Convert column to string 
test_zero_imputation.CID = test_zero_imputation.CID.apply(str)
type(test_zero_imputation['CID'][0])

numpy.int64

str

In [189]:
combined_test_descriptors = pd.merge(test_zero_imputation, molecular_descriptors, on='CID')

In [199]:
X_test = combined_test_descriptors.iloc[:, 38:].values
Y_test = combined_test_descriptors.iloc[:, 15:18].values
X_test.shape
Y_test.shape
Y_predictions.shape

(6762, 1008)

(6762, 3)

(6762, 3)

In [194]:
Y_predictions = clf.predict(X_test)

In [200]:
from sklearn.metrics import f1_score

f1_score(Y_test, Y_predictions, average=None)

  'precision', 'predicted', average, warn_for)


array([0., 0., 0.])

In [None]:
semantic_labels = list(zero_imputation.iloc[:, 18:38].keys())

In [202]:
# Only making predictions for sweet
pd.DataFrame(Y_predictions).sum()
pd.DataFrame(Y_test).sum()
pd.DataFrame(Y_train, columns=semantic_labels).sum()

0    0.0
1    0.0
2    0.0
dtype: float64

0    4912
1    4898
2    4756
dtype: int64

ValueError: Shape of passed values is (3, 41650), indices imply (20, 41650)