In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import itertools

In [2]:
#Turn off warnings for a more plain notebook
import warnings
warnings.filterwarnings("ignore")

* Read the data and organize column names

In [3]:
df = pd.read_csv('Musk1.csv',header=None)
df.columns = ['Feature '+str(x-1) for x in df.columns]
df.rename(columns={'Feature -1':'Bag class','Feature 0':'Bag Id'},inplace=True)
print(df.shape)
df.head()

(476, 168)


Unnamed: 0,Bag class,Bag Id,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,...,Feature 157,Feature 158,Feature 159,Feature 160,Feature 161,Feature 162,Feature 163,Feature 164,Feature 165,Feature 166
0,1,1,42,-198,-109,-75,-117,11,23,-88,...,-238,-74,-129,-120,-38,30,48,-37,6,30
1,1,1,42,-191,-142,-65,-117,55,49,-170,...,-238,-302,60,-120,-39,31,48,-37,5,30
2,1,1,42,-191,-142,-75,-117,11,49,-161,...,-238,-73,-127,-120,-38,30,48,-37,5,31
3,1,1,42,-198,-110,-65,-117,55,23,-95,...,-238,-302,60,-120,-39,30,48,-37,6,30
4,1,2,42,-198,-102,-75,-117,10,24,-87,...,-238,-73,-127,51,128,144,43,-30,14,26


* Check positive ratio

Note that it is said that if at least one instance in a bag is positive then the bag has a positive label. So I take the maximum of Bag Class

In [4]:
target_df = df.groupby('Bag Id').agg({'Bag class':'max'})
target_df.value_counts()

Bag class
1            47
0            45
dtype: int64

* Check if there exist null values

In [5]:
df.isnull().any()[df.isnull().any()==True]

Series([], dtype: bool)

* Check the distribution

In [6]:
df.groupby('Bag Id').describe()

Unnamed: 0_level_0,Bag class,Bag class,Bag class,Bag class,Bag class,Bag class,Bag class,Bag class,Feature 1,Feature 1,...,Feature 165,Feature 165,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Bag Id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,42.000000,...,6.00,6.0,4.0,30.250000,0.500000,30.0,30.00,30.0,30.25,31.0
2,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,42.000000,...,14.00,14.0,4.0,26.500000,1.732051,25.0,25.75,26.0,26.75,29.0
3,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,42.000000,...,7.50,13.0,2.0,16.500000,58.689863,-25.0,-4.25,16.5,37.25,58.0
4,3.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,42.666667,...,7.00,7.0,3.0,32.333333,5.507571,27.0,29.50,32.0,35.00,38.0
5,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,61.500000,...,2.00,14.0,4.0,36.750000,66.339405,-27.0,-12.75,29.0,78.50,116.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,4.937500,...,30.00,37.0,16.0,32.875000,79.796303,-72.0,-22.25,23.5,120.25,124.0
89,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,23.500000,...,-141.00,-141.0,4.0,65.000000,13.904436,52.0,53.50,65.0,76.50,78.0
90,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,24.500000,...,-144.25,-142.0,4.0,65.500000,11.387127,52.0,59.50,65.5,71.50,79.0
91,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,38.000000,...,-25.50,-10.0,3.0,12.000000,79.774683,-38.0,-34.00,-30.0,37.00,104.0


### Task

* In this task we are asked to suggest two alternative bag-level represenations. I will use 3 alternatives:
    * Alternative 1: As a very simple approach, I will represent bags with statistics of features (count, mean, standard deviation, maximum, minimum and percentiles). Although this is a simple approach and it is mentioned that we should not consider just taking the mean of features as an alternative, I wonder this approach's performance comparing to other alternatives. Here no parameters are required. 
    * Alternative 2: k-means two class-encoding studied in [1] because it is mentioned that BoW representations are successfull and this is an extended version to them. I will tune k as a hyper-parameter.
    * Alternative 3: Random tree (RT) encoding with terminal node representation explained in [1] because it is mentioned as robust, fast and producing competitive results. I just made a small modification to simplify the coding. When constructing the random tree, I will use a single randomly selected feature in a layer. Therefore, depth of the tree means the number of different features used in the tree.  I will tune depth of the tree and number of trees as hyper-parameters.
    
* Then, I will use two different classifiers and measure the performance. I will use the accuracy based on 10-fold cross-validation on the training data as suggested. 
    * Classifier 1: Regularized logistic regression will be used because it is a simple classifier and may perform good when linear decision boundaries exist. I will tune penalty coefficient.
    * Classifier 2: Random forest will be used because it can learn non-linear decision boundaries and performs well in binary classification problems. Also, it can handle outliers and have a natural feature selection mechanism to overcome curse of dimensionality. I will tune maximum tree depth, number of trees, ratio of features and rows to be used.
    
[1] Küçükaşcı, Emel Şeyma,and Mustafa Gökçe Baydoğan. "Bag encoding strategies in multiple instance learning problems."Information Sciences 467 (2018): 559-578.]

### Form 10 folds for cross validation

Same folds should be used in all models in order to have healty comparison between the alternatives.

In [7]:
target_df['rand'] = np.random.rand(target_df.shape[0])
target_df.sort_values(by='rand',inplace=True)
target_df['rank'] = target_df.groupby('Bag class').cumcount()+1

We have 45 bags in class 0 and 47 bags in class 1. So we will have 3 folds with 4 class 0 and 4 class 1 instances + 2 folds with 4 class 0 and 5 class 1 instances + 5 folds with 5 class 0 and 5 class 1 instances 

In [8]:
def class0_folds(x):
    y = 0
    if x<=4:
        y = 1
    elif x<=8:
        y = 2
    elif x<=12:
        y = 3
    elif x<=16:
        y = 4
    elif x<=20:
        y = 5
    elif x<=25:
        y = 6
    elif x<=30:
        y = 7
    elif x<=35:
        y = 8
    elif x<=40:
        y = 9
    elif x<=45:
        y = 10
    return y

def class1_folds(x):
    y = 0
    if x<=4:
        y = 1
    elif x<=8:
        y = 2
    elif x<=12:
        y = 3
    elif x<=17:
        y = 4
    elif x<=22:
        y = 5
    elif x<=27:
        y = 6
    elif x<=32:
        y = 7
    elif x<=37:
        y = 8
    elif x<=42:
        y = 9
    elif x<=47:
        y = 10
    return y

In [9]:
target_df['class0_fold'] = target_df['rank'].apply(lambda x: class0_folds(x))
target_df['class1_fold'] = target_df['rank'].apply(lambda x: class1_folds(x))
target_df['fold'] = np.where(target_df['Bag class']==0,target_df['class0_fold'],target_df['class1_fold'])
target_df.drop(['rand','rank','class0_fold','class1_fold'],axis=1,inplace=True)

In [10]:
target_df.groupby('fold').agg({'Bag class':['count','sum',np.mean]})

Unnamed: 0_level_0,Bag class,Bag class,Bag class
Unnamed: 0_level_1,count,sum,mean
fold,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,8,4,0.5
2,8,4,0.5
3,8,4,0.5
4,9,5,0.555556
5,9,5,0.555556
6,10,5,0.5
7,10,5,0.5
8,10,5,0.5
9,10,5,0.5
10,10,5,0.5


In [11]:
target_df = pd.merge(target_df.reset_index(),target_df.groupby('fold').agg({'Bag class':['count']}),how='left',on='fold')
target_df.set_index('Bag Id',inplace=True)
target_df.columns = ['Bag class','fold','fold_count']
target_df.head()

Unnamed: 0_level_0,Bag class,fold,fold_count
Bag Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51,0,1,8
86,0,1,8
58,0,1,8
61,0,1,8
31,1,1,8


In [12]:
target_df.sort_index(inplace=True)

### Alternative 1

* Calculate the statistics

In [13]:
statistics_df = df.groupby('Bag Id').describe(percentiles=[.01,.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.99]).T.reset_index()
statistics_df

Bag Id,level_0,level_1,1,2,3,4,5,6,7,8,...,83,84,85,86,87,88,89,90,91,92
0,Bag class,count,4.00,4.00,2.00,3.00,4.0,2.00,2.00,2.00,...,2.0,2.00,2.0,2.00,4.00,16.0,4.00,4.0,3.00,8.00
1,Bag class,mean,1.00,1.00,1.00,1.00,1.0,1.00,1.00,1.00,...,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.00
2,Bag class,std,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.00
3,Bag class,min,1.00,1.00,1.00,1.00,1.0,1.00,1.00,1.00,...,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.00
4,Bag class,1%,1.00,1.00,1.00,1.00,1.0,1.00,1.00,1.00,...,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3001,Feature 166,80%,30.40,27.20,41.40,35.60,86.0,65.20,66.40,30.20,...,51.0,40.60,26.0,27.80,90.60,121.0,76.80,73.0,50.40,87.20
3002,Feature 166,90%,30.70,28.10,49.70,36.80,101.0,73.10,74.70,31.60,...,62.0,48.80,26.0,27.90,91.80,121.5,77.40,76.0,77.20,96.30
3003,Feature 166,95%,30.85,28.55,53.85,37.40,108.5,77.05,78.85,32.30,...,67.5,52.90,26.0,27.95,92.40,122.5,77.70,77.5,90.60,96.65
3004,Feature 166,99%,30.97,28.91,57.17,37.88,114.5,80.21,82.17,32.86,...,71.9,56.18,26.0,27.99,92.88,123.7,77.94,78.7,101.32,96.93


* Here statistics of target variable should be dropped
* Also counts are the same for all features, it is enough the keep only one of them

In [14]:
statistics_df['flter_ind_1'] = np.where((statistics_df['level_0']=='Bag class')&(statistics_df['level_1']!='count'),1,0)
statistics_df['flter_ind_2'] = np.where((statistics_df['level_0']!='Bag class')&(statistics_df['level_1']=='count'),1,0)
statistics_df = statistics_df[(statistics_df['flter_ind_1']==0)&(statistics_df['flter_ind_2']==0)]
statistics_df.drop(['flter_ind_1','flter_ind_2'],axis=1,inplace=True)
statistics_df.head()

Bag Id,level_0,level_1,1,2,3,4,5,6,7,8,...,83,84,85,86,87,88,89,90,91,92
0,Bag class,count,4.0,4.0,2.0,3.0,4.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,4.0,16.0,4.0,4.0,3.0,8.0
19,Feature 1,mean,42.0,42.0,42.0,42.666667,61.5,46.5,47.5,41.5,...,41.0,42.0,42.0,41.5,73.25,4.9375,23.5,24.5,38.0,44.375
20,Feature 1,std,0.0,0.0,2.828427,2.081666,19.807406,0.707107,0.707107,2.12132,...,2.828427,2.828427,0.0,0.707107,38.681391,11.635972,5.196152,4.932883,7.81025,5.853875
21,Feature 1,min,42.0,42.0,40.0,41.0,41.0,46.0,47.0,40.0,...,39.0,40.0,42.0,41.0,43.0,-7.0,19.0,19.0,33.0,38.0
22,Feature 1,1%,42.0,42.0,40.04,41.02,41.27,46.01,47.01,40.03,...,39.04,40.04,42.0,41.01,43.48,-7.0,19.0,19.09,33.02,38.07


* Organize the bag features of alternative 1

In [15]:
bag_features_1_df = statistics_df.T
bag_features_1_df.head()

Unnamed: 0_level_0,0,19,20,21,22,23,24,25,26,27,...,2996,2997,2998,2999,3000,3001,3002,3003,3004,3005
Bag Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
level_0,Bag class,Feature 1,Feature 1,Feature 1,Feature 1,Feature 1,Feature 1,Feature 1,Feature 1,Feature 1,...,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166,Feature 166
level_1,count,mean,std,min,1%,5%,10%,20%,30%,40%,...,30%,40%,50%,60%,70%,80%,90%,95%,99%,max
1,4.0,42.0,0.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,30.0,30.0,30.0,30.0,30.1,30.4,30.7,30.85,30.97,31.0
2,4.0,42.0,0.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,25.9,26.0,26.0,26.0,26.3,27.2,28.1,28.55,28.91,29.0
3,2.0,42.0,2.828427,40.0,40.04,40.2,40.4,40.8,41.2,41.6,...,-0.1,8.2,16.5,24.8,33.1,41.4,49.7,53.85,57.17,58.0


In [16]:
bag_features_1_df = bag_features_1_df[~bag_features_1_df.index.isin(['level_0','level_1'])]
bag_features_1_df.columns = ['F_'+str(x) for x in range(1,bag_features_1_df.shape[1]+1)]
bag_features_1_df.head()

Unnamed: 0_level_0,F_1,F_2,F_3,F_4,F_5,F_6,F_7,F_8,F_9,F_10,...,F_2814,F_2815,F_2816,F_2817,F_2818,F_2819,F_2820,F_2821,F_2822,F_2823
Bag Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,42.0,0.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,30.0,30.0,30.0,30.0,30.1,30.4,30.7,30.85,30.97,31.0
2,4.0,42.0,0.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,25.9,26.0,26.0,26.0,26.3,27.2,28.1,28.55,28.91,29.0
3,2.0,42.0,2.828427,40.0,40.04,40.2,40.4,40.8,41.2,41.6,...,-0.1,8.2,16.5,24.8,33.1,41.4,49.7,53.85,57.17,58.0
4,3.0,42.666667,2.081666,41.0,41.02,41.1,41.2,41.4,41.6,41.8,...,30.0,31.0,32.0,33.2,34.4,35.6,36.8,37.4,37.88,38.0
5,4.0,61.5,19.807406,41.0,41.27,42.35,43.7,46.4,49.1,54.0,...,-9.9,6.8,29.0,51.2,71.0,86.0,101.0,108.5,114.5,116.0


* Train the logistic regression for 10 folds and grid search for alternative parameters

In [17]:
def lr_with_grid_seach(features_df,target_and_fold_df):

    perf_df = pd.DataFrame()

    for fold in range(1,11):
        for penalty in [0.01,0.1,1,2,5,10]:

            X_train = features_df[features_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']!=fold].index))]
            X_test = features_df[features_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']==fold].index))]

            y_train = target_and_fold_df[target_and_fold_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']!=fold].index))][['Bag class']]
            y_test = target_and_fold_df[target_and_fold_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']==fold].index))][['Bag class']]

            X_train.sort_index(inplace=True)
            X_test.sort_index(inplace=True)

            y_train.sort_index(inplace=True)
            y_test.sort_index(inplace=True)

            lr = LogisticRegression(penalty='l2',C=penalty)
            lr.fit(X_train,y_train)

            pred_test = pd.merge(y_test,pd.DataFrame(lr.predict(X_test),columns=['prediction'],index=X_test.index),how='left',left_index=True,right_index=True)
            accuracy = accuracy_score(pred_test['Bag class'],pred_test['prediction'])

            perf_df = pd.concat([perf_df,pd.DataFrame({'fold':[fold],'Penalty coefficient':[penalty],'Test Accuracy':[accuracy]})])
    return perf_df

In [18]:
perf_df_lr_1 = lr_with_grid_seach(bag_features_1_df,target_df)

Calculate overall accuracy of 10-folds for different parameters

In [19]:
# We need number of instances in a fold
perf_df_lr_1 = pd.merge(perf_df_lr_1,target_df.groupby('fold').agg({'Bag class':['count']}),how='left',on='fold')
perf_df_lr_1.columns = list(perf_df_lr_1.columns[:-1])+['instance count']

perf_df_lr_1['number_of_correct_classifications'] = perf_df_lr_1['Test Accuracy']*perf_df_lr_1['instance count']
perf_df_lr_1 = perf_df_lr_1.groupby('Penalty coefficient').agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_lr_1['Test Accuracy over 10 Folds'] = perf_df_lr_1['number_of_correct_classifications']/perf_df_lr_1['instance count']
perf_df_lr_1.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
Penalty coefficient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.01,92,80.0,0.869565
0.1,92,80.0,0.869565
1.0,92,80.0,0.869565
2.0,92,80.0,0.869565
5.0,92,80.0,0.869565
10.0,92,80.0,0.869565


* Best parameter is 0.01 as penalty coefficient for L2-regularized logistic regression with alternative 1 features
* Accuracy over 10 folds: 0.869565

* Train the random forest for 10 folds and grid search for alternative parameters

In [20]:
def rf_with_grid_seach(features_df,target_and_fold_df):

    perf_df = pd.DataFrame()

    for fold in range(1,11):
        for max_d in [3,6]:
            for n_est in [50,100]:
                for max_f in [0.5,0.8]:
                    for max_s in [0.5,0.8]:

                        X_train = features_df[features_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']!=fold].index))]
                        X_test = features_df[features_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']==fold].index))]

                        y_train = target_and_fold_df[target_and_fold_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']!=fold].index))][['Bag class']]
                        y_test = target_and_fold_df[target_and_fold_df.index.isin(list(target_and_fold_df[target_and_fold_df['fold']==fold].index))][['Bag class']]

                        X_train.sort_index(inplace=True)
                        X_test.sort_index(inplace=True)

                        y_train.sort_index(inplace=True)
                        y_test.sort_index(inplace=True)

                        rf = RandomForestClassifier(max_depth=max_d,n_estimators=n_est,max_features=max_f,max_samples=max_s)
                        rf.fit(X_train,y_train)

                        pred_test = pd.merge(y_test,pd.DataFrame(rf.predict(X_test),columns=['prediction'],index=X_test.index),how='left',left_index=True,right_index=True)
                        accuracy = accuracy_score(pred_test['Bag class'],pred_test['prediction'])

                        perf_df = pd.concat([perf_df,pd.DataFrame({'fold':[fold],'maximum tree depth':[max_d],'number of trees':[n_est],'ratio of features in a tree':[max_f],'ratio of rows in a tree':[max_s],'Test Accuracy':[accuracy]})])
    return perf_df

In [21]:
perf_df_rf_1 = rf_with_grid_seach(bag_features_1_df,target_df)

Calculate overall accuracy of 10-folds for different parameters

In [22]:
perf_df_rf_1 = pd.merge(perf_df_rf_1,target_df.groupby('fold').agg({'Bag class':['count']}),how='left',on='fold')
perf_df_rf_1.columns = list(perf_df_rf_1.columns[:-1])+['instance count']

perf_df_rf_1['number_of_correct_classifications'] = perf_df_rf_1['Test Accuracy']*perf_df_rf_1['instance count']
perf_df_rf_1 = perf_df_rf_1.groupby(['maximum tree depth','number of trees','ratio of features in a tree','ratio of rows in a tree']).agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_rf_1['Test Accuracy over 10 Folds'] = perf_df_rf_1['number_of_correct_classifications']/perf_df_rf_1['instance count']
perf_df_rf_1.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
maximum tree depth,number of trees,ratio of features in a tree,ratio of rows in a tree,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,50,0.5,0.8,92,78.0,0.847826
6,100,0.5,0.8,92,78.0,0.847826
3,50,0.5,0.8,92,77.0,0.836957
3,100,0.8,0.8,92,77.0,0.836957
6,100,0.5,0.5,92,77.0,0.836957
3,50,0.8,0.8,92,76.0,0.826087
6,100,0.8,0.8,92,76.0,0.826087
3,50,0.5,0.5,92,75.0,0.815217
3,100,0.8,0.5,92,75.0,0.815217
6,50,0.8,0.5,92,74.0,0.804348


* Best parameter set is as follows for random forest with alternative 1 features
    * maximum tree depth = 6
    * number of trees	= 50
    * ratio of features in a tree	= 0.5
    * ratio of rows in a tree	= 0.8		

* Accuracy over 10 folds: 0.847826

### Alternative 2

* I will fit two KMeans: one for class 1 other for class 0
* At the end, I will have 2*k features for a bag
* I will tune for k

In [23]:
def bag_representation_two_stage_kmeans(df,k):

    kmeans_0 = KMeans(n_clusters=k)
    kmeans_1 = KMeans(n_clusters=k)

    features_0 = df[df['Bag class']==0].drop(['Bag Id','Bag class'],axis=1).copy()
    features_1 = df[df['Bag class']==1].drop(['Bag Id','Bag class'],axis=1).copy()

    kmeans_0.fit(features_0)
    kmeans_1.fit(features_1)

    kmeans_0_features = pd.DataFrame(kmeans_0.predict(df.drop(['Bag Id','Bag class'],axis=1)), columns = ['Class0_cluster'],index=df.index)
    kmeans_0_features['Class0_cluster'] = kmeans_0_features['Class0_cluster'].astype(str)
    kmeans_0_features = pd.get_dummies(kmeans_0_features)
    kmeans_1_features = pd.DataFrame(kmeans_1.predict(df.drop(['Bag Id','Bag class'],axis=1)), columns = ['Class1_cluster'],index=df.index)
    kmeans_1_features['Class1_cluster'] = kmeans_1_features['Class1_cluster'].astype(str)
    kmeans_1_features = pd.get_dummies(kmeans_1_features)

    kmeans_features = pd.merge(df[['Bag Id']],kmeans_0_features,how='left',left_index=True, right_index=True)
    kmeans_features = pd.merge(kmeans_features,kmeans_1_features,how='left',left_index=True, right_index=True)

    kmeans_features = kmeans_features.groupby('Bag Id').sum() #number of instances in each cluster

    return kmeans_features

* Try for different k and at the same time tuning for logistic regression

In [24]:
perf_df_lr_2 = pd.DataFrame()

for k in [3,6,10,20]:
    kmeans_features = bag_representation_two_stage_kmeans(df,k)
    perf_df_lr_2_tmp = lr_with_grid_seach(kmeans_features,target_df)
    perf_df_lr_2_tmp['K'] = k
    
    perf_df_lr_2 = pd.concat([perf_df_lr_2,perf_df_lr_2_tmp])
    
perf_df_lr_2   

Unnamed: 0,fold,Penalty coefficient,Test Accuracy,K
0,1,0.01,0.500,3
0,1,0.10,0.375,3
0,1,1.00,0.375,3
0,1,2.00,0.375,3
0,1,5.00,0.375,3
...,...,...,...,...
0,10,0.10,1.000,20
0,10,1.00,1.000,20
0,10,2.00,1.000,20
0,10,5.00,0.800,20


Calculate overall accuracy of 10-folds for different parameters

In [25]:
# We need number of instances in a fold
perf_df_lr_2 = pd.merge(perf_df_lr_2,target_df.groupby(['fold']).agg({'Bag class':['count']}),how='left',on='fold')
perf_df_lr_2.columns = list(perf_df_lr_2.columns[:-1])+['instance count']

perf_df_lr_2['number_of_correct_classifications'] = perf_df_lr_2['Test Accuracy']*perf_df_lr_2['instance count']
perf_df_lr_2 = perf_df_lr_2.groupby(['Penalty coefficient','K']).agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_lr_2['Test Accuracy over 10 Folds'] = perf_df_lr_2['number_of_correct_classifications']/perf_df_lr_2['instance count']
perf_df_lr_2.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
Penalty coefficient,K,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.0,20,92,77.0,0.836957
2.0,20,92,75.0,0.815217
1.0,20,92,75.0,0.815217
5.0,20,92,73.0,0.793478
0.1,20,92,70.0,0.76087
10.0,10,92,63.0,0.684783
5.0,10,92,62.0,0.673913
0.01,10,92,60.0,0.652174
1.0,10,92,59.0,0.641304
2.0,10,92,58.0,0.630435


* Best parameter set is as follows for l2-regularized logistic regression with alternative 2 features
    * penalty coefficient = 10.00
    * k = 20	

* Accuracy over 10 folds: 0.836957

* Try for different k and at the same time tuning for random forest

In [26]:
perf_df_rf_2 = pd.DataFrame()

for k in [3,6,10,20]:
    kmeans_features = bag_representation_two_stage_kmeans(df,k)
    perf_df_rf_2_tmp = rf_with_grid_seach(kmeans_features,target_df)
    perf_df_rf_2_tmp['K'] = k
    
    perf_df_rf_2 = pd.concat([perf_df_rf_2,perf_df_rf_2_tmp])
    
perf_df_rf_2   

Unnamed: 0,fold,maximum tree depth,number of trees,ratio of features in a tree,ratio of rows in a tree,Test Accuracy,K
0,1,3,50,0.5,0.5,0.625,3
0,1,3,50,0.5,0.8,0.625,3
0,1,3,50,0.8,0.5,0.625,3
0,1,3,50,0.8,0.8,0.625,3
0,1,3,100,0.5,0.5,0.625,3
...,...,...,...,...,...,...,...
0,10,6,50,0.8,0.8,1.000,20
0,10,6,100,0.5,0.5,0.900,20
0,10,6,100,0.5,0.8,0.900,20
0,10,6,100,0.8,0.5,0.900,20


Calculate overall accuracy of 10-folds for different parameters

In [27]:
perf_df_rf_2 = pd.merge(perf_df_rf_2,target_df.groupby('fold').agg({'Bag class':['count']}),how='left',on='fold')
perf_df_rf_2.columns = list(perf_df_rf_2.columns[:-1])+['instance count']

perf_df_rf_2['number_of_correct_classifications'] = perf_df_rf_2['Test Accuracy']*perf_df_rf_2['instance count']
perf_df_rf_2 = perf_df_rf_2.groupby(['K','maximum tree depth','number of trees','ratio of features in a tree','ratio of rows in a tree']).agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_rf_2['Test Accuracy over 10 Folds'] = perf_df_rf_2['number_of_correct_classifications']/perf_df_rf_2['instance count']
perf_df_rf_2.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
K,maximum tree depth,number of trees,ratio of features in a tree,ratio of rows in a tree,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,6,50,0.8,0.5,92,73.0,0.793478
10,6,50,0.5,0.5,92,73.0,0.793478
20,6,50,0.8,0.8,92,72.0,0.782609
20,6,50,0.5,0.8,92,72.0,0.782609
10,6,50,0.5,0.8,92,72.0,0.782609
...,...,...,...,...,...,...,...
3,6,50,0.5,0.5,92,55.0,0.597826
3,6,100,0.8,0.5,92,55.0,0.597826
3,3,100,0.5,0.8,92,54.0,0.586957
3,3,50,0.8,0.5,92,54.0,0.586957


* Best parameter set is as follows for random forest with alternative 2 features
    * maximum tree depth = 6
    * number of trees	= 50
    * ratio of features in a tree	= 0.8
    * ratio of rows in a tree	= 0.5	
    * k = 10
    
* Accuracy over 10 folds: 0.793478

### Alternative 3

In [36]:
def bag_representation_random_tree(df,target_and_fold_df,tree_count,h):
    features_df = target_and_fold_df.copy()
    for tree in range(1,tree_count+1):
        df_tree = df.copy()
        feature_list = [x for x in df.columns if x not in ['Bag class', 'Bag Id']]

        for iteration in range(1,h+1):
            split_feature = feature_list[np.random.randint(0,len(feature_list))]
            feature_list = [x for x in feature_list if x != split_feature]

            range_max= df[split_feature].max()
            range_min= df[split_feature].min()

            split_value = np.random.randint(range_min,range_max+1)

            df_tree['LEAF_'+str(iteration)] = np.where(df_tree[split_feature]<=split_value,1,0)

        df_tree = df_tree[[x for x in df_tree.columns if (x in ['Bag class','Bag Id']) or (x[0:4]=='LEAF')]]

        df_tree['NodeRepresentation'] = ''

        leaf_list = [x for x in df_tree.columns if x not in ['Bag class', 'Bag Id','NodeRepresentation']]

        for col in leaf_list:
            df_tree['NodeRepresentation'] = df_tree['NodeRepresentation']+df_tree[col].astype(str)

        lst = list(itertools.product([0, 1], repeat=h))
        lst2 = []
        for i in range(0,len(lst)):
            lst_tmp = ''.join(map(str, lst[i]))
            lst2.append(lst_tmp)

        lst3 = list(range(1,len(lst2)+1))
        node_dictionary = {lst2[i]: str(lst3[i]) for i in range(len(lst3))}

        df_tree['Node'] = df_tree['NodeRepresentation'].apply(lambda x: node_dictionary[x])

        df_tree = pd.get_dummies(df_tree[['Bag Id','Node']])
        df_tree = df_tree.groupby('Bag Id').sum()
        df_tree.columns = ['T'+str(tree)+'_'+x for x in df_tree.columns]

        features_df = pd.merge(features_df,df_tree,how='left',left_index=True, right_index=True)
    features_df.drop(['Bag class','fold','fold_count'],axis=1, inplace=True)
    return features_df


* Try for different h and tree count; and at the same time tuning for logistic regression

In [37]:
perf_df_lr_3 = pd.DataFrame()

for tree in [5,10,20]:
    for h in [3,10]:
        rt_features = bag_representation_random_tree(df,target_df,tree,h)
        perf_df_lr_3_tmp = lr_with_grid_seach(rt_features,target_df)
        perf_df_lr_3_tmp['tree count'] = tree
        perf_df_lr_3_tmp['feature count (tree depth)'] = h

        perf_df_lr_3 = pd.concat([perf_df_lr_3,perf_df_lr_3_tmp])

perf_df_lr_3   

Unnamed: 0,fold,Penalty coefficient,Test Accuracy,tree count,feature count (tree depth)
0,1,0.01,0.625,5,3
0,1,0.10,0.625,5,3
0,1,1.00,0.750,5,3
0,1,2.00,0.750,5,3
0,1,5.00,0.750,5,3
...,...,...,...,...,...
0,10,0.10,0.900,20,10
0,10,1.00,0.900,20,10
0,10,2.00,0.900,20,10
0,10,5.00,0.900,20,10


Calculate overall accuracy of 10-folds for different parameters

In [38]:
# We need number of instances in a fold
perf_df_lr_3 = pd.merge(perf_df_lr_3,target_df.groupby(['fold']).agg({'Bag class':['count']}),how='left',on='fold')
perf_df_lr_3.columns = list(perf_df_lr_3.columns[:-1])+['instance count']

perf_df_lr_3['number_of_correct_classifications'] = perf_df_lr_3['Test Accuracy']*perf_df_lr_3['instance count']
perf_df_lr_3 = perf_df_lr_3.groupby(['Penalty coefficient','tree count','feature count (tree depth)']).agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_lr_3['Test Accuracy over 10 Folds'] = perf_df_lr_3['number_of_correct_classifications']/perf_df_lr_3['instance count']
perf_df_lr_3.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
Penalty coefficient,tree count,feature count (tree depth),Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.1,20,10,92,78.0,0.847826
10.0,20,10,92,77.0,0.836957
10.0,5,10,92,77.0,0.836957
2.0,20,10,92,77.0,0.836957
5.0,20,10,92,77.0,0.836957
1.0,20,10,92,77.0,0.836957
0.01,20,10,92,76.0,0.826087
2.0,5,10,92,76.0,0.826087
5.0,5,10,92,76.0,0.826087
1.0,5,10,92,75.0,0.815217


* Best parameter set is as follows for l2-regularized logistic regression with alternative 3 features
    * penalty coefficient = 0.10
    * tree count = 20
    * feature count (tree depth) = 10

* Accuracy over 10 folds: 0.847826

* Try for different h and tree count; and at the same time tuning for random forest

In [39]:
perf_df_rf_3 = pd.DataFrame()

for tree in [5,10,20]:
    for h in [3,10]:
        rt_features = bag_representation_random_tree(df,target_df,tree,h)
        perf_df_rf_3_tmp = rf_with_grid_seach(rt_features,target_df)
        perf_df_rf_3_tmp['tree count'] = tree
        perf_df_rf_3_tmp['feature count (tree depth)'] = h

        perf_df_rf_3 = pd.concat([perf_df_rf_3,perf_df_rf_3_tmp])

perf_df_rf_3   

Unnamed: 0,fold,maximum tree depth,number of trees,ratio of features in a tree,ratio of rows in a tree,Test Accuracy,tree count,feature count (tree depth)
0,1,3,50,0.5,0.5,0.750,5,3
0,1,3,50,0.5,0.8,0.875,5,3
0,1,3,50,0.8,0.5,0.625,5,3
0,1,3,50,0.8,0.8,0.750,5,3
0,1,3,100,0.5,0.5,0.625,5,3
...,...,...,...,...,...,...,...,...
0,10,6,50,0.8,0.8,0.800,20,10
0,10,6,100,0.5,0.5,0.900,20,10
0,10,6,100,0.5,0.8,0.800,20,10
0,10,6,100,0.8,0.5,0.800,20,10


Calculate overall accuracy of 10-folds for different parameters

In [40]:
perf_df_rf_3 = pd.merge(perf_df_rf_3,target_df.groupby('fold').agg({'Bag class':['count']}),how='left',on='fold')
perf_df_rf_3.columns = list(perf_df_rf_3.columns[:-1])+['instance count']

perf_df_rf_3['number_of_correct_classifications'] = perf_df_rf_3['Test Accuracy']*perf_df_rf_3['instance count']
perf_df_rf_3 = perf_df_rf_3.groupby(['tree count','feature count (tree depth)','maximum tree depth','number of trees','ratio of features in a tree','ratio of rows in a tree']).agg({'instance count':'sum','number_of_correct_classifications':'sum'})
perf_df_rf_3['Test Accuracy over 10 Folds'] = perf_df_rf_3['number_of_correct_classifications']/perf_df_rf_3['instance count']
perf_df_rf_3.sort_values(by='Test Accuracy over 10 Folds',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,instance count,number_of_correct_classifications,Test Accuracy over 10 Folds
tree count,feature count (tree depth),maximum tree depth,number of trees,ratio of features in a tree,ratio of rows in a tree,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,3,6,50,0.8,0.8,92,80.0,0.869565
10,3,6,100,0.5,0.8,92,79.0,0.858696
10,3,6,100,0.8,0.8,92,78.0,0.847826
10,10,3,50,0.5,0.5,92,77.0,0.836957
10,3,3,50,0.8,0.5,92,77.0,0.836957
...,...,...,...,...,...,...,...,...
20,3,3,50,0.8,0.8,92,68.0,0.739130
5,3,6,50,0.5,0.5,92,68.0,0.739130
5,3,3,50,0.8,0.5,92,68.0,0.739130
5,10,3,100,0.5,0.8,92,67.0,0.728261


* Best parameter set is as follows for random forest with alternative 2 features
    * maximum tree depth = 6
    * number of trees	= 50
    * ratio of features in a tree	= 0.8
    * tree count = 10
    * feature count (tree depth) = 3
    
* Accuracy over 10 folds: 0.869565

### Comments

Summarize 10-fold test accuracies:

* Alternative 1 & Classifier 1: 0.869565
* Alternative 1 & Classifier 2: 0.847826
* Alternative 2 & Classifier 1: 0.836957
* Alternative 2 & Classifier 2: 0.793478
* Alternative 3 & Classifier 1: 0.847826
* Alternative 3 & Classifier 2: 0.869565

* Both Alternative 1 & Classifier 1 and Alternative 3 and Classifier 2 have the highest accuracy. 
* It is interesting because Alternative 1 & Classifier 1 is the simplest approach and Alternative 3 and Classifier 2 is the most complex one and both perform the same. Normally, I expect that Alternative 3 & Classifier 2 performs better. This may be caused by that sample size of the data set is very small as there are only 92 bags. 