In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle/ tree (not only /kaggle/input) and print the full path
# of every file found, so the dataset locations can be copied from the output.
for dirname, _, filenames in os.walk('/kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/lib/kaggle/gcp.py
/kaggle/input/datacontest-prml/Dataset_1_Training.csv
/kaggle/input/datacontest-prml/Dataset_2_Testing.csv
/kaggle/input/datacontest-prml/Dataset_1_Testing.csv
/kaggle/input/datacontest-prml/Dataset_2_Training.csv
/kaggle/input/prml-data-contest-jul-2021-rb-section/dummy_submission.csv
/kaggle/working/__notebook_source__.ipynb


In [3]:
# dataset locations: all four contest CSVs live in the same Kaggle input folder
_input_dir = '/kaggle/input/datacontest-prml'
train1_path = f'{_input_dir}/Dataset_1_Training.csv'
train2_path = f'{_input_dir}/Dataset_2_Training.csv'
test1_path = f'{_input_dir}/Dataset_1_Testing.csv'
test2_path = f'{_input_dir}/Dataset_2_Testing.csv'

# random seed handed to the CV splitter and to the model constructors below
seed = 1

In [4]:
def load_data(path):
    """Read a contest CSV and return it transposed.

    The first column of the file is discarded; the remaining columns become
    the rows of the result, and the file's rows become its columns (labelled
    with the default 0..n-1 row numbers assigned by ``pd.read_csv``).
    """
    return pd.read_csv(path).iloc[:, 1:].T

def load_train_data(path, till, lstart=1):
    """Load a training CSV and split it into features and labels.

    Parameters
    ----------
    path : str
        CSV file passed to ``load_data``.
    till : int
        Number of leading columns (after transposition) treated as features;
        every column from position ``till`` onwards is a label.
    lstart : int, default 1
        Number given to the first label column, which is renamed to
        ``CO<lstart>``; the following ones continue the sequence.

    Returns
    -------
    (X, Y) : two DataFrames with a fresh 0..n-1 row index.
    """
    samples = load_data(path)
    features = samples.iloc[:, :till].reset_index(drop=True)
    targets = samples.iloc[:, till:].reset_index(drop=True)
    # Label columns still carry their integer positions till, till+1, ...
    label_names = {
        till + offset: f'CO{lstart + offset}'
        for offset in range(targets.shape[1])
    }
    return features, targets.rename(columns=label_names)

def load_test_data(path):
    """Load a test CSV via ``load_data`` and renumber its rows 0..n-1."""
    return load_data(path).reset_index(drop=True)

In [5]:
def normalize(X, std=True):
    """Centre every column of ``X`` on zero and optionally scale it to unit spread.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Samples in rows, features in columns.
    std : bool, default True
        When true, each centred column is divided by its standard deviation.

    Returns
    -------
    Object of the same kind as ``X``, centred (and scaled when ``std`` is true).
    Constant columns come back as all zeros instead of NaN.
    """
    centered = X - X.mean(axis=0)
    if not std:
        return centered
    # axis=0 keeps the scaling per column for ndarrays too (a bare .std() would
    # collapse an ndarray to a single global value).
    spread = centered.std(axis=0)
    # A constant column has zero spread; dividing by it would turn the column
    # into NaN, so such columns are divided by 1 and simply stay at zero.
    safe_spread = np.where(np.asarray(spread) == 0, 1.0, spread)
    return centered / safe_spread

In [6]:
def class_balance(X, Y, seed=1):
    """Oversample minority classes until every class is as large as the biggest.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; its index must match the index of ``Y``.
    Y : pandas.Series
        Class label per row. Labels must be convertible with ``int()`` because
        they are mixed into the per-class sampling seed.
    seed : int, default 1
        Base seed for the with-replacement draws and for the final shuffle.

    Returns
    -------
    (x, y) : the original rows plus the extra draws, shuffled, with a fresh
    0..n-1 index. Row ``i`` of ``x`` always belongs to label ``i`` of ``y``.
    """
    largest = Y.value_counts().max()
    x_parts, y_parts = [X], [Y]
    for cl, grp in X.groupby(Y):
        deficit = largest - len(grp)
        if deficit > 0:
            x_parts.append(grp.sample(deficit, replace=True, random_state=int(cl) + seed))
            y_parts.append(pd.Series([cl] * deficit))

    x_all = pd.concat(x_parts).reset_index(drop=True)
    y_all = pd.concat(y_parts).reset_index(drop=True)
    # Shuffle once and reuse the resulting row order for the labels, so the
    # pairing holds by construction instead of relying on two separate
    # .sample() calls happening to draw the same permutation.
    x_shuffled = x_all.sample(frac=1, random_state=seed)
    y_shuffled = y_all.loc[x_shuffled.index]
    return x_shuffled.reset_index(drop=True), y_shuffled.reset_index(drop=True)

In [7]:
# Training Dataset 1: the first 22283 columns are features, the rest are labels CO1-CO2.
# (No normalisation happens here; scaling is chosen per model through the
# max_scale / range_scale / std_scale switches.)
X1, Y1 = load_train_data(train1_path, till=22283)
CO1, CO2 = (Y1[label] for label in ('CO1', 'CO2'))

# Training Dataset 2: the first 54675 columns are features, the rest are labels CO3-CO6.
X2, Y2 = load_train_data(train2_path, till=54675, lstart=3)
CO3, CO4, CO5, CO6 = (Y2[label] for label in ('CO3', 'CO4', 'CO5', 'CO6'))

# Unlabelled test sets for Dataset 1 and Dataset 2
X1_test = load_test_data(test1_path)
X2_test = load_test_data(test2_path)

In [8]:
from sklearn.feature_selection import VarianceThreshold

def do_variance_thres(train, test, th=0.1):
    """Filter features with ``VarianceThreshold(threshold=th)`` fitted on ``train`` only.

    The same selection is applied to ``test``. Both results are new DataFrames
    that keep the input row index but get fresh 0..k-1 column labels.
    """
    selector = VarianceThreshold(threshold=th).fit(train)
    kept_train, kept_test = (
        pd.DataFrame(data=selector.transform(part), index=part.index)
        for part in (train, test)
    )
    return kept_train, kept_test

In [9]:
from sklearn.feature_selection import SelectKBest

def do_univariate(xtrain, xtest, ytrain, n_features=10):
    """Keep the ``n_features`` best features according to ``SelectKBest``.

    The selector uses its default scoring function and is fitted on
    ``xtrain``/``ytrain`` only; the same columns are then taken from ``xtest``.
    Both results keep the input row index but get fresh 0..k-1 column labels.
    """
    selector = SelectKBest(k=n_features).fit(xtrain, ytrain)
    best_train, best_test = (
        pd.DataFrame(data=selector.transform(part), index=part.index)
        for part in (xtrain, xtest)
    )
    return best_train, best_test

In [10]:
## Preprocessing
from sklearn.decomposition import PCA

def do_pca(train, test, n=10):
    """Project ``train`` and ``test`` onto the principal components of ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature matrices with identical columns.
    n : int or None, default 10
        Number of components to keep; ``None`` keeps all of them.

    Returns
    -------
    (new_train, new_test) : DataFrames of component scores that keep the input
    row index.
    """
    pca = PCA(n_components=n, whiten=True, random_state=seed).fit(train)
    # Use the fitted transformer rather than a bare dot product with
    # components_: transform() subtracts the training mean and applies the
    # whitening requested above, both of which the manual projection skipped.
    new_train = pd.DataFrame(data=pca.transform(train), index=train.index)
    new_test = pd.DataFrame(data=pca.transform(test), index=test.index)
    return (new_train, new_test)

In [11]:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

def do_max_abs_scaling(train, test):
    """Centre both sets on the training mean, then apply ``MaxAbsScaler``.

    The scaler is fitted on the centred training data only. Both results keep
    the input row index but get fresh 0..k-1 column labels.
    """
    train_mean = train.mean(axis=0)
    centred_train, centred_test = train - train_mean, test - train_mean
    scaler = MaxAbsScaler().fit(centred_train)
    scaled_train = pd.DataFrame(data=scaler.transform(centred_train), index=train.index)
    scaled_test = pd.DataFrame(data=scaler.transform(centred_test), index=test.index)
    return scaled_train, scaled_test

def do_min_max_scaling(train, test):
    """Apply a ``MinMaxScaler`` fitted on ``train`` to both sets.

    Both results keep the input row index but get fresh 0..k-1 column labels.
    """
    scaler = MinMaxScaler().fit(train)
    scaled_train, scaled_test = (
        pd.DataFrame(data=scaler.transform(part), index=part.index)
        for part in (train, test)
    )
    return scaled_train, scaled_test

def do_standard(train, test):
    """Apply a ``StandardScaler`` fitted on ``train`` to both sets.

    Both results keep the input row index but get fresh 0..k-1 column labels.
    """
    scaler = StandardScaler().fit(train)
    scaled_train, scaled_test = (
        pd.DataFrame(data=scaler.transform(part), index=part.index)
        for part in (train, test)
    )
    return scaled_train, scaled_test

In [12]:
## Preprocessing
## Very expensive way of feature selection

# from sklearn.feature_selection import SequentialFeatureSelector
# from sklearn.linear_model import LogisticRegression

# def forward_step_selection(model, X, Y):
#     sfs = SequentialFeatureSelector(model, n_features_to_select=0.2, direction='forward', scoring='balanced_accuracy')
#     sfs.fit(X, Y)
#     return sfs

# model = LogisticRegression(random_state=seed, C=1.0)
# sfs = forward_step_selection(model, Xtrain1, Ytrain1.iloc[:,0])

In [13]:
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, matthews_corrcoef

def num_mistakes(pred, expected):
    """Count disagreements between two label sequences.

    Returns ``(misses, total)`` where ``misses`` is the number of positions at
    which ``pred`` and ``expected`` differ (compared pairwise, stopping at the
    shorter sequence) and ``total`` is ``len(expected)``.
    """
    misses = sum(1 for guess, truth in zip(pred, expected) if guess != truth)
    return misses, len(expected)

In [14]:
from sklearn.model_selection import StratifiedKFold

def test_preprocessing(Xtrain, Ytrain, kwargs):
    pca = kwargs.pop('pca', None)
    var = kwargs.pop('var', None)
    uni = kwargs.pop('uni', None)
    max_scale = kwargs.pop('max_scale', False)
    range_scale = kwargs.pop('range_scale', False)
    std_scale = kwargs.pop('std_scale', False)
    Xtest = kwargs.pop('test_data', pd.DataFrame())
    assert not Xtest.empty
    if pca != None:
        num_features = None if pca == -1 else pca # None here means use all features
        Xtrain, Xtest = do_pca(Xtrain, Xtest, num_features)
    if var != None:
        Xtrain, Xtest = do_variance_thres(Xtrain, Xtest, var)
    if uni != None:
        Xtrain, Xtest = do_univariate(Xtrain, Xtest, Ytrain, uni)
    if max_scale:
        Xtrain, Xtest = do_max_abs_scaling(Xtrain, Xtest)
    if range_scale:
        Xtrain, Xtest = do_min_max_scaling(Xtrain, Xtest)
    if std_scale:
        Xtrain, Xtest = do_standard(Xtrain, Xtest)
    return (Xtrain, Xtest)

def with_cross_validation(model_func):
    """Decorator that turns a model factory into an evaluate-or-predict entry point.

    ``model_func(X, Y, *args, **kwargs)`` must fit a model and return
    ``(fitted_model, model_name)``. The decorated function is called as
    ``f(X, Y, label, *args, **kwargs)`` and consumes the keyword options below
    before forwarding the remaining ones to ``model_func``.

    Options
    -------
    test : bool, default False
        False: run 5-fold stratified cross-validation, print the train/test
        MCC of every fold plus their averages, and return ``None``.
        True: fit on all of ``X``/``Y``, print the training MCC and return the
        predictions for ``test_data``.
    rescale : bool, default True
        Oversample minority classes with ``class_balance`` before fitting
        (scores are always computed on the un-resampled rows).
    pca, var, uni, max_scale, range_scale, std_scale
        Preprocessing switches, see ``test_preprocessing``. They are fitted on
        the training part only.
    test_data : DataFrame
        Required when ``test=True``.

    ``label`` is accepted for the caller's bookkeeping and is not used.
    """
    import functools

    prep_keys = ('pca', 'var', 'uni', 'max_scale', 'range_scale', 'std_scale')

    def _fit(Xtrain, Ytrain, rescale, balance_seed, args, kwargs):
        # Fit model_func on the (optionally class-balanced) training rows.
        if rescale:
            fit_X, fit_Y = class_balance(Xtrain, Ytrain, seed=balance_seed)
        else:
            fit_X, fit_Y = Xtrain, Ytrain
        assert fit_X.shape[0] == fit_Y.shape[0]
        return model_func(fit_X, fit_Y, *args, **kwargs)

    @functools.wraps(model_func)
    def wrap(X, Y, label, *args, **kwargs):
        is_test = kwargs.pop('test', False)
        rescale = kwargs.pop('rescale', True)

        if is_test:
            # test_preprocessing pops its own options (including test_data)
            # from kwargs, leaving only the model hyper-parameters behind.
            Xtrain, Xtest = test_preprocessing(X, Y, kwargs)
            model, model_name = _fit(Xtrain, Y, rescale, 20, args, kwargs)
            print(f'{model_name} MCC: {matthews_corrcoef(Y, model.predict(Xtrain))}')
            Ypred = model.predict(Xtest)
            assert Xtest.shape[0] == Ypred.shape[0]
            return Ypred

        # Pull the preprocessing switches out once; every fold gets its own
        # copy because test_preprocessing empties the dict it is given.
        prep = {key: kwargs.pop(key) for key in prep_keys if key in kwargs}
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        train_scores, test_scores = [], []
        for fold, (train_index, test_index) in enumerate(skf.split(X, Y)):
            Xtrain, Ytrain = X.iloc[train_index], Y.iloc[train_index]
            Xtest, Ytest = X.iloc[test_index], Y.iloc[test_index]
            assert Xtrain.shape[0] == Ytrain.shape[0]
            assert Xtest.shape[0] == Ytest.shape[0]

            # Same preprocessing chain as in test mode, fitted on this fold's
            # training part and applied to its held-out part.
            Xtrain, Xtest = test_preprocessing(Xtrain, Ytrain, dict(prep, test_data=Xtest))
            # The fold number doubles as the oversampling seed.
            model, model_name = _fit(Xtrain, Ytrain, rescale, fold, args, kwargs)

            train_mcc = matthews_corrcoef(Ytrain, model.predict(Xtrain))
            test_mcc = matthews_corrcoef(Ytest, model.predict(Xtest))
            train_scores.append(train_mcc)
            test_scores.append(test_mcc)
            print(f'{model_name} Train MCC: {train_mcc}, Test MCC: {test_mcc}')

        print(f'{model_name} Average test MCC Score: {sum(test_scores) / len(test_scores)}')
        print(f'{model_name} Average train MCC Score: {sum(train_scores) / len(train_scores)}')
        print('---------------------------------------------')
    return wrap

In [15]:
# SVM RBF Kernel

from sklearn.svm import SVC

@with_cross_validation
def svm(X, Y, C=1.0, kernel='rbf', degree=3, weight='balanced'):
    """Fit a support-vector classifier and return ``(model, "SVM")``.

    ``C``, ``kernel`` and ``degree`` are passed straight to ``SVC``;
    ``weight`` becomes its ``class_weight``. Because of the decorator the
    notebook calls this as ``svm(X, Y, label, **options)``.
    """
    classifier = SVC(
        C=C,
        kernel=kernel,
        degree=degree,
        class_weight=weight,
        cache_size=300,
        random_state=seed,
    )
    return classifier.fit(X, Y), "SVM"

In [16]:
from sklearn.ensemble import AdaBoostClassifier

@with_cross_validation
def ada_boost(X, Y, n_esti=50, lr=1.0, base_esti=None):
    """Fit an AdaBoost classifier and return ``(model, "AdaBoost")``.

    Parameters
    ----------
    n_esti : int, default 50
        Number of boosting rounds (``n_estimators``).
    lr : float, default 1.0
        Learning rate.
    base_esti : estimator or None, default None
        Weak learner to boost; ``None`` keeps scikit-learn's default.

    Because of the decorator the notebook calls this as
    ``ada_boost(X, Y, label, **options)``.
    """
    import inspect

    # scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 dropped
    # the old spelling, so use whichever keyword the installed version accepts.
    accepted = inspect.signature(AdaBoostClassifier).parameters
    weak_learner_key = 'estimator' if 'estimator' in accepted else 'base_estimator'
    adb = AdaBoostClassifier(n_estimators=n_esti, learning_rate=lr, random_state=seed,
                             **{weak_learner_key: base_esti})
    adb.fit(X, Y)
    return adb, "AdaBoost"

In [17]:
# logistic regression

from sklearn.linear_model import LogisticRegression

@with_cross_validation
def logistic_regression(X, Y, reg=1.0):
    """Fit an L1-penalised logistic regression and return ``(model, "LogisticRegression")``.

    ``reg`` is passed as ``C`` to ``LogisticRegression``. Because of the
    decorator the notebook calls this as
    ``logistic_regression(X, Y, label, **options)``.
    """
    classifier = LogisticRegression(
        C=reg,
        penalty='l1',
        solver='liblinear',
        random_state=seed,
    )
    return classifier.fit(X, Y), "LogisticRegression"

In [18]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

@with_cross_validation
def naive_bayes(X, Y):
    """Fit a Gaussian naive Bayes classifier and return ``(model, "NaiveBayes")``.

    Because of the decorator the notebook calls this as
    ``naive_bayes(X, Y, label, **options)``.
    """
    return GaussianNB().fit(X, Y), "NaiveBayes"

In [19]:
# Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

@with_cross_validation
def linear_discriminant(X, Y):
    """Fit a linear discriminant analysis classifier and return ``(model, "LDA")``.

    Because of the decorator the notebook calls this as
    ``linear_discriminant(X, Y, label, **options)``.
    """
    return LinearDiscriminantAnalysis().fit(X, Y), "LDA"

In [20]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

@with_cross_validation
def random_forest(X, Y, n_esti=100, depth=2, features=0.5, samples=0.5):
    """Fit a class-weighted random forest and return ``(model, "RandomForest")``.

    ``n_esti``, ``depth``, ``features`` and ``samples`` map to the forest's
    ``n_estimators``, ``max_depth``, ``max_features`` and ``max_samples``.
    Because of the decorator the notebook calls this as
    ``random_forest(X, Y, label, **options)``.
    """
    settings = dict(
        n_estimators=n_esti,
        criterion='entropy',
        max_depth=depth,
        max_features=features,
        bootstrap=True,
        class_weight='balanced',
        max_samples=samples,
        random_state=seed,
    )
    forest = RandomForestClassifier(**settings)
    return forest.fit(X, Y), "RandomForest"

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

@with_cross_validation
def gradient_classifier(X, Y, n_esti=100, depth=2, sample=1.0, lr=1.0, features=0.5):
    """Fit a gradient-boosting classifier and return ``(model, "GradientBoosting")``.

    ``n_esti``, ``depth``, ``sample``, ``lr`` and ``features`` map to the
    booster's ``n_estimators``, ``max_depth``, ``subsample``,
    ``learning_rate`` and ``max_features``. Because of the decorator the
    notebook calls this as ``gradient_classifier(X, Y, label, **options)``.
    """
    settings = dict(
        n_estimators=n_esti,
        learning_rate=lr,
        subsample=sample,
        max_depth=depth,
        max_features=features,
        random_state=seed,
    )
    booster = GradientBoostingClassifier(**settings)
    return booster.fit(X, Y), "GradientBoosting"

In [None]:
# MCC: 0.432 (pca=20)
# logistic_regression(X1, CO1, 'CO1', pca=20, rescale=True)
# MCC: 0.353
# logistic_regression(X1, CO1, 'CO1', var=0.5)
# MCC: 0.391
# logistic_regression(X1, CO1, 'CO1', uni=5)

# MCC: 0.367
# naive_bayes(X1, CO1, 'CO1', pca=10)
# MCC: 0.398
# naive_bayes(X1, CO1, 'CO1', var=1.0)
# MCC: 0.496 (uni=45)
# naive_bayes(X1, CO1, 'CO1', uni=45, rescale=True)

# MCC: 0.466
# linear_discriminant(X1, CO1, 'CO1', pca=20)
# MCC: 0.471
# linear_discriminant(X1, CO1, 'CO1', var=0.5)
# MCC: 0.403
# linear_discriminant(X1, CO1, 'CO1', uni=5)

# MCC: 0.438 (max_depth=2, max_features=0.5, max_samples=0.5)
# random_forest(X1, CO1, 'CO1', pca=10)
# MCC: 0.429 (max_depth=2, max_features=0.5, max_samples=0.5)
# random_forest(X1, CO1, 'CO1', var=1.5)
# MCC: 0.449
# random_forest(X1, CO1, 'CO1', uni=60)

# MCC: 0.418 (max_depth = 2, max_features = 0.5, subsample = 1.0)
# gradient_classifier(X1, CO1, 'CO1', pca=40)
# MCC: (not good performance) (max_depth = 2, max_features = 0.5, subsample = 1.0)
# gradient_classifier(X1, CO1, 'CO1', var=1.7)
# MCC: 0.483
# gradient_classifier(X1, CO1, 'CO1', uni=65, lr=0.7)

# Best CO1 configuration noted in this cell: RBF SVM on the 40 best univariate
# features, min-max scaled, no oversampling.
# MCC: 0.532 (kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False)
svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False)

In [None]:
# Number of training samples per CO1 class.
CO1.value_counts()

In [None]:
# MCC: 0.427
# logistic_regression(X1, CO2, 'CO2', pca=10)
# MCC: 0.424
# logistic_regression(X1, CO2, 'CO2', var=0.5)

# MCC: 0.357
# naive_bayes(X1, CO2, 'CO2', pca=10)
# MCC: 0.508
# naive_bayes(X1, CO2, 'CO2', uni=20, rescale=True)

# MCC: 0.364
# linear_discriminant(X1, CO2, 'CO2', pca=10)
# MCC: 0.379
# linear_discriminant(X1, CO2, 'CO2', var=0.9)
# MCC: 0.445
# linear_discriminant(X1, CO2, 'CO2', uni=10)

# MCC: 0.399 (max_depth=2, max_features=0.5, max_sample=0.5)
# random_forest(X1, CO2, 'CO2', pca=10)
# MCC: 0.498
# random_forest(X1, CO2, 'CO2', uni=50)

# MCC: 0.470
# gradient_classifier(X1, CO2, 'CO2', pca=40)
# MCC: 0.472
# gradient_classifier(X1, CO2, 'CO2', uni=15, lr=0.5)

# MCC: 0.551 (kernel='poly', degree=3, uni=15, C=1.0, max_scale=True, rescale=False)
# svm(X1, CO2, 'CO2', kernel='poly', degree=3, uni=15, C=1.0, max_scale=True, rescale=False)

In [None]:
# Number of training samples per CO2 class.
CO2.value_counts()

In [None]:
# MCC: 0.316
# logistic_regression(X2, CO3, 'CO3', pca=40)

# MCC: 0.295
# naive_bayes(X2, CO3, 'CO3', pca=10)
# MCC: 0.294
# naive_bayes(X2, CO3, 'CO3', var=2.0)
# MCC: 0.285
# naive_bayes(X2, CO3, 'CO3', uni=40)

# MCC: 0.304
# linear_discriminant(X2, CO3,'CO3', pca=20)
# MCC: 0.329
# linear_discriminant(X2, CO3,'CO3', var=2.0)

# MCC: 0.311 (max_depth=2, max_features=0.5, max_sample=0.5)
# random_forest(X2, CO3, 'CO3', pca=20)
# Random forest with its default settings on the 30 best univariate features.
# MCC: 0.310
random_forest(X2, CO3, 'CO3', uni=30)

# MCC: 0.219
# gradient_classifier(X2, CO3, 'CO3', n_esti=50, depth=1, uni=50, rescale=False)

# MCC: 0.361 (pca=25, kernel='poly', degree=2, C=0.8, range_scale=True, rescale=False)
# svm(X2, CO3, 'CO3', pca=25, kernel='poly', degree=2, C=0.8, range_scale=True, rescale=False)

# MCC: 0.354 (uni=35, kernel='rbf', C=0.9, weight={1:2.0, 0:0.7}, range_scale=True, rescale=False)
# svm(X2, CO3, 'CO3', uni=35, kernel='rbf', C=0.9, weight={1:2.0, 0:0.7}, range_scale=True, rescale=False)

# MCC: 0.324
# ada_boost(X2, CO3, 'CO3', n_esti=50, uni=25, lr=0.1, rescale=False)

# MCC: 0.303 (n_esti=20, uni=35, lr=0.5, rescale=False)
# ada_boost(X2, CO3, 'CO3', n_esti=20, uni=35, lr=0.5, rescale=False)

# MCC: does not perform well
# ada_boost(X2, CO3, 'CO3', n_esti=10, lr=0.5, rescale=False)

In [None]:
# Per-class weight n_samples / (2 * class_count) for CO3; the class count is
# hard-coded to 2. NOTE(review): np.bincount needs non-negative integer labels -
# confirm CO3's dtype.
len(CO3) / (2*np.bincount(CO3))

In [None]:
# Number of training samples per CO3 class.
CO3.value_counts()

In [None]:
# MCC: 0.098
# logistic_regression(X2, CO4, 'CO4', pca=30)
# MCC: 0.130 (var in the mcc is high)
# logistic_regression(X2, CO4, 'CO4', var=3.5)

# MCC: 0.195
# naive_bayes(X2, CO4, 'CO4', pca=15)
# MCC: 0.218
# naive_bayes(X2, CO4, 'CO4', var=3.5, rescale=True)
# MCC: 0.206 (15)
# naive_bayes(X2, CO4, 'CO4', uni=15, rescale=True)

# MCC: 0.182
# linear_discriminant(X2, CO4, 'CO4', pca=15)
# MCC: 0.205
# linear_discriminant(X2, CO4, 'CO4', var=2.5)

# MCC: 0.204 (pca=15, n_esti=100, depth=2, features=0.5, samples=0.5)
# random_forest(X2, CO4,'CO4', pca=15, n_esti=100, depth=2, features=0.5, samples=0.5, rescale=False)

# MCC: 0.112
# gradient_classifier(X2, CO4, 'CO4', pca=25)
# MCC: 0.203 (uni=25, n_esti=50, depth=1, sample=0.5, lr=1.0, features=0.5, range_scale=True, rescale=True)
# gradient_classifier(X2, CO4, 'CO4', uni=25, n_esti=50, depth=1, sample=0.5, lr=1.0, features=0.5, range_scale=True, rescale=True)

# MCC: 0.142
# svm(X2, CO4, 'CO4', uni=25, kernel='rbf', degree=2, C=0.6, max_scale=True, rescale=False)

# MCC: 
# svm(X2, CO4, 'CO4', uni=15, kernel='rbf', degree=2, C=1.0, range_scale=True, rescale=False)

# AdaBoost with 200 rounds on the 25 best univariate features, standard-scaled,
# no oversampling.
# MCC: 0.231 (uni=25, n_esti=200, lr=0.1, std_scale=True, rescale=False)
ada_boost(X2, CO4, 'CO4', uni=25, n_esti=200, lr=0.1, std_scale=True, rescale=False)

# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=1, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=1, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
# ada_boost(X2, CO4, 'CO4', base_esti=clf, n_esti=60, pca=12, lr=1.0, rescale=True)

# MCC: training on all features (no selection) does not perform well
# ada_boost(X2, CO4, 'CO4', n_esti=10, lr=0.5, rescale=False)

In [None]:
# Per-class weight n_samples / (2 * class_count) for CO4; the class count is
# hard-coded to 2. NOTE(review): np.bincount needs non-negative integer labels -
# confirm CO4's dtype.
len(CO4) / (2*np.bincount(CO4))

In [None]:
# 70/30 hold-out split of Dataset 2 against CO4, then keep only the features
# selected by a variance threshold of 10 fitted on the training part.
xtrain, xtest, ytrain, ytest = train_test_split(X2, CO4, test_size=0.3, random_state=100)
new_train, new_test = do_variance_thres(xtrain, xtest, 10)

In [None]:
# (rows, features) left after the variance filter.
new_train.shape

In [None]:
# Split the filtered training rows by label value (0 vs 1). `ytrain` is the
# CO4 hold-out label vector when the cells are run in order.
z = new_train[ytrain == 0]
o = new_train[ytrain==1]

# Scatter column 3 against column 5: label 0 in red, label 1 in blue, drawn on
# the same axes.
p = z.plot(kind='scatter', x=3, y=5, color='r')
o.plot(kind='scatter', x=3, y=5, color='b', ax=p)

In [None]:
# MCC: 0.846
# logistic_regression(X2, CO5, 'CO5', uni=10, rescale=True)

# MCC: 0.427
# naive_bayes(X2, CO5, 'CO5', pca=-1)

# MCC: 0.153
# linear_discriminant(X2, CO5, 'CO5', pca=20)

# MCC: 0.82 (max_depth=2, max_features=0.3) # cal. using only 1 run
# MCC: 0.440 (max_depth=2, max_features=0.5, pca=50)
# random_forest(X2, CO5, 'CO5', pca=50)

# MCC: 0.838
# svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False)

In [None]:
# MCC: -0.057
# logistic_regression(X2, CO6, 'CO6', pca=-1)
# MCC: 0.089 (high var in score)
# logistic_regression(X2, CO6, 'CO6', var=2.0)

# MCC: 0.085 (variance of the model is very high, but sometimes it yields good performance)
# naive_bayes(X2, CO6, 'CO6', pca=15)
# MCC: 0.069
# naive_bayes(X2, CO6, 'CO6', var=0.5)

# MCC: 0.091
# linear_discriminant(X2, CO6, 'CO6', pca=60, std_scale= True)
# MCC: 0.092
# linear_discriminant(X2, CO6, 'CO6', var=0.5)
# MCC: 0.122
# linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, rescale=True)

# MCC: 0.065 (max_depth=2, max_features=0.5)
# random_forest(X2, CO6, 'CO6', pca=10, n_esti=100, depth=2, features=0.5, samples=0.5, rescale=False)
# MCC: 0.113 (high var in score)
# random_forest(X2, CO6, 'CO6', uni=30)

# MCC: 0.088
# gradient_classifier(X2, CO6, 'CO6', pca=10)
# MCC: 0.142 (uni=25, n_esti=200, depth=1, sample=0.5, lr=0.5, features=0.5, std_scale=True)
# gradient_classifier(X2, CO6, 'CO6', uni=25, n_esti=100, depth=1, sample=0.5, lr=0.5, features=0.5, std_scale=True, rescale=True)

# MCC: 
# svm(X2, CO6, 'CO6', uni=10, kernel='rbf', degree=2, C=2.0, std_scale=True, rescale=False)

# RBF SVM with hand-set class weights on the 25 best univariate features,
# min-max scaled, no oversampling.
# MCC: (not recorded)
svm(X2, CO6, 'CO6', uni=25, kernel='rbf', C=1.0, weight={1:0.8, 0:1.2}, range_scale=True, rescale=False)

# MCC: 0.152 (n_esti=10, uni=150, std_scale=True, rescale=False)
# ada_boost(X2, CO6, 'CO6', n_esti=10, uni=150, std_scale=True, rescale=False)

# MCC:
# ada_boost(X2, CO6, 'CO6', n_esti=40, pca=75, lr=0.6, std_scale=True, rescale=True)

# MCC: training on all features (no selection) gives very poor performance
# ada_boost(X2, CO6, 'CO6', n_esti=10, lr=0.5, rescale=False)

In [None]:
# Per-class weight n_samples / (2 * class_count) for CO6; the class count is
# hard-coded to 2. NOTE(review): np.bincount needs non-negative integer labels -
# confirm CO6's dtype.
len(CO6)/(2*np.bincount(CO6))

In [None]:
# Submission 1 (0.365) [attempt_final1]
# Every call runs in test mode: the model is refit on the full training set and
# its predictions for the matching test set are cast to int. The number next to
# each label is the MCC noted for that configuration in the experiment cells.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.551
test_CO2 = svm(X1, CO2, 'CO2', kernel='poly', degree=3, uni=15, C=1.0, max_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO3 : 0.316
test_CO3 = logistic_regression(X2, CO3, 'CO3', pca=40, test=True, test_data=X2_test).astype(int)

# CO4 : 0.218
test_CO4 = naive_bayes(X2, CO4, 'CO4', var=3.5, rescale=True, test=True, test_data=X2_test).astype(int)

# CO5 : 0.838
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, test=True, test_data=X2_test).astype(int)

In [None]:
# Submission 2
# Every call runs in test mode: the model is refit on the full training set and
# its predictions for the matching test set are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.551
test_CO2 = svm(X1, CO2, 'CO2', kernel='poly', degree=3, uni=15, C=1.0, max_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO3 : 0.361
test_CO3 = svm(X2, CO3, 'CO3', pca=25, kernel='poly', degree=2, C=0.8, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO4 : 0.231
# NOTE(review): the 0.231 figure was noted for n_esti=200 in the CO4 experiment
# cell; this call uses n_esti=100 - confirm which one is intended.
test_CO4 = ada_boost(X2, CO4, 'CO4', uni=25, n_esti=100, lr=0.1, std_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO5 : 0.838
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.114
# NOTE(review): the CO6 experiment cell notes 0.152 for this exact
# configuration - confirm the figure.
test_CO6 = ada_boost(X2, CO6, 'CO6', n_esti=10, uni=150, std_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

In [None]:
# Submission 3 (not good; the only difference from Submission 1 is the CO4 model)
# Every call runs in test mode: the model is refit on the full training set and
# its predictions for the matching test set are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.551
test_CO2 = svm(X1, CO2, 'CO2', kernel='poly', degree=3, uni=15, C=1.0, max_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO3 : 0.316
test_CO3 = logistic_regression(X2, CO3, 'CO3', pca=40, test=True, test_data=X2_test).astype(int)

# CO4 : 0.231
# NOTE(review): the 0.231 figure was noted for n_esti=200 in the CO4 experiment
# cell; this call uses n_esti=100 - confirm which one is intended.
test_CO4 = ada_boost(X2, CO4, 'CO4', uni=25, n_esti=100, lr=0.1, std_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO5 : 0.838
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, test=True, test_data=X2_test).astype(int)

In [None]:
# Submission 4: 0.358 with seed 567 (attempt_final2.csv); 0.3804 with seed 1 (attempt_final4.csv)
# Every call runs in test mode: the model is refit on the full training set and
# its predictions for the matching test set are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.508
test_CO2 = naive_bayes(X1, CO2, 'CO2', uni=20, rescale=True, test=True, test_data=X1_test).astype(int)

# CO3 : 0.361
test_CO3 = svm(X2, CO3, 'CO3', pca=25, kernel='poly', degree=2, C=0.8, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO4 : 0.218
test_CO4 = naive_bayes(X2, CO4, 'CO4', var=3.5, rescale=True, test=True, test_data=X2_test).astype(int)

# CO5 : 0.832
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, test=True, test_data=X2_test).astype(int)

In [None]:
# Submission 5: 0.367 with seed 1 (attempt_final3.csv)
# Every call runs in test mode: the model is refit on the full training set and
# its predictions for the matching test set are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.508
test_CO2 = naive_bayes(X1, CO2, 'CO2', uni=20, rescale=True, test=True, test_data=X1_test).astype(int)

# CO3 : 0.361
test_CO3 = svm(X2, CO3, 'CO3', pca=25, kernel='poly', degree=2, C=0.8, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO4 : 0.231
# NOTE(review): the 0.231 figure was noted for n_esti=200 with std_scale=True in
# the CO4 experiment cell; this call uses n_esti=100 with range_scale=True -
# confirm which one is intended.
test_CO4 = ada_boost(X2, CO4, 'CO4', uni=25, n_esti=100, lr=0.1, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO5 : 0.832
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, rescale=True, test=True, test_data=X2_test).astype(int)

In [26]:
# Submission 6: 0.38109 with seed 1 (attempt_final5) - FINAL SUBMITTED: 1
# Every call runs in test mode: the model is refit on the full training set,
# its training MCC is printed, and its predictions for the matching test set
# are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.508
test_CO2 = naive_bayes(X1, CO2, 'CO2', uni=20, rescale=True, test=True, test_data=X1_test).astype(int)

# CO3 : 0.354
test_CO3 = svm(X2, CO3, 'CO3', uni=35, kernel='rbf', C=0.9, weight={1:2.0, 0:0.7}, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO4 : 0.218
test_CO4 = naive_bayes(X2, CO4, 'CO4', var=3.5, rescale=True, test=True, test_data=X2_test).astype(int)

# CO5 : 0.832
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, test=True, test_data=X2_test).astype(int)

SVM MCC: 0.6403346309638315
NaiveBayes MCC: 0.5603272937579981
SVM MCC: 0.5962169403415911
NaiveBayes MCC: 0.43014327602168023
SVM MCC: 0.870609213762706
LDA MCC: 0.38782723005395864


In [28]:
# Submission 7: 0.367 with seed 1 (attempt_final6.csv) - FINAL SUBMITTED: 2
# Every call runs in test mode: the model is refit on the full training set,
# its training MCC is printed, and its predictions for the matching test set
# are cast to int.
# CO1 : 0.532
test_CO1 = svm(X1, CO1, 'CO1', kernel='rbf', uni=40, C=0.5, range_scale=True, rescale=False, test=True, test_data=X1_test).astype(int)

# CO2 : 0.508
test_CO2 = naive_bayes(X1, CO2, 'CO2', uni=20, rescale=True, test=True, test_data=X1_test).astype(int)

# CO3 : 0.354
test_CO3 = svm(X2, CO3, 'CO3', uni=35, kernel='rbf', C=0.9, weight={1:2.0, 0:0.7}, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO4 : 0.231
# NOTE(review): the 0.231 figure was noted for n_esti=200 with std_scale=True in
# the CO4 experiment cell; this call uses n_esti=100 with range_scale=True -
# confirm which one is intended.
test_CO4 = ada_boost(X2, CO4, 'CO4', uni=25, n_esti=100, lr=0.1, range_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO5 : 0.832
test_CO5 = svm(X2, CO5, 'CO5', kernel='rbf', C=1.0, uni=10, max_scale=True, rescale=False, test=True, test_data=X2_test).astype(int)

# CO6 : 0.122
test_CO6 = linear_discriminant(X2, CO6, 'CO6', uni=10, std_scale=True, rescale=True, test=True, test_data=X2_test).astype(int)

SVM MCC: 0.6403346309638315
NaiveBayes MCC: 0.5603272937579981
SVM MCC: 0.5962169403415911
AdaBoost MCC: 0.7269133380753248
SVM MCC: 0.870609213762706
LDA MCC: 0.38782723005395864


In [29]:
import os

output_dir = '/kaggle/working'
filename = 'attempt_final6.csv'
results = [test_CO1, test_CO2, test_CO3, test_CO4, test_CO5, test_CO6]

# The submission file is the six prediction vectors laid end to end in
# CO1..CO6 order; Id is the running row number across all of them.
with open(os.path.join(output_dir, filename), 'w') as fp:
    fp.write('Id,Predicted\n')
    all_predictions = (prediction for vector in results for prediction in vector)
    fp.writelines(
        f'{row_id},{prediction}\n'
        for row_id, prediction in enumerate(all_predictions)
    )

In [None]:
# Number of training samples per CO4 class.
CO4.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of Dataset 2 (label CO6), then split the remaining 70% once more
# into the inner train/validation pair. Both splits are seeded so that a
# restart-and-run-all reproduces the same partitions.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X2, CO6, test_size=0.3, random_state=seed)
xtrain, xtest, ytrain, ytest = train_test_split(Xtrain, Ytrain, test_size=0.3, random_state=seed)

In [None]:
import matplotlib.pyplot as plt

# Learning rates to sweep.
# NOTE(review): the original cell read `rates`, `train_result` and `test_result`
# without defining them anywhere in the notebook, so it failed on a fresh
# kernel. This grid is a placeholder - replace it with the values intended.
rates = [1.0, 0.5, 0.25, 0.1, 0.05, 0.01]
train_result, test_result = [], []

for e in rates:
    # Seeded like the other estimators so the curve is reproducible.
    model = GradientBoostingClassifier(learning_rate=e, random_state=seed)
    model.fit(xtrain, ytrain)

    train_pred = model.predict(xtrain)
    train_result.append(matthews_corrcoef(ytrain, train_pred))

    test_pred = model.predict(xtest)
    test_result.append(matthews_corrcoef(ytest, test_pred))

# Train vs. validation MCC as a function of the learning rate.
line1 = plt.plot(rates, train_result, 'b', label='Train MCC')
line2 = plt.plot(rates, test_result, 'r', label='Test MCC')
plt.xlabel('learning rate')
plt.ylabel('MCC')
plt.legend()  # the labels above are only shown once a legend is drawn
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

def search(model, parms):
    """Grid-search ``model`` over ``parms`` for label CO6 and report the outcome.

    Dataset 2 is split 70/30 with a fixed seed, the 30 best univariate
    features are selected on the training part, and a 10-fold
    ``GridSearchCV`` scored by balanced accuracy is fitted on it.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator.
    parms : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` object. It expects the 30 selected features
    as input, not the raw feature matrix.
    """
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X2, CO6, test_size=0.3, random_state=154)
    # NOTE(review): the features are selected before the inner cross-validation,
    # so the CV folds share the selection - best_score_ may be optimistic.
    Xtrain, Xtest = do_univariate(Xtrain, Xtest, Ytrain, 30)

    # Use the grid that was passed in; the previous version ignored `parms`
    # and silently read the notebook-global `parameters` instead.
    clf = GridSearchCV(model, parms, cv=10, scoring='balanced_accuracy')
    clf.fit(Xtrain, Ytrain)

    train_pred = clf.predict(Xtrain)
    test_pred = clf.predict(Xtest)
    print(f'Best Params: {clf.best_params_}')
    print(f'Best Score: {clf.best_score_}')
    print(f'Train MCC: {matthews_corrcoef(Ytrain, train_pred)}')
    print(f'Test MCC: {matthews_corrcoef(Ytest, test_pred)}')
    # This line reports the training split; it used to be labelled "Test".
    print(f'Train balance acc: {balanced_accuracy_score(Ytrain, train_pred)}')
    print(f'Test balance acc: {balanced_accuracy_score(Ytest, test_pred)}')
    return clf

In [None]:
# Parameter grid for the random-forest search
parameters = {'n_estimators': [25, 50, 100],
              'max_samples': [0.5, 0.7, 0.99],
              'max_features': [0.2, 0.5, 0.7],
              'max_depth': [2, 5, 10, 15]
             }

# Seeded like the other estimators in the notebook so the search is repeatable.
model = RandomForestClassifier(criterion='entropy', random_state=seed)
clf = search(model, parameters)

In [None]:
# Parameter grid for the gradient-boosting search
parameters = {'learning_rate': [0.5, 0.75],
              'n_estimators': [50, 150],
              'subsample': [ 0.5, 0.75, 1.0],
              'max_depth': [2, 5],
              'max_features': [0.7, 1.0]
             }

# Seeded like the other estimators in the notebook so the search is repeatable.
model = GradientBoostingClassifier(random_state=seed)
clf = search(model, parameters)

In [None]:
# Score the tuned model on the hold-out part it was tuned against.
# `clf` was fitted inside search() on the 30 univariate-selected features of a
# split seeded with 154, so the same split and selection are rebuilt here. The
# notebook-global Xtest has every raw feature and comes from a different split,
# so clf.predict(Xtest) would not match the model's input.
search_Xtrain, search_Xtest, search_Ytrain, search_Ytest = train_test_split(
    X2, CO6, test_size=0.3, random_state=154)
_, search_Xtest = do_univariate(search_Xtrain, search_Xtest, search_Ytrain, 30)
ypred = clf.predict(search_Xtest)
matthews_corrcoef(search_Ytest, ypred)

In [None]:
# Refit a random forest with fixed hyper-parameters on the raw-feature CO6 split
# and score it on the hold-out part. Seeded like the other estimators in the
# notebook so the score is repeatable.
rf = RandomForestClassifier(criterion='entropy',max_depth=10, max_features=0.7, max_samples=0.7, n_estimators=100, random_state=seed)
rf.fit(Xtrain, Ytrain)
ypred = rf.predict(Xtest)
matthews_corrcoef(Ytest, ypred)