In [6]:
#!/usr/bin/env python

import os
import random
import csv
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import Dataset

# Seed the global RNGs of the stdlib `random` module and of NumPy with fixed values.
random.seed(876542345)
np.random.seed(327674)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably only reaches child processes, not this one — confirm intent.
os.environ['PYTHONHASHSEED'] = str(56378234)

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import KernelPCA
from sklearn.decomposition import FastICA
from torch.utils.data import Dataset

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern

import numpy as np
from scipy import signal

class spcup23_ds(Dataset):
    """
    In-memory dataset of per-subject ``fnc.npy`` / ``icn_tc.npy`` arrays.

    Every item is a tuple ``(folder, label, fnc, icn_tc)`` where ``folder`` is
    the name of the subject directory, ``fnc`` / ``icn_tc`` are the squeezed
    arrays loaded from that directory, and ``label`` follows the Kaggle rule
    below (``-1`` is stored for the unlabeled test split).

    Kaggle:
        > For the binarized labeling, use 0 for schizophrenia (SZ) and 1 for Bipolar (BP).
    """

    def __init__(self, path, transform = None, test = False, seed=2137, shuffle=False):
        """
        Load the train or the test split from disk into RAM.

        Args:
            path: Dataset root (``str`` or ``Path``). The train split is read
                from ``train/BP`` and ``train/SZ``, the test split from ``test``.
            transform: Optional callable applied to the item tuple in
                ``__getitem__``; its return value is handed to the caller.
            test: Load the test split when true, otherwise the train split.
            seed: Seed of the private ``random.Random`` used when ``shuffle`` is true.
            shuffle: Shuffle the items after they have been sorted by folder name.
        """
        self._seed = seed
        self._shuffle = shuffle
        self._transform = transform

        # path to the dataset directory
        if not isinstance(path, Path):
            path = Path(path)
        self._path = path

        # Create container for dataset items
        self._data = []

        # Iteration cursor; set here so next() also works before iter() was called
        self._n = 0

        # Load dataset into RAM
        if test:
            self._test_ds2ram()
        else:
            self._train_ds2ram()

    def _load_subjects(self, directory, label):
        """Append one ``(folder, label, fnc, icn_tc)`` item per ``sub*`` entry of *directory*."""
        for subject_dir in directory.glob("sub*"):
            fnc = np.load(subject_dir / "fnc.npy").squeeze()
            icn_tc = np.load(subject_dir / "icn_tc.npy").squeeze()

            # Path.name yields the last path component whatever separator the
            # OS uses; splitting str(path) on "/" returned the whole path on
            # Windows, which leaked into both the item ID and the sort order.
            self._data.append((subject_dir.name, label, fnc, icn_tc))

    def _order_items(self):
        """Sort the items by folder name, then optionally apply the seeded shuffle."""
        # Deterministic starting point before shuffle (glob order is not guaranteed)
        self._data.sort(key=lambda item: item[0])

        if self._shuffle:
            random.Random(self._seed).shuffle(self._data)

    def _train_ds2ram(self):
        """Load the labeled train split: BP subjects get label 1, SZ subjects label 0."""
        train_root = self._path / "train"
        for label, class_dir in ((1, train_root / "BP"), (0, train_root / "SZ")):
            self._load_subjects(class_dir, label)
        self._order_items()

    def _test_ds2ram(self):
        """Load the unlabeled test split; every item carries the placeholder label -1."""
        self._load_subjects(self._path / "test", -1)
        self._order_items()

    def __len__(self):
        """Return the number of loaded items."""
        return len(self._data)

    def __iter__(self):
        """Reset the cursor and return ``self`` as the iterator.

        NOTE: the cursor lives on the dataset object, so nested loops over the
        same instance share (and reset) one position.
        """
        self._n = 0
        return self

    def __next__(self):
        """Return the item at the cursor and advance it; raise StopIteration at the end."""
        if self._n >= len(self):
            raise StopIteration
        result = self[self._n]
        self._n += 1
        return result

    def __getitem__(self, idx):
        """
        Return item *idx* as ``(folder, label, fnc, icn_tc)``.

        The arrays are copies, so callers and transforms cannot modify the
        cached data. When a transform was given, its result is returned instead.
        """
        filename, label, fnc, icn_tc = self._data[idx]
        sample = (filename, label, fnc.copy(), icn_tc.copy())
        if self._transform:
            sample = self._transform(sample)

        return sample

def subject_id(filename):
    """Return the last component of *filename*, treating both slash kinds as separators."""
    return str(filename).replace("\\", "/").rsplit("/", 1)[-1]


def write_predictions(ds, out_path, pca, feat_selector, clf):
    """
    Write an ``ID,Predicted`` CSV with one row per item of *ds*.

    Each item's ``fnc`` vector is passed, as a single-row batch, through
    ``pca.transform`` and ``feat_selector.transform``; the value written is
    column 1 of ``clf.predict_proba`` for that row.

    Args:
        ds: Indexable dataset yielding ``(filename, label, fnc, icn_tc)`` tuples.
        out_path: Destination CSV path (overwritten).
        pca: Fitted projector exposing ``transform``.
        feat_selector: Fitted selector exposing ``transform``.
        clf: Fitted classifier exposing ``predict_proba``.
    """
    # The csv module requires newline='' on the file object; without it every
    # row is followed by an empty line on platforms that translate "\n".
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["ID", "Predicted"])
        for i in tqdm(range(len(ds))):
            filename, _label, fnc, _icn_tc = ds[i]
            x = pca.transform(fnc[np.newaxis, :])
            x = feat_selector.transform(x)
            prob = clf.predict_proba(x)[:, 1][0]
            writer.writerow([subject_id(filename), prob])


if __name__ == "__main__":

    # Load datasets
    _DS_PATH = Path("../dataset")
    train_ds = spcup23_ds(_DS_PATH, test=False)
    test_ds = spcup23_ds(_DS_PATH, test=True)

    # Prepare the train matrix: one row per subject, filled with its fnc vector.
    # Item 0 is used to probe the feature width so a one-item dataset also works.
    X = np.zeros((len(train_ds), train_ds[0][2].shape[0]))
    Y = np.zeros(len(train_ds))
    for i, (filename, label, fnc, icn_tc) in enumerate(train_ds):
        X[i] = fnc
        Y[i] = label

    X_test = np.zeros((len(test_ds), test_ds[0][2].shape[0]))
    for i, (filename, label, fnc, icn_tc) in enumerate(test_ds):
        X_test[i] = fnc

    X_tt = np.concatenate((X, X_test))

    # The projection is fitted on train and test features together (no labels involved).
    pca = KernelPCA(n_components=300, kernel='linear')
    pca.fit(X_tt)

    # X_tt / X_test are not consumed below; they are kept under their original
    # names in case later cells of the session rely on them.
    X_tt = pca.transform(X_tt)
    X = pca.transform(X)
    X_test = pca.transform(X_test)

    # Boruta feature selection on the projected train data, driven by a random forest
    RF = RandomForestClassifier(n_estimators=120, criterion="entropy", max_depth=6,
                                random_state=42, min_samples_split=6, n_jobs=-1)
    feat_selector = BorutaPy(RF, n_estimators='auto', verbose=1, random_state=1,
                             max_iter=1300, perc=43)
    feat_selector.fit(X, Y)
    X = feat_selector.transform(X)

    clf = SVC(kernel='rbf', C=3e-3, random_state=2792, probability=True,)
    clf.fit(X, Y)

    # Create predictions for the whole TRAIN dataset
    write_predictions(train_ds, '44_train.csv', pca, feat_selector, clf)

    # Create predictions for the whole TEST dataset
    write_predictions(test_ds, '44_test.csv', pca, feat_selector, clf)

Iteration: 1 / 1300
Iteration: 2 / 1300
Iteration: 3 / 1300
Iteration: 4 / 1300
Iteration: 5 / 1300
Iteration: 6 / 1300
Iteration: 7 / 1300
Iteration: 8 / 1300
Iteration: 9 / 1300
Iteration: 10 / 1300
Iteration: 11 / 1300
Iteration: 12 / 1300
Iteration: 13 / 1300
Iteration: 14 / 1300
Iteration: 15 / 1300
Iteration: 16 / 1300
Iteration: 17 / 1300
Iteration: 18 / 1300
Iteration: 19 / 1300
Iteration: 20 / 1300
Iteration: 21 / 1300
Iteration: 22 / 1300
Iteration: 23 / 1300
Iteration: 24 / 1300
Iteration: 25 / 1300
Iteration: 26 / 1300
Iteration: 27 / 1300
Iteration: 28 / 1300
Iteration: 29 / 1300
Iteration: 30 / 1300
Iteration: 31 / 1300
Iteration: 32 / 1300
Iteration: 33 / 1300
Iteration: 34 / 1300
Iteration: 35 / 1300
Iteration: 36 / 1300
Iteration: 37 / 1300
Iteration: 38 / 1300
Iteration: 39 / 1300
Iteration: 40 / 1300
Iteration: 41 / 1300
Iteration: 42 / 1300
Iteration: 43 / 1300
Iteration: 44 / 1300
Iteration: 45 / 1300
Iteration: 46 / 1300
Iteration: 47 / 1300
Iteration: 48 / 1300
I

  0%|          | 0/471 [00:00<?, ?it/s]

  0%|          | 0/315 [00:00<?, ?it/s]