In [1]:
from model import *
from data_preprocessing import *
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
def test_on_folds(model):
    with open('folds.pkl', 'rb') as f:
        folds = pickle.load(f)
    
    accuracy = 0
    for train, test in zip(
        [folds[:i] + folds[i + 1:] for i in range(len(folds))],
        folds
    ):
        train = functools.reduce(lambda x, y: x + y, train)
        model.train(train)
        accuracy += model.predict_test(test)
        print(accuracy)
    
    print(f'Accuracy: {accuracy / 5}')

In [3]:
def test_on_test_train(model):
    with open('folds.pkl', 'rb') as f:
        folds = pickle.load(f)
    
    accuracy = 0
    for train, test in zip(
        [folds[:i] + folds[i + 1:] for i in range(len(folds))],
        folds
    ):
        train = functools.reduce(lambda x, y: x + y, train)
        model.train(train)
        accuracy += model.predict_test(test)
        print(accuracy)
    
    print(f'Accuracy: {accuracy / 5}')

In [4]:
class Model:
    labels = {
        'ABSZ': 30, 
        'CPSZ': 8, 
        'FNSZ': 3, 
        'GNSZ': 7, 
        'SPSZ': 67, 
        'TCSZ': 60, 
        'TNSZ': 44
    }
    
    def __init__(self, data_dir, model, sklearn_=False, pytorch_=True, is_cuda=False):
        self.dir = data_dir
        self.model = model
        self.is_cuda = is_cuda
        self.have_label = True
        if pytorch_:
            self.pytorch = True
            self.sklearn = False
        else:
            self.pytorch = False
            self.sklearn = True
            
    def read_data(self, files, dir_new, df=False):
        cnt = 0
        if os.path.isdir(dir_new):
            shutil.rmtree(dir_new)
            os.mkdir(dir_new)
        else:
            os.mkdir(dir_new)
            
        if df:
            df_array_flag = True
            lst_y = []
        
        for file in tqdm_notebook(files):
            with open(f'{self.dir}/{file}', 'rb') as f:
                data = pickle.load(f)
                data, label = data[1], data[0]
                if label not in self.labels.keys():
                    continue
                
                np.random.shuffle(data)
                while len(data) < self.labels[label]:
                    data = np.concatenate([data, data])
                
                data = data[:self.labels[label]]
                
                if df and df_array_flag:
                    df_array_flag = False
                    df_array = data
                    lst_y.extend([labels_2_num[label]] * self.labels[label])
                elif df:
                    df_array = np.concatenate([df_array, data])
                    lst_y.extend([labels_2_num[label]] * self.labels[label])
                
                for i in range(len(data)):
                    with open(f'{dir_new}/seiz_{cnt}.pkl', 'wb') as f:
                        pickle.dump(data_tuple(label, data[i]), f)
                        cnt += 1
        if df:
            df_array = np.concatenate([df_array, np.array(lst_y).reshape(-1, 1)], axis=1)
            return pd.DataFrame(df_array, columns=np.arange(901))
        
    def read_data_stds(self, files, dir_new):
        df_array_flag = True
        lst_y = []
        
        for file in tqdm_notebook(files):
            with open(f'{self.dir}/{file}', 'rb') as f:
                data = pickle.load(f)
                data, label = data[1], data[0]
                if label not in self.labels.keys():
                    continue
                
                data = np.array([
                    *list(np.std(data, axis=0)),
                    *list(np.mean(data, axis=0)),
                    *list(np.max(data, axis=0)),
                    *list(np.min(data, axis=0)),
                ]).reshape(1, -1)
                
                if df_array_flag:
                    df_array_flag = False
                    df_array = data
                    lst_y.append(labels_2_num[label])
                else:
                    df_array = np.concatenate([df_array, data], axis=0)
                    lst_y.append(labels_2_num[label])
        
        df_array = np.concatenate([df_array, np.array(lst_y).reshape(-1, 1)], axis=1)
        return pd.DataFrame(df_array, columns=np.arange(df_array.shape[1]))
                    
    
    def train(self, train):
        print('Starting prepare data...')
        if self.sklearn:
            self.df = self.read_data_stds(files=train, dir_new=f'{self.dir}_')
            # self.df = self.read_data(files=train, dir_new=f'{self.dir}_', df=True)
        else:
            self.dir_new = f'{self.dir}_'
            self.read_data(files=train, dir_new=self.dir_new)
        print('Preparing data finished. Starting train model...')
        
        if self.sklearn:
            self.df.iloc[:, :3600] = MinMaxScaler().fit_transform(self.df.iloc[:, :3600])
            self.model.fit(self.df.iloc[:, :3600], self.df.iloc[:, 3600])
        else:
            dataset_ = Dataset(self.dir_new)
            self.dataloader_ = data_utils.DataLoader(
                dataset=dataset_, 
                batch_size=128,
                shuffle=True,
            )
            if self.is_cuda:
                self.model = self.model.float().cuda()

            loss_fn = nn.CrossEntropyLoss()
            optimizer_ft = optim.Adam(self.model.parameters(), lr=0.001)
            exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=17, gamma=0.1)

            self.model, losses = train_model(self.model, loss_fn, optimizer_ft, self.dataloader_, is_cuda=self.is_cuda, num_epochs=15)
            
        print('Model training finished.')
    
    def predict(self, test, ret_true=False):
        y_true = []
        y_pred = []
        if self.sklearn:
            for file in test:
                with open(f'{self.dir}/{file}', 'rb') as f:
                    data = pickle.load(f)
                y_true.append(labels_2_num[data[0]])
                
                data = data[1]
                np.random.shuffle(data)
                array = np.array(data[:128])
                pred = self.model.predict(MinMaxScaler().fit_transform(array))
                pred = np.array(list(map(int, list(pred))))
                counts = np.bincount(pred)
                
                y_pred.append(np.argmax(counts))
        else:
            for file in test:
                with open(f'{self.dir}/{file}', 'rb') as f:
                    data = pickle.load(f)
                y_true.append(labels_2_num[data[0]])
                
                tensor = torch.Tensor(data[1][:128])
                if self.is_cuda:
                    tensor = tensor.cuda()
                pred = self.model(tensor)
                pred = pred.std(dim=0)
                
                y_pred.append(pred.argmax().cpu().tolist())
                
                
        if ret_true:
            return y_pred, y_true
        else:
            return y_pred
        
    def predict_stds(self, test, ret_true=False):
        if self.sklearn:
            y_true = []
            y_pred = []
            flag = True
            
            for file in test:
                with open(f'{self.dir}/{file}', 'rb') as f:
                    data = pickle.load(f)
                if self.have_label:
                    y_true.append(labels_2_num[data[0]])
                else:
                    ret_true = False
                    ret_letters = True
                # data = data[1]                
                data = np.array([
                    *list(np.std(data, axis=0)),
                    *list(np.mean(data, axis=0)),
                    *list(np.max(data, axis=0)),
                    *list(np.min(data, axis=0)),
                ]).reshape(1, -1)
                if flag:
                    df_array = data
                    flag = False
                else:
                    df_array = np.concatenate([df_array, data])
                
                    
            y_pred = self.model.predict(MinMaxScaler().fit_transform(df_array))
        else:
            y_true = []
            y_pred = []
            
            for file in test:
                with open(f'{self.dir}/{file}', 'rb') as f:
                    data = pickle.load(f)
                y_true.append(labels_2_num[data[0]])
                
                tensor = torch.Tensor(data[1][:128])
                if self.is_cuda:
                    tensor = tensor.cuda()
                pred = self.model(tensor)
                pred = pred.std(dim=0)
                
                y_pred.append(pred.argmax().cpu().tolist())
                
                
        if ret_true:
            return y_pred, y_true
        elif ret_letters:
            y_pred_cls = []
            for i, y_ in enumerate(y_pred):
                y_pred_cls.append(labels[int(y_)])
            return y_pred_cls
        return y_pred
    
    # def predict_test(self, test):
    #     pred, true = self.predict_stds(test, ret_true=True)
    #     from sklearn.metrics import f1_score
    #     return f1_score(true, pred, average='weighted')
    
    def predict_test(self, test, have_label=False):
        self.have_label = have_label
        if self.have_label:
            pred, true = self.predict_stds(test, ret_true=True)
            from sklearn.metrics import f1_score
            return f1_score(true, pred, average='weighted')
        else:
            predict = self.predict_stds(test)
            return predict

In [0]:
  import warnings
warnings.filterwarnings('ignore')

In [6]:
from sklearn.neighbors import KNeighborsClassifier

### Make submission

In [9]:
files_list = [f'test{i}.pkl' for i in range(327)]
model = Model(data_dir='./pp2', model=KNeighborsClassifier(n_neighbors=5), pytorch_=False,)
model.train([f'seiz_{i}.pkl' for i in range(1, 1881)])
prediction = model.predict_test(files_list, have_label=False)
files_list = " ".join(files_list).replace(".pkl", "")
files_list = files_list.split()
final_df = pd.DataFrame({'id': files_list, 'label': prediction}, index=None)
final_df.to_csv('./predicted.csv', index=False)

Starting prepare data...


HBox(children=(IntProgress(value=0, max=1880), HTML(value='')))


Preparing data finished. Starting train model...
Model training finished.


In [10]:
final_df['label'].value_counts()

FNSZ    164
GNSZ     73
CPSZ     54
TNSZ     14
SPSZ     11
ABSZ      7
TCSZ      4
Name: label, dtype: int64