In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *
plt.style.use('ggplot')

In [2]:
import pandas as pd

def process_data(data_paths):
    rows = []
    for data_idx, data_path in enumerate(data_paths):
        data = np.load(data_path, allow_pickle=True)
        for event in data:
            features = build_features(event)
            rows.append({'label': data_idx, **features})
    
    return pd.DataFrame(rows)

In [4]:
polymer_df = process_data(['../data/AA66266AA.npy', '../data/AA66466AA.npy', '../data/AA66566AA.npy'])

In [5]:
polymer_df

Unnamed: 0,label,num_signals,duration,max_current,min_current,mean_current,std_current,num_extrema,mean_extrema,std_extrema,mean_extrema_diff
0,0,425,4.32,56.370224,16.413359,34.788353,9.069802,31,18.900551,18.616003,15.244028
1,0,956,9.65,67.736397,42.550556,55.255898,4.654985,79,30.133924,25.596399,8.365108
2,0,1485,14.91,65.653435,40.036980,50.781509,4.121673,106,29.375677,22.110746,8.004821
3,0,109,1.14,55.258640,28.029890,41.888321,8.434102,11,21.684065,22.254078,13.589338
4,0,137,1.50,41.879601,19.437662,31.943779,5.805984,15,16.416134,16.347281,9.743867
...,...,...,...,...,...,...,...,...,...,...,...
140148,2,1933,19.42,72.652290,23.673409,49.773766,7.695426,159,29.680387,21.369534,12.854592
140149,2,682,6.92,65.707520,21.862839,48.241112,9.603276,51,26.470976,24.333487,16.832687
140150,2,127,1.33,62.715408,38.863491,52.008274,5.030526,13,26.653801,26.358740,10.330729
140151,2,88,0.99,66.379051,32.530750,53.133121,8.808292,9,27.936756,28.462580,14.161543


In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

label = 'label'
features = list(set(polymer_df.columns) - set([label]))

train_df, test_df = train_test_split(polymer_df, test_size=0.3, shuffle=True, random_state=42)

model = SVC()
model.fit(train_df[features], train_df[label])

preds = model.predict(test_df[features])
accuracy_score(test_df[label], preds), f1_score(test_df[label], preds)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

label = 'label'
features = list(set(polymer_df.columns) - set([label]))

train_df, test_df = train_test_split(polymer_df, test_size=0.3, shuffle=True, random_state=42)

model = RandomForestClassifier(class_weight='balanced', min_samples_leaf=4)
model.fit(train_df[features], train_df[label])

preds = model.predict(test_df[features])
accuracy_score(test_df[label], preds), f1_score(test_df[label], preds, average='weighted')

(0.7147885649051039, 0.7177447359911406)

In [7]:
preds = model.predict(train_df[features])
accuracy_score(train_df[label], preds), f1_score(train_df[label], preds, average='weighted')

(0.9999796140948148, 0.9999796140142967)