In [153]:
import os
import sys
from functools import reduce
from collections import deque
from multiprocessing import Pool
import numpy as np
from numpy import unicode
from sklearn.base import BaseEstimator, TransformerMixin


_PY2 = sys.version_info.major == 2
_STRING_TYPES = (str, unicode) if _PY2 else (str,)


class Relief(BaseEstimator, TransformerMixin):
  
    def __init__(self, **kwargs):
        
        kwargs = dict(kwargs)
        self.w_ = None

        def gen_random_state(rnd_state):
            """Generate random state instance"""
            if isinstance(rnd_state, np.random.RandomState):
                return rnd_state

            return np.random.RandomState(seed=rnd_state)

        for name, default_value, convf in (
                # Param name, default param value, param conversion function
                ('categorical', (), tuple),
                ('n_jobs', os.cpu_count(), int),
                ('n_iterations', 100, int),
                ('n_features', 1, int),
                ('random_state', None, gen_random_state)
        ):
            setattr(self, name, convf(kwargs.setdefault(name, default_value)))
            del kwargs[name]

        if self.n_jobs < 1:
            raise ValueError('n_jobs must be greater than 0')

        if kwargs:
            raise ValueError('Invalid arguments: %s' % ', '.join(kwargs))

    def fit(self, data, y):
       
        n, m = data.shape # Number of instances & features

        # Initialise state
        js = self.random_state.randint(n, size=self.n_iterations)

        # Compute weights
        if self.n_jobs > 1:
            results = deque()
            n_iterations = [int(np.floor(self.n_iterations / self.n_jobs))] * self.n_jobs
            n_iterations[-1] += int(np.floor(self.n_iterations % self.n_jobs))

            with Pool(processes=self.n_jobs) as pool:
                for n_iter, n_proc_iters in enumerate(n_iterations):
                    results.append(
                        pool.apply_async(
                            self._fit_iteration,
                            (data, y, n_iter * n_iterations[0], n_proc_iters, js)
                        )
                    )

                pool.close()
                pool.join()

            self.w_ = reduce(
                lambda a, res: a + res.get(),
                results,
                np.array([0.] * m)
            )
        else:
            self.w_ = self._fit_iteration(data, y, 0, self.n_iterations, js)

        self.w_ /= self.n_iterations

        return self

    def _fit_iteration(self, data, y, iter_offset, n_iters, js):
        w = np.array([0.] * data.shape[1])

        for i in range(iter_offset, n_iters + iter_offset):
            j = js[i]
            ri = data[j] # Random sample instance
            hit, miss = self._nn(data, y, j)

            w += np.array([
                self._diff(k, ri[k], miss[k])
                - self._diff(k, ri[k], hit[k])
                for k in range(data.shape[1])
            ])

        return w

    def _nn(self, data, y, j):
      
        ri = data[j]
        d = np.sum(
            np.array([
                self._diff(c, ri[c], data[:, c]) for c in range(len(ri))
            ]).T,
            axis=1
        )

        odata = data[d.argsort()]
        oy = y[d.argsort()]

        h = odata[oy == y[j]][0:1]
        m = odata[oy != y[j]][0]

        h = h[1] if h.shape[0] > 1 else h[0]

        return h, m

    def _diff(self, c, a1, a2):
       
        return (
            np.abs(a1 - a2) if c not in self.categorical
            else 1 - (a1 == a2)
        )

    def transform(self, data):
       
        n_features = np.round(
            data.shape[1] * self.n_features
        ).astype(np.int16) if self.n_features < 1 else self.n_features
        feat_indices = np.flip(np.argsort(self.w_), 0)[0:n_features]

        return data[:, feat_indices]

In [154]:
import pandas as pd
data_1 = pd.read_csv('Failure.csv',index_col=[0])

In [155]:
data_1

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw,Unnamed: 0.1
0,2017-12-25,Z3014P0L,ST4000DM000,4000787030016,1,117.0,155354624.0,0.0,0.0,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,2017-12-25,ZCH03K2S,ST12000NM0007,12000138625024,1,83.0,214770416.0,0.0,0.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,2017-11-02,Z3010CGY,ST4000DM000,4000787030016,1,118.0,172329256.0,0.0,0.0,93.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,2017-11-02,Z304JD17,ST4000DM000,4000787030016,1,110.0,28874688.0,0.0,0.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,2017-12-21,Z304KBFE,ST4000DM000,4000787030016,1,118.0,179188832.0,0.0,0.0,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,2017-09-22,Z1Z059HA,ST4000DX000,4000787030016,1,120.0,11488.0,0.0,0.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1554,2017-09-22,Z304L98W,ST4000DM000,4000787030016,1,112.0,46862432.0,0.0,0.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1555,2017-09-22,ZA13YP7Z,ST8000DM002,8001563222016,1,73.0,221910544.0,0.0,0.0,93.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1556,2017-03-31,S300WEE9,ST4000DM000,4000787030016,1,118.0,184473400.0,0.0,0.0,97.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0


In [156]:
data_0 = pd.read_csv('./data_Q1_2017/2017-02-07.csv')

In [157]:
data_0

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2017-02-07,MJ0351YNG9Z0XA,Hitachi HDS5C3030ALA630,3000592982016,0,100,0,135.0,108.0,127,...,,,,,,,,,,
1,2017-02-07,MJ0351YNG9WJSA,Hitachi HDS5C3030ALA630,3000592982016,0,100,0,136.0,104.0,126,...,,,,,,,,,,
2,2017-02-07,PL1321LAG34XWH,Hitachi HDS5C4040ALE630,4000787030016,0,100,0,134.0,101.0,130,...,,,,,,,,,,
3,2017-02-07,MJ0351YNGABYAA,Hitachi HDS5C3030ALA630,3000592982016,0,100,0,136.0,104.0,137,...,,,,,,,,,,
4,2017-02-07,Z305B2QN,ST4000DM000,4000787030016,0,119,211777272,,,91,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74929,2017-02-07,PL1331LAHD1AWH,HGST HMS5C4040BLE640,4000787030016,0,100,0,134.0,100.0,100,...,,,,,,,,,,
74930,2017-02-07,Z3016V0V,ST4000DM000,4000787030016,0,119,212364800,,,91,...,,,,,,,,,,
74931,2017-02-07,MJ1323YNG1LZJC,Hitachi HDS5C3030ALA630,3000592982016,0,100,0,100.0,0.0,124,...,,,,,,,,,,
74932,2017-02-07,S300YQXW,ST4000DM000,4000787030016,0,119,230749960,,,96,...,,,,,,,,,,


In [158]:
for i in range(160):
    data_1 = data_1.append(data_0.iloc[100*i:100*i+10],ignore_index=True)

In [159]:
len(data_1[data_1['failure']==0])

1600

In [160]:
data = data_1.fillna(0)

In [161]:
r = Relief(n_features=16,n_iterations = len(data_1))

In [162]:
 from sklearn.preprocessing import MaxAbsScaler
max_abs_scaler = MaxAbsScaler()

In [163]:
my_input_matrix = max_abs_scaler.fit_transform(np.array(data.drop(['failure','date','serial_number','model','capacity_bytes'], axis=1).values))

In [164]:
my_label_vector = np.array(data['failure'])

In [165]:
r.fit(my_input_matrix,my_label_vector)

Relief()

In [166]:
z = list(zip(r.w_,range(95)))

In [167]:
s_z = sorted(z,key=lambda x:x[0], reverse = True)

In [168]:
z = [i[1] for i in (s_z[:16])]

In [169]:
z

[15, 77, 48, 47, 32, 14, 1, 36, 44, 49, 39, 45, 4, 0, 46, 75]

In [170]:
for i in z:
    print(data.drop(['failure','date','serial_number','model','capacity_bytes'], axis=1).columns[i])

smart_9_raw
smart_241_raw
smart_195_normalized
smart_194_raw
smart_187_normalized
smart_9_normalized
smart_1_raw
smart_189_normalized
smart_193_normalized
smart_195_raw
smart_190_raw
smart_193_raw
smart_3_normalized
smart_1_normalized
smart_194_normalized
smart_240_raw


In [85]:
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler


q_4 = np.zeros(90)
file_dir = "./data_Q4_2017/"  # file directory
all_csv_list = os.listdir(file_dir)  # get csv list
for single_csv in all_csv_list:
    if single_csv.find(".csv", 0, len(single_csv)) != -1:
        print(os.path.join(file_dir, single_csv))
        data = pd.read_csv(os.path.join(file_dir, single_csv)).fillna(0)
        max_abs_scaler = MaxAbsScaler()
        my_input_matrix = max_abs_scaler.fit_transform(np.array(data.drop(['failure','date','serial_number','model','capacity_bytes'], axis=1).values))
        my_label_vector = np.array(data['failure'])
        r = Relief(n_features=16,n_iterations = 100)
        r.fit(my_input_matrix, my_label_vector)
        q_4 += r.w_

./data_Q4_2017/2017-12-25.csv
./data_Q4_2017/2017-11-02.csv
./data_Q4_2017/2017-12-21.csv
./data_Q4_2017/2017-10-14.csv
./data_Q4_2017/2017-10-26.csv
./data_Q4_2017/2017-12-18.csv
./data_Q4_2017/2017-12-20.csv
./data_Q4_2017/2017-10-06.csv
./data_Q4_2017/2017-11-17.csv
./data_Q4_2017/2017-12-05.csv
./data_Q4_2017/2017-10-28.csv
./data_Q4_2017/2017-11-27.csv
./data_Q4_2017/2017-10-18.csv
./data_Q4_2017/2017-12-29.csv
./data_Q4_2017/2017-10-12.csv
./data_Q4_2017/2017-10-31.csv
./data_Q4_2017/2017-10-27.csv
./data_Q4_2017/2017-12-24.csv
./data_Q4_2017/2017-12-02.csv
./data_Q4_2017/2017-11-08.csv
./data_Q4_2017/2017-12-17.csv
./data_Q4_2017/2017-11-12.csv
./data_Q4_2017/2017-12-13.csv
./data_Q4_2017/2017-11-24.csv
./data_Q4_2017/2017-11-05.csv
./data_Q4_2017/2017-10-15.csv
./data_Q4_2017/2017-10-29.csv
./data_Q4_2017/2017-12-23.csv
./data_Q4_2017/2017-12-15.csv
./data_Q4_2017/2017-11-09.csv
./data_Q4_2017/2017-10-03.csv
./data_Q4_2017/2017-10-04.csv
./data_Q4_2017/2017-10-02.csv
./data_Q4_