# Import Packages

In [1]:
from packages import *
%matplotlib inline

# Read Dataset

In [2]:
# Load the feature table from a CSV path relative to the notebook's working directory
df = pd.read_csv('data/Wavelet_features.csv')
# Preview the first rows to confirm the file parsed as expected
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,label
0,2.944439,8,2,0.001317,0.006354,0.999421,0.467338,0.662373,0.438738,2.944439,...,6.688355,587,587,-0.018317,3.1e-05,0.017297,5.963507e-05,0.030143,0.0009086033,0
1,2.944439,13,1,-0.045173,-0.001142,0.000642,-0.019399,0.030064,0.000904,2.944439,...,6.688355,486,486,-0.000264,1.9e-05,0.000277,5.867482e-07,0.000674,4.546379e-07,1
2,2.944439,10,1,-1.34284,-0.010283,0.000819,-0.60067,0.928463,0.862043,2.944439,...,6.688355,602,602,-0.003761,2.6e-05,0.003804,-1.86727e-05,0.006238,3.891848e-05,0
3,2.944439,10,1,-0.015038,0.021864,1.563694,0.741711,1.070447,1.145858,2.944439,...,6.688355,448,446,-0.01376,-0.000268,0.012652,-3.762953e-05,0.025086,0.0006293154,0
4,2.944439,12,2,-0.242719,-0.011091,0.002345,-0.095093,0.12894,0.016626,2.944439,...,6.688355,529,529,-0.00794,0.000602,0.008097,-2.656144e-06,0.013116,0.0001720174,1


In [3]:
# True if at least one cell anywhere in the frame is missing (NaN/None)
df.isna().to_numpy().any()

False

In [4]:
# Compare the frame's shape with and without exact duplicate rows;
# a smaller row count in the second tuple means duplicates exist.
shape_all_rows = df.shape
shape_unique_rows = df.drop_duplicates().shape
shape_all_rows, shape_unique_rows

((1494, 73), (1492, 73))

# Train-test split

In [5]:
# 80/20 split, stratified on 'label' so both parts keep the same class ratio;
# random_state is fixed so the split is reproducible.
# NOTE(review): df is split as loaded, i.e. it still contains the duplicate rows
# detected by the drop_duplicates() check above. A duplicated pair could end up
# with one copy in train and one in test (leakage) - consider de-duplicating
# before splitting.
train, test = train_test_split(df, test_size=0.2, random_state = 42, stratify = df['label'])

In [6]:
# Absolute number of samples per class in the training split
train['label'].value_counts()

1    644
0    551
Name: label, dtype: int64

In [7]:
# Class proportions in the training split (counts divided by the number of training rows)
train['label'].value_counts()/train.shape[0]

1    0.538912
0    0.461088
Name: label, dtype: float64

In [8]:
# Absolute number of samples per class in the test split
test['label'].value_counts()

1    161
0    138
Name: label, dtype: int64

In [9]:
# Class proportions in the test split; should match the training proportions because the split is stratified
test['label'].value_counts()/test.shape[0]

1    0.538462
0    0.461538
Name: label, dtype: float64

In [10]:
# The last column holds the target; every column before it is a feature.
# Row indices are reset so each piece is numbered 0..n-1 after the shuffled split.
feature_cols = train.columns[:-1]
label_col = train.columns[-1]

X_train = train[feature_cols].reset_index(drop=True)
X_test = test[feature_cols].reset_index(drop=True)

y_train = train[label_col].reset_index(drop=True)
y_test = test[label_col].reset_index(drop=True)

In [11]:
# One-column frame listing the feature names before any selection step;
# its length is the baseline feature count.
features = pd.DataFrame(X_train.columns.tolist())

In [12]:
# Number of candidate features before any filtering
len(features)

72

In [13]:
# Preview the training features (label column already removed)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,2.944439,7,1,-0.077992,-0.000328,0.000319,-0.036379,0.058748,0.003451,2.944439,...,4e-06,6.688355,462,460,-0.000394,-8e-06,0.000371,-3e-06,0.000828,6.849061e-07
1,2.944439,9,1,-0.001595,0.001238,0.313595,0.124417,0.307404,0.094497,2.944439,...,0.010545,6.688355,497,497,-0.025454,0.000767,0.025156,-1.8e-05,0.037152,0.00138027
2,2.944439,10,1,-0.001734,0.005142,0.036282,0.012328,0.018987,0.00036,2.944439,...,0.000214,6.688355,615,615,-0.001735,-3.9e-05,0.001816,-4e-06,0.0041,1.681127e-05
3,2.944439,5,1,-0.00625,0.000207,0.315815,0.084383,0.282924,0.080046,2.944439,...,0.004197,6.688355,453,453,-0.010819,0.000106,0.011225,-1.1e-05,0.019052,0.0003629776
4,2.944439,8,1,-2.803438,-0.008478,0.007851,-1.256433,1.960366,3.843035,2.944439,...,0.009636,6.688355,600,600,-0.027457,-0.000592,0.027881,6.5e-05,0.052705,0.002777863


In [14]:
# (rows, columns) of the training features before any filtering
X_train.shape

(1195, 72)

# Filter Methods for Feature Selection

### 1. Pearson's Correlation

In [15]:
# Absolute pairwise Pearson correlation, computed on the training features only
# so the test split plays no part in deciding which columns are removed.
corr_matrix = X_train.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every pair
# of features is inspected once; all other cells become NaN.
# The builtin `bool` is used here: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its correlation with any earlier column exceeds 0.9
# (NaN entries compare as False, so they never trigger a drop).
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Remove the same columns from both splits so they stay aligned
X_train = X_train.drop(to_drop, axis=1)
X_test = X_test.drop(to_drop, axis=1)

print(len(features)-len(to_drop), 'features selected')

44 features selected


In [16]:
# Preview the training features that survived the correlation filter
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,10,11,...,49,51,55,57,58,60,64,66,67,69
0,2.944439,7,1,-0.077992,-0.000328,0.000319,-0.036379,0.058748,7,2,...,-9e-06,-6.6e-05,240,-0.000808,-3.1e-05,-2.2e-05,462,-0.000394,-8e-06,-3e-06
1,2.944439,9,1,-0.001595,0.001238,0.313595,0.124417,0.307404,9,8,...,0.008028,0.001701,281,-0.065152,0.000881,0.001589,497,-0.025454,0.000767,-1.8e-05
2,2.944439,10,1,-0.001734,0.005142,0.036282,0.012328,0.018987,9,8,...,0.002585,-5.8e-05,230,-0.005999,-0.000389,-6e-05,615,-0.001735,-3.9e-05,-4e-06
3,2.944439,5,1,-0.00625,0.000207,0.315815,0.084383,0.282924,13,8,...,0.005752,0.002153,332,-0.042558,0.003102,0.000479,453,-0.010819,0.000106,-1.1e-05
4,2.944439,8,1,-2.803438,-0.008478,0.007851,-1.256433,1.960366,12,4,...,-0.00895,0.000728,220,-0.061601,0.001126,0.001962,600,-0.027457,-0.000592,6.5e-05


In [17]:
# (rows, columns) of the training features after the correlation filter
X_train.shape

(1195, 44)

### 2. Variance Threshold

In [18]:
# Fit the selector on the training split only.
# threshold=0.01 is an absolute variance cutoff, not a percentage: features whose
# training-set variance does not exceed it are removed. No scaling step appears
# before this cell, so the cutoff acts on each feature in its original units.
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)

# Names of the columns the selector keeps
f = X_train.columns[selector.get_support()].tolist()
print(len(f), 'features selected')

21 features selected


In [19]:
# Apply the fitted selector to both splits. transform() hands back plain arrays,
# so the kept column names (f) are re-attached when rebuilding the frames.
train_kept = selector.transform(X_train)
test_kept = selector.transform(X_test)

X_train = pd.DataFrame(train_kept, columns=f)
X_test = pd.DataFrame(test_kept, columns=f)

In [20]:
# (rows, columns) of the training features after the variance filter
X_train.shape

(1195, 21)

In [21]:
# Preview the training features that survived the variance filter
X_train.head()

Unnamed: 0,1,2,3,5,6,7,10,11,19,20,...,23,25,28,30,32,34,37,46,55,64
0,7.0,1.0,-0.077992,0.000319,-0.036379,0.058748,7.0,2.0,26.0,20.0,...,0.008464,0.016168,30.0,-0.005274,0.006649,0.013018,82.0,100.0,240.0,462.0
1,9.0,1.0,-0.001595,0.313595,0.124417,0.307404,9.0,8.0,7.0,8.0,...,0.123918,0.192525,31.0,-0.082278,0.07872,0.165879,43.0,95.0,281.0,497.0
2,10.0,1.0,-0.001734,0.036282,0.012328,0.018987,9.0,8.0,13.0,12.0,...,0.008577,0.058149,29.0,-0.019997,0.020463,0.042823,79.0,105.0,230.0,615.0
3,5.0,1.0,-0.00625,0.315815,0.084383,0.282924,13.0,8.0,11.0,4.0,...,0.067482,0.124041,20.0,-0.181978,0.057379,0.641685,82.0,65.0,332.0,453.0
4,8.0,1.0,-2.803438,0.007851,-1.256433,1.960366,12.0,4.0,10.0,6.0,...,0.039321,0.497786,44.0,-0.195744,0.202545,0.489706,69.0,106.0,220.0,600.0


# Drop duplicate rows

In [22]:
# Put the label back next to the selected features so a row only counts as a
# duplicate when both its features and its label match another row.
train2 = X_train.assign(label=y_train)
print(train2.shape)

# Remove exact duplicates and renumber the rows; the second shape shows how many were dropped
train2 = train2.drop_duplicates().reset_index(drop=True)
print(train2.shape)

(1195, 22)
(1194, 22)


In [23]:
# Same de-duplication for the test split: features plus label form the row identity
test2 = X_test.assign(label=y_test)
print(test2.shape)

# Remove exact duplicates and renumber the rows; the second shape shows how many were dropped
test2 = test2.drop_duplicates().reset_index(drop=True)
print(test2.shape)

(299, 22)
(299, 22)


In [24]:
# Split the de-duplicated frames back into features and target.
# 'label' was appended last when train2/test2 were built, so dropping it leaves
# exactly the selected feature columns.
X_train = train2.drop(columns='label').reset_index(drop=True)
y_train = train2['label'].reset_index(drop=True)

X_test = test2.drop(columns='label').reset_index(drop=True)
y_test = test2['label'].reset_index(drop=True)

# Save data

In [25]:
# Write the processed splits to CSV without the row index.
# Train is re-assembled from its feature and label pieces; test2 already holds both.
train_out = pd.concat([X_train, y_train], axis=1)
train_out.to_csv('data/Wavelet_train_data.csv', index=False)
test2.to_csv('data/Wavelet_test_data.csv', index=False)