# Data Preprocessing
This notebook preprocesses the data: it removes rows with null values, balances the classes, and drops highly correlated features. It then converts each sample into a 2-D array so the data can be fed to CNNs.

Dataset link:
https://figshare.com/articles/Android_malware_dataset_for_machine_learning_2/5854653/1

In [None]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
# 1 = malware
# 0 benign
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Load the TensorBoard notebook extension
#%load_ext tensorboard

In [None]:
# Read the semicolon-separated dataset into the working frame `df`.
df = pd.read_csv(
    r"dataset\cleaned-data.csv",
    sep=";",
)

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

In [None]:
# Report the shape and per-column null counts, drop every row that contains
# at least one null, then report both again to confirm the cleanup.
print("Check for null values...")
print(df.shape, df.isna().sum().tolist(), sep="\n")

print("drop null values...")
df = df.dropna(axis=0, how="any")
print(df.shape, df.isna().sum().tolist(), sep="\n")

In [None]:
# Number of rows per value of the `class` column, to check the class balance.
df['class'].value_counts()

In [None]:
# Balance the classes with scikit-learn's `resample`: each class is sampled
# WITH replacement to the same fixed number of rows.
from sklearn.utils import resample

data = df

# Identical sampling settings for both classes.
sampling = dict(
    replace=True,      # sample with replacement
    n_samples=5555,    # rows drawn per class
    random_state=123,  # reproducible results
)
df_class1 = resample(data.loc[data['class'] == 1], **sampling)
df_class0 = resample(data[data['class'] == 0], **sampling)

# Stack the two equally sized samples (class 1 first, then class 0).
data = pd.concat([df_class1, df_class0])

# Confirm the class balance, then show the shape and a preview of the result.
print(data['class'].value_counts())
print("\n")
print(data.shape)
print(data.head(5))


In [None]:
# Split the target off the feature table: keep `class` as `labels` and remove
# it from `data` in place.
# NOTE: re-running this cell fails with a KeyError because `class` is gone.
labels = data['class']
data.drop(columns=['class'], inplace=True)
labels.value_counts()

In [None]:
# (rows, feature columns) of the feature table.
data.shape

In [None]:
# Feature-correlation heatmaps: first for every column of `data`, then for the
# column slice 1:50. Both figures use identical styling.
for frame in (data, data.iloc[:, 1:50]):
    corr = frame.corr()
    plt.figure(figsize=(16, 16))

    ax = sns.heatmap(
        corr,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    # Tilt the x tick labels so they stay readable.
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
    )

In [None]:
# Drop one feature out of every highly correlated pair.

# Absolute pairwise correlations between all feature columns.
corr_matrix = data.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair is examined once. The builtin `bool` replaces the `np.bool` alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when its absolute correlation with any earlier column
# exceeds this threshold.
corr_threshold = 0.545
to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]
new_data = data.drop(columns=to_drop)
print("Data shape = ")
print(data.shape)
print("New data shape = ")
print(new_data.shape)

In [None]:
# Continue with the decorrelated feature set.
data = new_data

# Side lengths of the 2-D grid each sample row is reshaped into further down;
# wdth * hight has to equal the number of feature columns for that reshape.
wdth = hight = 12

In [None]:
print("After dropping correlated features ")

# Same two heatmaps as before the drop: all columns of `data`, then the
# column slice 1:50.
for frame in (data, data.iloc[:, 1:50]):
    corr = frame.corr()
    plt.figure(figsize=(16, 16))

    ax = sns.heatmap(
        corr,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    # Tilt the x tick labels so they stay readable.
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
    )

In [None]:
# Print the value distribution of every remaining feature column.
cols = data.columns
for col in cols:
    col_counts = data[col].value_counts()
    print(col_counts)

In [None]:
# Persist the feature table with joblib. Despite the `.csv` extension the
# file is written by `joblib.dump`, so it must be read back with `joblib.load`.
filepath = r"dataset\train_data_straight_rows.csv"
joblib.dump(value=data, filename=filepath)

# To reload the feature table later:
#data = joblib.load(filepath)

In [None]:
# Persist the labels with joblib. Despite the `.csv` extension the file is
# written by `joblib.dump`, so it must be read back with `joblib.load`.
filepath = r"dataset\labels.csv"
joblib.dump(value=labels, filename=filepath)

# To reload the labels later:
#labels = joblib.load(filepath)

In [None]:
# Turn every sample row into a (wdth, hight) matrix.
# A single vectorized reshape replaces the row-by-row `iterrows` loop.
# The explicit width check keeps the original failure mode: a ValueError when
# a row cannot be laid out as wdth x hight.
if data.shape[1] != wdth * hight:
    raise ValueError(
        "cannot reshape rows of %d features into shape (%d, %d)"
        % (data.shape[1], wdth, hight)
    )

# Using len(data) rather than -1 keeps the reshape valid for an empty frame.
train_data = list(data.to_numpy().reshape(len(data), wdth, hight))

len(train_data)

In [None]:
# Preview only the first few matrices; displaying the whole list would print
# every sample into the notebook output.
train_data[:3]

In [None]:
# Assemble the CNN inputs: `y` holds the labels and `X` stacks the per-sample
# matrices into one array with a trailing axis of length 1.
y = labels
X = list(train_data)
X = np.array(X).reshape(-1, wdth, hight, 1)


In [None]:
# Copy of X without the trailing length-1 axis, i.e. shape (-1, wdth, hight).
X_siamese = np.reshape(np.array(X), (-1, wdth, hight))
X_siamese.shape


In [None]:
# Save the (-1, wdth, hight) array and its labels with joblib for later use.
joblib.dump(X_siamese, filename=r"dataset\X_siamese-11110-12-12-1")
joblib.dump(labels, filename=r"dataset\labels-11110")



## Building the CNN inputs

In [None]:
# Per-sample shape of X (leading axis dropped), then the shape of y.
print(X.shape[1:], y.shape, sep="\n")

In [None]:
# Save the CNN inputs (X) and targets (y) with joblib.
joblib.dump(X, filename=r"dataset\X_for_cnn")
joblib.dump(y, filename=r"dataset\y")
