### Dataset Spliting
- Splits the dataset, with equal class distribution
- __All_Infogain_80.csv:__ contains 80% of the samples from the dataset
- __All_Infogain_20.csv:__ contains 20% of the samples from the dataset 

- Set to `true` if runnig in Colab

In [11]:
RUNNING_IN_COLAB = False

### Import google drive

In [12]:
if RUNNING_IN_COLAB:

    from google.colab import drive

    #Mount drive
    drive.mount('/content/drive', force_remount=True)

### Import modules

In [13]:
import pandas

from sklearn.model_selection import train_test_split

### Reading dataset CSV

In [14]:
if RUNNING_IN_COLAB:
    CSV_PATH = '/content/drive/My Drive/Colab Notebooks/DataScience/All_Infogain.csv'
else:   
    CSV_PATH = '../../dataset/FinalDataset/All_Infogain.csv'

CLASS_LABEL_COLUMN = 'class'

# Reads CSV file with data
df = pandas.read_csv(CSV_PATH, index_col=0)

#Remove values with 'NaN'
df.dropna(inplace=True)

#Resets index 
df = df.reset_index()

df.head()

Unnamed: 0,avgpathtokenlen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,CharacterContinuityRate,NumberRate_URL,NumberRate_FileName,NumberRate_AfterPath,Entropy_Domain,class
0,105.0,0.876,0.008,0.087,0.095,9.174,0.01,0.435,0.199,0.219,-1.0,0.904,phishing
1,66.0,0.874,0.009,0.091,0.096,9.136,0.01,0.682,0.257,0.293,-1.0,0.87,phishing
2,66.0,0.874,0.009,0.091,0.096,9.136,0.01,0.682,0.0,0.0,-1.0,0.87,phishing
3,66.0,0.874,0.009,0.091,0.096,9.136,0.01,0.682,0.0,0.0,-1.0,0.87,phishing
4,65.0,0.948,0.006,0.182,0.032,30.0,0.006,0.727,0.141,0.141,-1.0,1.0,benign


### Split data into X and Y

In [15]:
#X is a matrix, with all the samples feature array
X = df.iloc[:, :-1].values
print(X)

print('\n')

#Y is the class of the samples
Y = df.iloc[:, -1].values
print(Y)

[[ 1.05e+02  8.76e-01  8.00e-03 ...  2.19e-01 -1.00e+00  9.04e-01]
 [ 6.60e+01  8.74e-01  9.00e-03 ...  2.93e-01 -1.00e+00  8.70e-01]
 [ 6.60e+01  8.74e-01  9.00e-03 ...  0.00e+00 -1.00e+00  8.70e-01]
 ...
 [ 1.00e+00  9.80e-02  4.90e-02 ...  0.00e+00 -1.00e+00  7.29e-01]
 [ 1.00e+00  1.05e-01  5.30e-02 ...  0.00e+00 -1.00e+00  7.14e-01]
 [ 6.67e-01  1.25e-01  5.00e-02 ...  0.00e+00 -1.00e+00  8.61e-01]]


['phishing' 'phishing' 'phishing' ... 'phishing' 'phishing' 'phishing']


### Splits data in two with 80% and 20% of samples

In [16]:
X_80, X_20, Y_80, Y_20 = train_test_split(X ,Y, test_size=0.2, random_state=5)

print(X_80, '\n')
print(Y_80, '\n')

print(X_20, '\n')
print(Y_20, '\n')

#All class columns except 'class' last column
class_columns = df.columns.values[:-1]
print(class_columns)

[[ 5.133  0.827  0.018 ...  0.    -1.     1.   ]
 [10.25   0.625  0.028 ...  0.688 -1.     0.852]
 [ 3.467  0.778  0.101 ... -1.     0.4    0.77 ]
 ...
 [ 3.75   0.784  0.642 ...  0.295  0.326  0.801]
 [ 4.     0.442  0.154 ...  0.182  0.5    0.76 ]
 [ 2.     0.146  0.049 ...  0.    -1.     0.764]] 

['benign' 'phishing' 'malware' ... 'spam' 'spam' 'phishing'] 

[[ 2.667  0.678  0.552 ...  0.086  0.104  0.797]
 [ 3.667  0.755  0.5   ...  0.388  0.473  0.766]
 [ 4.182  0.7    0.025 ...  0.    -1.     0.827]
 ...
 [ 2.75   0.659  0.489 ...  0.226  0.279  0.87 ]
 [ 2.667  0.715  0.626 ...  0.207  0.234  0.814]
 [ 2.667  0.82   0.756 ...  0.057  0.062  0.823]] 

['Defacement' 'spam' 'Defacement' ... 'spam' 'spam' 'Defacement'] 

['avgpathtokenlen' 'pathurlRatio' 'ArgUrlRatio' 'argDomanRatio'
 'domainUrlRatio' 'pathDomainRatio' 'argPathRatio'
 'CharacterContinuityRate' 'NumberRate_URL' 'NumberRate_FileName'
 'NumberRate_AfterPath' 'Entropy_Domain']


### Create dataframe with 80%

In [17]:
df_80 = pandas.DataFrame(X_80, columns=class_columns)
df_80['class'] = Y_80
df_80.head()

Unnamed: 0,avgpathtokenlen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,CharacterContinuityRate,NumberRate_URL,NumberRate_FileName,NumberRate_AfterPath,Entropy_Domain,class
0,5.133,0.827,0.018,0.167,0.109,7.583,0.022,0.75,0.091,0.0,-1.0,1.0,benign
1,10.25,0.625,0.028,0.1,0.278,2.25,0.044,0.35,0.306,0.688,-1.0,0.852,phishing
2,3.467,0.778,0.101,0.667,0.152,5.133,0.13,0.733,0.152,-1.0,0.4,0.77,malware
3,4.857,0.52,0.026,0.067,0.39,1.333,0.05,0.9,0.0,0.0,-1.0,0.775,malware
4,3.2,0.738,0.488,2.733,0.179,4.133,0.661,0.533,0.048,0.0,0.0,0.817,malware


### Create dataframe with 20%

In [18]:
df_20 = pandas.DataFrame(X_20, columns=class_columns)
df_20['class'] = Y_20
df_20.head()

Unnamed: 0,avgpathtokenlen,pathurlRatio,ArgUrlRatio,argDomanRatio,domainUrlRatio,pathDomainRatio,argPathRatio,CharacterContinuityRate,NumberRate_URL,NumberRate_FileName,NumberRate_AfterPath,Entropy_Domain,class
0,2.667,0.678,0.552,2.286,0.241,2.81,0.814,0.667,0.058,0.086,0.104,0.797,Defacement
1,3.667,0.755,0.5,2.75,0.182,4.15,0.663,0.75,0.246,0.388,0.473,0.766,spam
2,4.182,0.7,0.025,0.118,0.213,3.294,0.036,0.824,0.0,0.0,-1.0,0.827,Defacement
3,4.667,0.803,0.513,3.75,0.137,5.875,0.638,0.25,0.205,0.348,0.4,0.77,spam
4,4.714,0.696,0.036,0.2,0.179,3.9,0.051,0.5,0.054,0.167,-1.0,0.88,spam


### Check the proportion of classes on each dataset 

In [19]:
class_counts = df['class'].value_counts()
print(class_counts, '\n')
print('Total samples: ', len(df['class']), '\n')

class_counts_80 = df_80['class'].value_counts()
print(class_counts_80, '\n')
print('Total samples: ', len(df_80['class']), '\n')

class_counts_20 = df_20['class'].value_counts()
print(class_counts_20, '\n')
print('Total samples: ', len(df_20['class']), '\n')

Defacement    7930
benign        7776
phishing      7311
malware       6707
spam          6693
Name: class, dtype: int64 

Total samples:  36417 

Defacement    6376
benign        6234
phishing      5855
malware       5369
spam          5299
Name: class, dtype: int64 

Total samples:  29133 

Defacement    1554
benign        1542
phishing      1456
spam          1394
malware       1338
Name: class, dtype: int64 

Total samples:  7284 



### Generate Files with splited datasets

In [20]:
if RUNNING_IN_COLAB:
  csv_80_path = '/content/drive/My Drive/Colab Notebooks/DataScience/All_Infogain_80.csv'
  csv_20_path = '/content/drive/My Drive/Colab Notebooks/DataScience/All_Infogain_20.csv'
else:
  csv_80_path = '../../dataset/splitted/All_Infogain_80.csv'
  csv_20_path = '../../dataset/splitted/All_Infogain_20.csv'

#index = False to avoid printing index column
df_80.to_csv(csv_80_path, sep=',', index=False)
df_20.to_csv(csv_20_path, sep=',', index=False)