# CM4107 Advanced Artificial Intelligence
## Coursework Part 1 - Dataset Setup
## Alistair Quinn 1701183

### Imports 

In [23]:
%matplotlib inline
import numpy as np
import scipy.special 
import matplotlib.pyplot as plt
import pandas as pd
import random
import operator
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
seed = 100

### Load Datasets 

In [24]:
wine_dataset = pd.read_csv('datasets/winequality-white.csv',sep=';')
breast_cancer_dataset = pd.read_csv('datasets/breast_cancer_data.csv')

### Util Functions 

In [25]:
#https://www.kaggle.com/thebrownviking20/intro-to-keras-with-breast-cancer-data-ann
#Originaly tried a min/max normalize method but produced strange results 
def standardize(dataset):
    values = dataset.values
    scaler = preprocessing.StandardScaler()
    values_scaled = scaler.fit_transform(values)
    return pd.DataFrame(values_scaled,columns=dataset.columns)

### Pre Proccessing 

#### Wine Dataset 

In [26]:
#Shape
wine_dataset.shape

(4898, 12)

In [27]:
#Columns
wine_dataset.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [28]:
#Null Values
wine_dataset.isnull().all()

fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [29]:
#Head
wine_dataset.head

<bound method NDFrame.head of       fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.0             0.270         0.36           20.70      0.045   
1               6.3             0.300         0.34            1.60      0.049   
2               8.1             0.280         0.40            6.90      0.050   
3               7.2             0.230         0.32            8.50      0.058   
4               7.2             0.230         0.32            8.50      0.058   
5               8.1             0.280         0.40            6.90      0.050   
6               6.2             0.320         0.16            7.00      0.045   
7               7.0             0.270         0.36           20.70      0.045   
8               6.3             0.300         0.34            1.60      0.049   
9               8.1             0.220         0.43            1.50      0.044   
10              8.1             0.270         0.41            1.45      0.033  

To make this dataset easier to use with ANN class going to move quality to the front of the dataset

In [30]:
#Move quality to start
cols = wine_dataset.columns.tolist()
cols.insert(0,cols.pop(cols.index('quality')))
wine_dataset = wine_dataset.reindex(columns=cols)
print(wine_dataset.columns)

Index(['quality', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')


#### Breast Cancer Dataset 

In [31]:
#Shape
breast_cancer_dataset.shape

(569, 33)

In [32]:
#Columns
breast_cancer_dataset.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [33]:
#Null Values
breast_cancer_dataset.isnull().all()

id                         False
diagnosis                  False
radius_mean                False
texture_mean               False
perimeter_mean             False
area_mean                  False
smoothness_mean            False
compactness_mean           False
concavity_mean             False
concave points_mean        False
symmetry_mean              False
fractal_dimension_mean     False
radius_se                  False
texture_se                 False
perimeter_se               False
area_se                    False
smoothness_se              False
compactness_se             False
concavity_se               False
concave points_se          False
symmetry_se                False
fractal_dimension_se       False
radius_worst               False
texture_worst              False
perimeter_worst            False
area_worst                 False
smoothness_worst           False
compactness_worst          False
concavity_worst            False
concave points_worst       False
symmetry_w

Going to remove usless ID column, and the empty Unnamed column. Unnamed column is likely a csv reading issue

In [34]:
#Breast Cancer Dataset
#Remove useless ID column 
breast_cancer_dataset.drop(columns=['id'],inplace=True)
#Change diagnosis column to integer instead of string
breast_cancer_dataset.diagnosis.replace(['M'], [1], inplace=True) #Replaces Malignant with 1 and Benign with 0
breast_cancer_dataset.diagnosis.replace(['B'],[0], inplace=True)
#Remove Unnamed Column 
breast_cancer_dataset.dropna(how='all',axis=1,inplace=True)
#normalise 
#breast_cancer_dataset = minMaxNormalize(breast_cancer_dataset)
print(breast_cancer_dataset.head)

<bound method NDFrame.head of      diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0            1       17.990         10.38          122.80     1001.0   
1            1       20.570         17.77          132.90     1326.0   
2            1       19.690         21.25          130.00     1203.0   
3            1       11.420         20.38           77.58      386.1   
4            1       20.290         14.34          135.10     1297.0   
5            1       12.450         15.70           82.57      477.1   
6            1       18.250         19.98          119.60     1040.0   
7            1       13.710         20.83           90.20      577.9   
8            1       13.000         21.82           87.50      519.8   
9            1       12.460         24.04           83.97      475.9   
10           1       16.020         23.24          102.70      797.8   
11           1       15.780         17.89          103.60      781.0   
12           1       19.170       

### Split Datasets into X and Y 

#### Wine Dataset 

In [35]:
wine_x = wine_dataset.iloc[:,1:12]
wine_y = wine_dataset.iloc[:,0:1]

#### Breast Cancer Dataset 

In [36]:
breast_cancer_x = breast_cancer_dataset.iloc[:,1:33]
breast_cancer_y = breast_cancer_dataset.iloc[:,0:1]

### Split Datasets into Train and Test

#### Wine Dataset

In [37]:
wine_train_x, wine_test_x, wine_train_y, wine_test_y = train_test_split(wine_x,wine_y,train_size=0.5,test_size=0.5,random_state=seed)

#### Breast Cancer Dataset 

In [38]:
breast_cancer_train_x, breast_cancer_test_x, breast_cancer_train_y, breast_cancer_test_y = train_test_split(breast_cancer_x,breast_cancer_y,train_size=0.5,test_size=0.5,random_state=seed)

### Standardization  

Originally used a normalisation method (0-1) but this produced strange results with ANN so standardized datasets instead. Removes mean and scales to unit variance. 

#### Wine Dataset

In [39]:
wine_train_x = standardize(wine_train_x)
wine_test_x = standardize(wine_test_x)

#### Breast Cancer Dataset 

In [40]:
breast_cancer_train_x = standardize(breast_cancer_train_x)
breast_cancer_test_x = standardize(breast_cancer_test_x)

### Recombine X and Y Datasets 

#### Wine Dataset 

In [41]:
#Train
#Reset Index 
wine_train_x.reset_index(drop=True,inplace=True)
wine_train_y.reset_index(drop=True,inplace=True)
wine_train = pd.concat([wine_train_y,wine_train_x],axis=1)
#Test
wine_test_x.reset_index(drop=True,inplace=True)
wine_test_y.reset_index(drop=True,inplace=True)
wine_test = pd.concat([wine_test_y,wine_test_x],axis=1)
#Combined
wine_dataset = pd.concat([wine_train,wine_test])

#### Breast Cancer Dataset 

In [42]:
#Train
breast_cancer_train_x.reset_index(drop=True,inplace=True)
breast_cancer_train_y.reset_index(drop=True,inplace=True)
breast_cancer_train = pd.concat([breast_cancer_train_y,breast_cancer_train_x],axis=1,sort=False)
#Test
breast_cancer_test_x.reset_index(drop=True,inplace=True)
breast_cancer_test_y.reset_index(drop=True,inplace=True)
breast_cancer_test = pd.concat([breast_cancer_test_y,breast_cancer_test_x],axis=1,sort=False)
#Combined
breast_cancer_dataset = pd.concat([breast_cancer_train,breast_cancer_test])

### Save Datasets as CSV

#### Wine Dataset 

In [43]:
#Train
wine_train.to_csv('datasets/wine/wine_train.csv',index=False)
#Test
wine_test.to_csv('datasets/wine/wine_test.csv',index=False)
#Combined
wine_dataset.to_csv('datasets/wine/wine.csv',index=False)

#### Breast Cancer Dataset 

In [44]:
#Train
breast_cancer_train.to_csv('datasets/breast_cancer/breast_cancer_train.csv',index=False)
#Test
breast_cancer_test.to_csv('datasets/breast_cancer/breast_cancer_test.csv',index=False)
#Combined
breast_cancer_dataset.to_csv('datasets/breast_cancer/breast_cancer.csv',index=False)