## Imports

In [3]:
import numpy as np

np.set_printoptions(threshold=np.inf)
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)

################################################

import pandas as pd

## Flags

In [4]:
#Header interferes with names - investigate later
headerFlag = None
delimFlag = ','
namesFlag = ['att1', 'att2', 'att3', 'att4', 'OwlLabel']
# body-length, wing-length, body-width, wing-width, type.

labelColumn = -1

## Functions

In [5]:
### CATH

def scale_and_encode(df):

    # Get Features
    # if features are continuous - revisit
    feats = df.drop(df.columns[[-1]], axis=1).apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

    # Get encoded Labels
    labels = pd.get_dummies(df[df.columns[-1:]])

    # Recombine feats & labels
    combine_data = pd.concat([feats, labels], axis=1)
    
    return combine_data, len(feats.columns)


#Gets data in the correct format and adds a bias to the features
def feat_label_bias(df, label_ind):

    list_data_pre = df.values.tolist()
    list_data = [[x[0:label_ind] + [1.0] for x in list_data_pre]] + [[x[label_ind:] for x in list_data_pre]]
    
    new_df = pd.DataFrame(data=list_data).transpose()
    new_df.columns = ['feats', 'labels']

    
    return new_df

In [24]:
### ANDREW

def load_file(filename, names):
    """ Loads a csv into a dataframe"""
    
    print(filename)
    data = read_csv(filename, names=names) 
    return data


def train_test_split(data, train_frac=0.666):
    """ Takes a dataset and splits it into a training and test set with measures to ensure even distribution
    
    Keyword Arguments:
    data -- pandas dataframe of data to be split
    train_frac -- franction of the data to be used for training, the rest will be used for test
    
    N.B: The feature labes must be the last column in the dataset"""
    
    labels = data[data.columns[-1]] # get last column in dataframe which will contain labels
    num_labels = len(labels.unique())
    split_threshold = 1/num_labels
    
    train = data.sample(frac=train_frac, random_state=np.random.RandomState())
    test = data.drop(train.index)

    unique_labels = labels.unique()
  
    label_splits_train = [sum(train[train.columns[-1]] == x) for x in unique_labels]
    label_splits_test = [sum(test[test.columns[-1]] == x) for x in unique_labels]   
    
    split_vals_train = [x/len(train) for x in label_splits_train]
    split_vals_test = [x/len(test) for x in label_splits_test]
    
    is_even_split_train = [(split_val < (split_threshold + 0.1)) & (split_val > (split_threshold - 0.1))
                           for split_val in split_vals_train] # Ensure a fair split 
    is_even_split_test = [(split_val < (split_threshold + 0.1)) & (split_val > (split_threshold - 0.1))
                           for split_val in split_vals_test]
    
    if not is_even_split_train and not is_even_split_test:
        return train_test_split(data, train_frac)
    else:
        return train, test
    
    #split_val: decimal representation of how much one type makes up of the while data set
    #split_threshold: how much each of the labels should represent in the data set
    
    
def rand_train_test_split(data, train_frac=0.666):
    """ Takes a dataset and splits it into a training and test set
    
    Keyword Arguments:
    data -- pandas dataframe of data to be split
    train_frac -- franction of the data to be used for training, the rest will be used for test
    
    N.B: The feature labes must be the last column in the dataset"""
    
    labels = data[data.columns[-1]] # get last column in dataframe which will contain labels
    
    sample_size = int(len(data)*train_frac)
    
    rows = np.random.choice(data.index.values, sample_size)
    train = data.ix[rows]
    test = data.drop(train.index)
    
    return train, test


def n_random_split(n, data, splitter=rand_train_test_split):
    """ Takes a pandas dataframe and returns n amount of random splits of training and test data in a dict
    
    Keyword Arguments:
    n -- number of training/test sets to return
    data -- dataframe to split 
    splitter -- function which splits the data into train and test"""
    
    d = {}
    for i in range(n):
        train, test = splitter(data)
        d[i] = [train, test]
        
    return d
    
    

In [28]:
d = n_random_split(10, data)

for index, datasets in d.items():
    train_data = datasets[0]
    test_data = datasets[1]
    info = 'Index:{0} \n Num training samples:{1} \n Num test samples:{2}\n'.format(index, len(train_data), len(test_data))
    print(info)

Index:0 
 Num training samples:59 
 Num test samples:47

Index:1 
 Num training samples:59 
 Num test samples:47

Index:2 
 Num training samples:59 
 Num test samples:43

Index:3 
 Num training samples:59 
 Num test samples:46

Index:4 
 Num training samples:59 
 Num test samples:45

Index:5 
 Num training samples:59 
 Num test samples:43

Index:6 
 Num training samples:59 
 Num test samples:49

Index:7 
 Num training samples:59 
 Num test samples:48

Index:8 
 Num training samples:59 
 Num test samples:47

Index:9 
 Num training samples:59 
 Num test samples:44



# MAIN

In [13]:
# Load Data
in_data = pd.read_csv('C:/Users/AMCBR/MyStuff/College Notes & Work/Year 4/Machine Learning & Data Mining/Assignment 3/owls15.csv', header=headerFlag, sep=delimFlag, names=namesFlag, index_col=False)

# Make data binary for now
in_data = in_data[in_data.OwlLabel != 'BarnOwl']


# Scale the features & Encode the labels
data_scaled, label_ind = scale_and_encode(in_data)

# Get data in the right form & add bias
data = feat_label_bias(data_scaled, label_ind)

# Split Data into Training and Test Sets
train_raw, test_raw = rand_train_test_split(data, train_frac=0.666)


In [16]:
train_raw

Unnamed: 0,feats,labels
83,"[-0.18611111111111178, 0.4761437908496737, 0.4...","[0.0, 1.0]"
7,"[0.11388888888888826, -0.2885620915032676, -0....","[1.0, 0.0]"
47,"[-0.18611111111111178, 0.1232026143790854, 0.2...","[0.0, 1.0]"
18,"[-0.0861111111111117, -0.2885620915032676, -0....","[1.0, 0.0]"
45,"[-0.18611111111111178, 0.18202614379085016, 0....","[0.0, 1.0]"
47,"[-0.18611111111111178, 0.1232026143790854, 0.2...","[0.0, 1.0]"
77,"[0.013888888888888395, 0.4173202614379089, 0.4...","[0.0, 1.0]"
62,"[0.013888888888888395, 0.3290849673202619, 0.3...","[0.0, 1.0]"
47,"[-0.18611111111111178, 0.1232026143790854, 0.2...","[0.0, 1.0]"
35,"[0.2638888888888884, -0.14150326797385585, -0....","[1.0, 0.0]"


In [None]:
from random import choice 
from numpy import array, dot,random 

def train_perceptron(train_dat):

    unit_step = lambda x: 0 if x < 0 else 1 

    w = random.rand(3) 
    errors = [] 
    learning_rate = 0.2 
    iter_no = 0
    iterations = 5
    
    for row in train_dat.iterrows():
        if iter_no < iterations:
            iter_no = iter_no + 1
            print(str(row[0]) + " is the row index")
            print('Feats: ', row[1][0])
            print('Labels: ', row[1][1])
            print()
    

    return 0

In [None]:
train_perceptron(train_data)

In [None]:
C = []
A = time.time()
for ir in t.itertuples():
    C.append((ir[1], ir[2]))    
B.append(time.time()-A)







## Perceptron Example 1

In [None]:
from random import choice 
from numpy import array, dot,random 

unit_step = lambda x: 0 if x < 0 else 1 

# Extra 1s are biases
training_data = [ 
    (array([0,0,1]), 0), 
    (array([0,1,1]), 1), 
    (array([1,0,1]), 1), 
    (array([1,1,1]), 1), 
    ]
    
w = random.rand(3) 
errors = [] 
learning_rate = 0.2 
iterations = 100 



In [None]:
print('weights = ', w)
print()

#  for i in range(iterations): 
x, expected = choice(training_data) # randomly samples a data point from training_data - gives feats, lab
result = dot(w, x) # weights by inputs
error = expected - unit_step(result) #lab 1/0 - 0 if result>0, else 1
errors.append(error) 
w += learning_rate * error * x 

print('Train input = ', x, ',', expected)
print()

print('weights x feats = ', result)
print()
print('expected - unit_step(result) #lab 1/0 - 0 if result>0, else 1 = ', error)
print()
print('errors = ', errors)
print()
print('new weights = ', w)
print()

# print(errors)

# for x, _ in training_data: 
#     result = dot(x, w) 
#     print("{}: {} -> {}".format(x[:3], result, unit_step(result)))

# Andrew Code

In [None]:
def train_test_split(data, train_frac=0.666):
    
    num_labels = len(data.OwlLabel.unique())
    split_threshold = 1/num_labels
    
    train = data.sample(frac=train_frac, random_state=np.random.RandomState())
    test = data.drop(train.index)

    split_val = len(train.loc[train['OwlLabel'] == 'LongEaredOwl'])/len(train) #De-hardcode type values
    is_even_split_train =  (split_val < (split_threshold + 0.1)) & (split_val > (split_threshold - 0.1)) # Ensure a fair split of all types of owl
  
#     print('split val: {}'.format(split_val))
#     print('split threshold: {}'.format(split_threshold))
    
#     print(len(train))
#     print(train)
#     print(len(test))
#     print(test)
    
    if not is_even_split_train:
        return train_test_split(data, train_frac)
    else:
        return train, test
    
    #split_val: decimal representation of how much one type makes up of the while data set
    #split_threshold: how much each of the labels should represent in the data set