### **Classification using sklearn and keras (with pandas)**

<font color="red">File access required:</font> In Colab this notebook requires first uploading files **Cities.csv**, **Players.csv**, and **Titanic.csv** using the *Files* feature in the left toolbar. If running the notebook on a local computer, simply ensure these files are in the same workspace as the notebook.

In [75]:
# Set-up
import csv
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras import Sequential
from keras.layers import Dense
from numpy.random import seed
import tensorflow

### Prepare Cities data for classification
Predict <i>temperature category</i> from other features

In [52]:
# Read Cities.csv into dataframe, add column for temperature category
# Note: For a dataframe D and integer i, D.loc[i] is the i-th row of D
f = open('Cities.csv')
cities = pd.read_csv(f)
categories = []
for i in range(len(cities)):
    if cities.loc[i]['temperature'] < 5:
        categories.append('cold')
    elif cities.loc[i]['temperature'] < 9:
        categories.append('cool')
    elif cities.loc[i]['temperature'] < 15:
        categories.append('warm')
    else: categories.append('hot')
cities['category'] = categories
print("cold:", len(cities[(cities.category == 'cold')]))
print("cool:", len(cities[(cities.category == 'cool')]))
print("warm:", len(cities[(cities.category == 'warm')]))
print("hot:", len(cities[(cities.category == 'hot')]))

cold: 17
cool: 92
warm: 79
hot: 25


In [53]:
# Create training and test sets for cities data
numitems = len(cities)
percenttrain = 0.85
numtrain = int(numitems*percenttrain)

print('Training set', numtrain, 'items')
print('Test set', numitems - numtrain, 'items')

citiesTrain = cities[0:numtrain]
citiesTest = cities[numtrain:]

Training set 181 items
Test set 32 items


### K-nearest-neighbors classification

In [54]:
features = ['longitude', 'latitude']
neighbors = 3
predict = 'category'

classifier = KNeighborsClassifier(neighbors)
classifier.fit(citiesTrain[features], citiesTrain[predict])


predictions = classifier.predict(citiesTest[features])

# Calculate accuracy
actuals = list(citiesTest[predict])
correct = 0

for i in range(len(actuals)):
  print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))
# Comment out print, try different values for neighbors, different features

Predicted: warm  Actual: cool
Predicted: warm  Actual: warm
Predicted: hot  Actual: warm
Predicted: warm  Actual: warm
Predicted: cold  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cold
Predicted: cold  Actual: cold
Predicted: cool  Actual: warm
Predicted: cool  Actual: cold
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: hot  Actual: hot
Predicted: cold  Actual: cold
Predicted: cold  Actual: cold
Predicted: cool  Actual: cold
Predicted: hot  Actual: hot
Predicted: warm  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Accuracy: 0.6875


### <font color="green">**Your Turn: K-nearest-neighbors on World Cup data**</font>
<font color="green">Predict <i>position</i> from one or more of <i>minutes, shots, passes, tackles, saves</i></font>

In [55]:
# This cell does all the set-up, including reordering the data to avoid team bias.
f = open('Players.csv')
players = pd.read_csv(f)
players = players.sort_values(by='surname')
players = players.reset_index(drop=True)
numitems = len(players)
percenttrain = 0.92
numtrain = int(numitems*percenttrain)
print('Training set', numtrain, 'items')
print('Test set', numitems - numtrain, 'items')
playersTrain = players[0:numtrain]
playersTest = players[numtrain:]

Training set 547 items
Test set 48 items


In [56]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
neighbors = 8
predict = 'position'
classifier = KNeighborsClassifier(neighbors)
classifier.fit(playersTrain[features], playersTrain[predict])
predictions = classifier.predict(playersTest[features])
# Calculate accuracy
actuals = list(playersTest[predict])
correct = 0
for i in range(len(actuals)):
  print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))
# Comment out print, try different values for neighbors, different features

Predicted: defender  Actual: midfielder
Predicted: goalkeeper  Actual: goalkeeper
Predicted: defender  Actual: midfielder
Predicted: defender  Actual: defender
Predicted: defender  Actual: forward
Predicted: midfielder  Actual: midfielder
Predicted: midfielder  Actual: midfielder
Predicted: midfielder  Actual: midfielder
Predicted: defender  Actual: defender
Predicted: midfielder  Actual: defender
Predicted: midfielder  Actual: forward
Predicted: defender  Actual: defender
Predicted: midfielder  Actual: defender
Predicted: midfielder  Actual: defender
Predicted: forward  Actual: forward
Predicted: goalkeeper  Actual: goalkeeper
Predicted: defender  Actual: defender
Predicted: midfielder  Actual: forward
Predicted: forward  Actual: defender
Predicted: midfielder  Actual: forward
Predicted: forward  Actual: forward
Predicted: defender  Actual: midfielder
Predicted: forward  Actual: forward
Predicted: defender  Actual: midfielder
Predicted: defender  Actual: midfielder
Predicted: forward 

### <font color="green">**Your Turn Extra: K-nearest-neighbors on Titanic data - Graded**</font>
<font color="green">Predict <i>survived</i> from one or more of <i>gender, age, class, fare, embarked</i></font>

In [57]:
# This cell does all the set-up
f = open('Titanic.csv')
titanic = pd.read_csv(f)
# Convert gender and embarked to numeric values and missing ages to average age
titanic['gender'].replace({'M':0, 'F':1}, inplace=True)
titanic['embarked'].replace({'Cherbourg':0, 'Southampton':1, 'Queenstown':2}, inplace=True)
avg_age = np.average(titanic['age'].dropna().tolist())
titanic['age'].fillna(avg_age, inplace=True)
# Create training and test sets
numitems = len(titanic)
percenttrain = 0.92
numtrain = int(numitems*percenttrain)
print('Training set', numtrain, 'items')
print('Test set', numitems - numtrain, 'items')
titanicTrain = titanic[0:numtrain]
titanicTest = titanic[numtrain:]

Training set 819 items
Test set 72 items


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['gender'].replace({'M':0, 'F':1}, inplace=True)
  titanic['gender'].replace({'M':0, 'F':1}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['embarked'].replace({'Cherbourg':0, 'Southampton':1, 'Queenstown':2}, inplace=True)
  titanic['embarked'].repl

In [58]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['gender', 'age', 'class']
neighbors = 8
predict = 'survived'
classifier = KNeighborsClassifier(neighbors)
classifier.fit(titanicTrain[features], titanicTrain[predict])
predictions = classifier.predict(titanicTest[features])
# Calculate accuracy
actuals = list(titanicTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.80556


### Decision tree classification

In [59]:
features = ['longitude','latitude']
split = 2
predict = 'category'

# random forest
for x in range(1, 10):
  dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # split parameter is optional
  dt.fit(citiesTrain[features], citiesTrain[predict])

  predictions = dt.predict(citiesTest[features])
  # print(x ....)

# aggregated predicted output



# Calculate accuracy
actuals = list(citiesTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))
# Try different values for split, different features

Accuracy: 0.65625


### "Forest" of decision trees

In [60]:
features = ['longitude', 'latitude']
split = 10
trees = 10
predict = 'category'

rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(citiesTrain[features], citiesTrain[predict])


predictions = rf.predict(citiesTest[features])
# Calculate accuracy
actuals = list(citiesTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))
# Try different values for split and trees, different features

Accuracy: 0.78125


### <font color="green">**Your Turn: Decision tree and forest of trees on World Cup data - Graded**</font>

In [61]:
# SINGLE TREE
# Try different features and different values for split.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split = 7
predict = 'position'
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split, max_depth=9) # parameter is optional
dt.fit(playersTrain[features], playersTrain[predict])
predictions = dt.predict(playersTest[features])
# Calculate accuracy
actuals = list(playersTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.70833


In [62]:
# FOREST OF TREES
# Try different features and different values for split and trees.
# What's the highest accuracy you can get?
from sklearn.ensemble import RandomForestClassifier
from itertools import combinations
all_features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split_values = [2, 5, 10, 20]
tree_values = [5, 10, 20, 50, 100]

predict = 'position'

best_accuracy = 0
best_config = {}

for r in range(2, len(all_features)+1):
    for features in combinations(all_features, r):
        for split in split_values:
            for trees in tree_values:
                rf = RandomForestClassifier(
                    random_state=0,
                    min_samples_split=split,
                    n_estimators=trees
                )
                # Train model
                rf.fit(playersTrain[list(features)], playersTrain[predict])
                predictions = rf.predict(playersTest[list(features)])

                # Calculate accuracy
                actuals = list(playersTest[predict])
                correct = sum(pred == actual for pred, actual in zip(predictions, actuals))
                accuracy = correct / len(actuals)

                # Check if best
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_config = {
                        'features': features,
                        'split': split,
                        'trees': trees
                    }

print("Best Accuracy:", round(best_accuracy, 5))
print("Best Configuration:", best_config)


Best Accuracy: 0.79167
Best Configuration: {'features': ('minutes', 'shots', 'passes', 'saves'), 'split': 20, 'trees': 20}


### <font color="green">**Your Turn Extra: Decision tree and forest of trees on Titanic data - Graded**</font>

In [63]:
# SINGLE TREE
# Try different features and different values for split.
# What's the highest accuracy you can get?
features = ['gender', 'age', 'class', 'fare', 'embarked']
split = 10
predict = 'survived'
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # parameter is optional
dt.fit(titanicTrain[features], titanicTrain[predict])
predictions = dt.predict(titanicTest[features])
# Calculate accuracy
actuals = list(titanicTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.80556


In [64]:
feature_combinations = [
    ['gender', 'class'],
    ['gender', 'age', 'class'],
    ['gender', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare', 'embarked'],
]

split_values = [2, 5, 10, 15, 20, 25, 30, 40, 50]

best_accuracy = 0
best_features = []
best_split = 0

for features in feature_combinations:
    for split in split_values:
        dt = DecisionTreeClassifier(random_state=0, min_samples_split=split)
        dt.fit(titanicTrain[features], titanicTrain['survived'])
        predictions = dt.predict(titanicTest[features])

        # Calculate accuracy
        accuracy = sum(p == a for p, a in zip(predictions, titanicTest['survived'])) / len(titanicTest)

        # Update best configuration
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = features
            best_split = split

print('Best Features:', best_features)
print('Best Split:', best_split)
print('Best Accuracy:', round(best_accuracy, 5))

Best Features: ['gender', 'class']
Best Split: 2
Best Accuracy: 0.81944


In [65]:
# FOREST OF TREES
# Try different features and different values for split and trees.
# What's the highest accuracy you can get?
features = ['gender', 'age', 'class', 'fare', 'embarked']
split = 10
trees = 10
predict = 'survived'
rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(titanicTrain[features], titanicTrain[predict])
predictions = rf.predict(titanicTest[features])
# Calculate accuracy
actuals = list(titanicTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.79167


In [66]:
feature_combinations = [
    ['gender', 'class'],
    ['gender', 'age', 'class'],
    ['gender', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare', 'embarked'],
]

split_values = [2, 5, 10, 15, 20]
tree_values = [10, 20, 30, 50, 100, 150]

best_accuracy = 0
best_features = []
best_split = 0
best_trees = 0

for features in feature_combinations:
    for split in split_values:
        for trees in tree_values:
            rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
            rf.fit(titanicTrain[features], titanicTrain['survived'])
            predictions = rf.predict(titanicTest[features])

            # Calculate accuracy
            accuracy = sum(p == a for p, a in zip(predictions, titanicTest['survived'])) / len(titanicTest)

            # Update best configuration if accuracy improves
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_split = split
                best_trees = trees

print('Best Features:', best_features)
print('Best Split:', best_split)
print('Best Trees:', best_trees)
print('Best Accuracy:', round(best_accuracy, 5))

Best Features: ['gender', 'age', 'class', 'fare']
Best Split: 5
Best Trees: 10
Best Accuracy: 0.83333


### Naive Bayes classification

In [67]:
features = ['longitude', 'latitude']
predict = 'category'

nb = GaussianNB()
nb.fit(citiesTrain[features], citiesTrain[predict])

predictions = nb.predict(citiesTest[features])

# Calculate accuracy
actuals = list(citiesTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))
# Try different features

Accuracy: 0.78125


### <font color="green">**Your Turn: Naive Bayes on World Cup data**</font>

In [68]:
# Try different features. What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
predict = 'position'
nb = GaussianNB()
nb.fit(playersTrain[features], playersTrain[predict])
predictions = nb.predict(playersTest[features])
# Calculate accuracy
actuals = list(playersTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.6875


In [69]:
feature_combinations = [
    ['minutes'],
    ['shots'],
    ['minutes', 'shots'],
    ['minutes', 'shots', 'passes'],
    ['shots', 'passes', 'tackles'],
    ['minutes', 'shots', 'passes', 'tackles', 'saves'],
    ['shots', 'passes', 'tackles', 'saves'],
    ['minutes', 'tackles', 'saves'],
]

best_accuracy = 0
best_features = []

for features in feature_combinations:
    nb = GaussianNB()
    nb.fit(playersTrain[features], playersTrain['position'])
    predictions = nb.predict(playersTest[features])

    # Calculate accuracy
    accuracy = sum(p == a for p, a in zip(predictions, playersTest['position'])) / len(playersTest)

    # Update best
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_features = features

print('Best Features:', best_features)
print('Best Accuracy:', round(best_accuracy, 5))

Best Features: ['shots', 'passes', 'tackles', 'saves']
Best Accuracy: 0.75


### <font color="green">**Your Turn Extra: Naive Bayes on Titanic data - Graded**</font>

In [70]:
# Try different features. What's the highest accuracy you can get?
features = ['gender', 'age', 'class', 'fare', 'embarked']
predict = 'survived'
nb = GaussianNB()
nb.fit(titanicTrain[features], titanicTrain[predict])
predictions = nb.predict(titanicTest[features])
# Calculate accuracy
actuals = list(titanicTest[predict])
correct = 0
for i in range(len(actuals)):
# print('Predicted:', predictions[i], ' Actual:', actuals[i])
  if predictions[i] == actuals[i]: correct +=1
print('Accuracy:', round(correct/len(actuals),5))

Accuracy: 0.76389


In [71]:
feature_combinations = [
    ['gender'],
    ['gender', 'class'],
    ['gender', 'age'],
    ['gender', 'age', 'class'],
    ['gender', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare', 'embarked'],
    ['gender', 'class', 'embarked'],
    ['age', 'class', 'fare'],
]

best_accuracy = 0
best_features = []

for features in feature_combinations:
    nb = GaussianNB()
    nb.fit(titanicTrain[features], titanicTrain['survived'])
    predictions = nb.predict(titanicTest[features])

    # Calculate accuracy
    accuracy = sum(p == a for p, a in zip(predictions, titanicTest['survived'])) / len(titanicTest)

    # Update best if accuracy improves
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_features = features

print('Best Features:', best_features)
print('Best Accuracy:', round(best_accuracy, 5))

Best Features: ['gender']
Best Accuracy: 0.81944


### Neural network classification

In [72]:
features = ['longitude', 'latitude']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'yes'
predict = 'category'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(citiesTrain[features])
featurevals_test = sc.fit_transform(citiesTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(cities[predict])
labels_train = encoder.transform(citiesTrain[predict])
labels_test = encoder.transform(citiesTest[predict])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))

# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))


# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))


classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])

# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Number of epochs:', num_epochs)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)
# Try different values for num_layers, num_epochs, batch size, layer_outputs, and different features

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


10/10 - 4s - 408ms/step - accuracy: 0.2597 - loss: 1.4137
Epoch 2/10
10/10 - 0s - 36ms/step - accuracy: 0.4751 - loss: 1.3139
Epoch 3/10
10/10 - 0s - 18ms/step - accuracy: 0.4530 - loss: 1.2474
Epoch 4/10
10/10 - 0s - 25ms/step - accuracy: 0.4972 - loss: 1.1879
Epoch 5/10
10/10 - 0s - 15ms/step - accuracy: 0.5359 - loss: 1.1332
Epoch 6/10
10/10 - 0s - 19ms/step - accuracy: 0.6188 - loss: 1.0829
Epoch 7/10
10/10 - 0s - 33ms/step - accuracy: 0.6519 - loss: 1.0335
Epoch 8/10
10/10 - 0s - 46ms/step - accuracy: 0.6630 - loss: 0.9839
Epoch 9/10
10/10 - 0s - 10ms/step - accuracy: 0.6630 - loss: 0.9335
Epoch 10/10
10/10 - 0s - 9ms/step - accuracy: 0.6740 - loss: 0.8804
Number of epochs: 10
Final accuracy on training data: 0.6740331649780273
Accuracy on test data: 0.65625


### <font color="green">**Your Turn: Neural network on World Cup data**</font>

In [73]:
# Try different features and different values for num_layers, num_epochs,
#  batch size, and layer_outputs.
# What's the highest accuracy you can get?
# Note: Although some randomness is removed by setting seeds in the code,
#  you may still see somewhat different accuracy on different runs;
#  changing the order of the features can also affect accuracy
features = ['minutes', 'shots', 'tackles', 'saves']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'no'
predict = 'position'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(playersTrain[features])
featurevals_test = sc.fit_transform(playersTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(players[predict])
labels_train = encoder.transform(playersTrain[predict])
labels_test = encoder.transform(playersTest[predict])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))
# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])
# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Number of epochs:', num_epochs)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Number of epochs: 10
Final accuracy on training data: 0.6234003901481628
Accuracy on test data: 0.6458333134651184


In [76]:
feature_combinations = [
    ['minutes', 'shots', 'passes', 'tackles', 'saves'],
    ['shots', 'passes', 'tackles', 'saves'],
    ['minutes', 'shots', 'tackles', 'saves'],
    ['minutes', 'shots', 'passes'],
]

layer_configs = [3, 4, 5]
epoch_configs = [10, 20, 30]
batch_configs = [10, 20, 30]
output_configs = [16, 32, 64]

best_accuracy = 0
best_params = {}

# Encode labels once
encoder = LabelEncoder()
encoder.fit(players['position'])
labels_train_all = encoder.transform(playersTrain['position'])
labels_test_all = encoder.transform(playersTest['position'])

# Hyperparameter search
for features in feature_combinations:
    # Scale features
    sc = StandardScaler()
    X_train = sc.fit_transform(playersTrain[features])
    X_test = sc.transform(playersTest[features])

    for num_layers in layer_configs:
        for num_epochs in epoch_configs:
            for batchsize in batch_configs:
                for layer_outputs in output_configs:
                    # Set seeds for reproducibility
                    seed(1)
                    tf.random.set_seed(1)

                    # Build model
                    classifier = Sequential()
                    classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
                    for _ in range(num_layers - 2):
                        classifier.add(Dense(layer_outputs, activation='relu'))
                    classifier.add(Dense(4, activation='softmax'))
                    classifier.compile(optimizer='adam',
                                       loss='sparse_categorical_crossentropy',
                                       metrics=['accuracy'])

                    # Train silently
                    classifier.fit(X_train, labels_train_all,
                                   batch_size=batchsize,
                                   epochs=num_epochs,
                                   verbose=0)

                    # Evaluate
                    test_acc = classifier.evaluate(X_test, labels_test_all, verbose=0)[1]

                    if test_acc > best_accuracy:
                        best_accuracy = test_acc
                        best_params = {
                            'features': features,
                            'layers': num_layers,
                            'epochs': num_epochs,
                            'batch': batchsize,
                            'outputs': layer_outputs
                        }

# Train best model and show results
features = best_params['features']
num_layers = best_params['layers']
num_epochs = best_params['epochs']
batchsize = best_params['batch']
layer_outputs = best_params['outputs']

sc = StandardScaler()
X_train = sc.fit_transform(playersTrain[features])
X_test = sc.transform(playersTest[features])

labels_train = encoder.transform(playersTrain['position'])
labels_test = encoder.transform(playersTest['position'])

seed(1)
tf.random.set_seed(1)

classifier = Sequential()
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
for _ in range(num_layers - 2):
    classifier.add(Dense(layer_outputs, activation='relu'))
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

# Train and trace epochs if needed
epoch_tracing = False
verbose = 2 if epoch_tracing else 0
hist = classifier.fit(X_train, labels_train,
                      batch_size=batchsize,
                      epochs=num_epochs,
                      verbose=verbose)

print('Best Hyperparameters:', best_params)
print('Number of epochs:', num_epochs)
print('Final accuracy on training data:', hist.history['accuracy'][-1])

test_acc = classifier.evaluate(X_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best Hyperparameters: {'features': ['minutes', 'shots', 'passes', 'tackles', 'saves'], 'layers': 5, 'epochs': 30, 'batch': 30, 'outputs': 32}
Number of epochs: 30
Final accuracy on training data: 0.6892138719558716
Accuracy on test data: 0.7291666865348816


### <font color="green">**Your Turn Extra: Neural network on Titanic data**</font>

In [77]:
# Try different features and different values for num_layers, num_epochs,
#  batch size, and layer_outputs.
# What's the highest accuracy you can get?
# Note: Although some randomness is removed by setting seeds in the code,
#  you may still see somewhat different accuracy on different runs;
#  changing the order of the features can also affect accuracy
features = ['gender', 'age', 'class', 'fare', 'embarked']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'no'
predict = 'survived'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(titanicTrain[features])
featurevals_test = sc.fit_transform(titanicTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(titanic[predict])
labels_train = encoder.transform(titanicTrain[predict])
labels_test = encoder.transform(titanicTest[predict])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))
# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])
# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Number of epochs:', num_epochs)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Number of epochs: 10
Final accuracy on training data: 0.8253968358039856
Accuracy on test data: 0.8472222089767456


In [78]:
feature_combinations = [
    ['gender', 'class'],
    ['gender', 'age', 'class'],
    ['gender', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare'],
    ['gender', 'age', 'class', 'fare', 'embarked'],
]

layer_configs = [3, 4, 5]
epoch_configs = [10, 20, 30]
batch_configs = [10, 20, 30]
output_configs = [16, 32, 64]

best_accuracy = 0
best_params = {}

# Encode labels once
encoder = LabelEncoder()
encoder.fit(titanic['survived'])
labels_train_all = encoder.transform(titanicTrain['survived'])
labels_test_all = encoder.transform(titanicTest['survived'])

# Hyperparameter search
for features in feature_combinations:
    # Scale features
    sc = StandardScaler()
    X_train = sc.fit_transform(titanicTrain[features])
    X_test = sc.transform(titanicTest[features])

    for num_layers in layer_configs:
        for num_epochs in epoch_configs:
            for batchsize in batch_configs:
                for layer_outputs in output_configs:
                    # Set seeds for reproducibility
                    seed(1)
                    tf.random.set_seed(1)

                    # Build model
                    classifier = Sequential()
                    classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
                    for _ in range(num_layers - 2):
                        classifier.add(Dense(layer_outputs, activation='relu'))
                    classifier.add(Dense(2, activation='softmax'))  # Binary output
                    classifier.compile(optimizer='adam',
                                       loss='sparse_categorical_crossentropy',
                                       metrics=['accuracy'])

                    # Train silently
                    classifier.fit(X_train, labels_train_all,
                                   batch_size=batchsize,
                                   epochs=num_epochs,
                                   verbose=0)

                    # Evaluate
                    test_acc = classifier.evaluate(X_test, labels_test_all, verbose=0)[1]

                    if test_acc > best_accuracy:
                        best_accuracy = test_acc
                        best_params = {
                            'features': features,
                            'layers': num_layers,
                            'epochs': num_epochs,
                            'batch': batchsize,
                            'outputs': layer_outputs
                        }

# Train the best model and display results
features = best_params['features']
num_layers = best_params['layers']
num_epochs = best_params['epochs']
batchsize = best_params['batch']
layer_outputs = best_params['outputs']

sc = StandardScaler()
X_train = sc.fit_transform(titanicTrain[features])
X_test = sc.transform(titanicTest[features])

labels_train = encoder.transform(titanicTrain['survived'])
labels_test = encoder.transform(titanicTest['survived'])

seed(1)
tf.random.set_seed(1)

classifier = Sequential()
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
for _ in range(num_layers - 2):
    classifier.add(Dense(layer_outputs, activation='relu'))
classifier.add(Dense(2, activation='softmax'))
classifier.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

# Control verbose
epoch_tracing = False
verbose = 2 if epoch_tracing else 0
hist = classifier.fit(X_train, labels_train,
                      batch_size=batchsize,
                      epochs=num_epochs,
                      verbose=verbose)

print('Best Hyperparameters:', best_params)
print('Number of epochs:', num_epochs)
print('Final accuracy on training data:', hist.history['accuracy'][-1])

test_acc = classifier.evaluate(X_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best Hyperparameters: {'features': ['gender', 'class', 'fare'], 'layers': 3, 'epochs': 10, 'batch': 20, 'outputs': 64}
Number of epochs: 10
Final accuracy on training data: 0.8021978139877319
Accuracy on test data: 0.8472222089767456
