In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from collections import Counter

# Data

This data is actually related to the elastic strain engineering task: given the (modified) deformation tensor, predict, whether the tiny diamond crystal under this deformation is a direct-bandgap semiconductor, or not.

I will save you some time and process the data for you.

In [3]:
df = pd.read_csv('c_gw_direct.csv')

In [4]:
df.shape

(9766, 7)

In [5]:
df

Unnamed: 0,exx,exy,exz,eyz,eyy,ezz,is_direct
0,-0.049931,0.019480,-0.012549,0.071484,0.000141,0.036242,False
1,0.057604,0.070671,-0.010140,-0.063918,0.032494,-0.078462,False
2,-0.020174,-0.059848,0.017878,0.077713,-0.087613,-0.079651,False
3,-0.023523,-0.026120,0.045459,-0.035286,-0.000705,0.038353,False
4,-0.050925,-0.006037,-0.042681,-0.016254,-0.019372,-0.037559,False
...,...,...,...,...,...,...,...
9761,-0.096821,0.049679,-0.065829,0.060488,0.008918,0.006889,False
9762,-0.074071,0.027148,-0.056629,-0.068732,0.070384,-0.008722,False
9763,-0.061505,0.015496,0.077352,-0.037589,-0.047513,-0.045182,False
9764,-0.037760,0.037480,0.002881,-0.059958,-0.047096,0.060803,False


In [6]:
df.is_direct.sum()

1376

In [7]:
# Getting arrays from table
X = df[['exx', 'exy', 'exz', 'eyz', 'eyy', 'ezz']].values
y = df['is_direct'].ravel()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=124)
Counter(y_train)

Counter({False: 6723, True: 1089})

# (3 pts) Train and evaluate a simple neural network classifier

- Train a model
- Calculate its precision, recall, f1-score
- Take a look at the confusion matrix

### Train a model

In [9]:
model = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

### Look at the metrics

In [10]:
predictions = model.predict(X_test)

precision = precision_score(y_test, predictions)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions)
print(f'Recall: {recall}')

Precision: 0.8290909090909091
Recall: 0.794425087108014


### Print the confusion matrix

In [11]:
confusion_matrix(y_test, predictions)

array([[1620,   47],
       [  59,  228]])

# (3 pts) Use an undersampling technique to balance classes
- Balance training sample
- Train new model with the same hyperparameters as before
- Evaluate its metrics. Is is better than the original model?

In [12]:
idx_for_larger_class = np.nonzero(y_train == False)[0]
idx_for_smaller_class = np.nonzero(y_train == True)[0]

less_idx_for_larger_class = np.random.choice(idx_for_larger_class,
                                            len(idx_for_smaller_class),
                                            replace=False)

new_idx = list(less_idx_for_larger_class) + list(idx_for_smaller_class)

X_train_new = X_train[new_idx, :]
y_train_new = y_train[new_idx]

model_balanced = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced.fit(X_train_new, y_train_new)


MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [13]:
predictions_balanced = model_balanced.predict(X_test)

precision = precision_score(y_test, predictions_balanced)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced)
print(f'Recall: {recall}')

Precision: 0.5261194029850746
Recall: 0.9825783972125436


In [14]:
confusion_matrix(y_test, predictions_balanced)

array([[1413,  254],
       [   5,  282]])

## Result:
The random-balanced classes doesn't give us better model.

# (3 pts) Try the imblearn package and its undersampling methods
https://imbalanced-learn.org/stable/under_sampling.html
- Try NearMiss, NeighbourhoodCleaningRule, and EditedNearestNeighbours methods
- Do they perform better than the random undersampling?

### You may need to reinstall sklearn if it is old

In [15]:
# !pip uninstall -v scikit-learn -y

In [16]:
# !pip install -v scikit-learn

In [17]:
# !pip install imblearn

In [18]:
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours, NeighbourhoodCleaningRule

### Balancing with NearMiss

In [19]:
nm1 = NearMiss(version=1)
X_resampled_nm1, y_resampled_nm1 = nm1.fit_resample(X_train, y_train)

In [20]:
model_balanced_nearmiss = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced_nearmiss.fit(X_resampled_nm1, y_resampled_nm1)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [21]:
predictions_balanced_nearmiss = model_balanced_nearmiss.predict(X_test)

precision = precision_score(y_test, predictions_balanced_nearmiss)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced_nearmiss)
print(f'Recall: {recall}')

Precision: 0.2273972602739726
Recall: 0.578397212543554


### Balancing with NeighbourhoodCleaningRule

In [22]:
ncr = NeighbourhoodCleaningRule()
X_resampled_ncr, y_resampled_ncr = ncr.fit_resample(X_train, y_train)

In [23]:
model_balanced_ncr = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced_ncr.fit(X_resampled_ncr, y_resampled_ncr)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [24]:
predictions_balanced_ncr = model_balanced_ncr.predict(X_test)

precision = precision_score(y_test, predictions_balanced_ncr)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced_ncr)
print(f'Recall: {recall}')

Precision: 0.7543352601156069
Recall: 0.9094076655052264


### Balancing with EditedNearestNeighbours

In [25]:
enn = EditedNearestNeighbours()
X_resampled_enn, y_resampled_enn = enn.fit_resample(X_train, y_train)

In [26]:
model_balanced_enn = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced_enn.fit(X_resampled_enn, y_resampled_enn)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [27]:
predictions_balanced_enn = model_balanced_enn.predict(X_test)

precision = precision_score(y_test, predictions_balanced_enn)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced_enn)
print(f'Recall: {recall}')

Precision: 0.6683291770573566
Recall: 0.9337979094076655


# (3 pts) Perform hyperparameter tuning
- Run a cycle through some hyperparater settings in order to find the best ones
- E.g. hidden layer sizes, alpha regularization ...
- You need to be able to outperform your initial model
- I do not care whether you use balanced or imbalanced training set

In [32]:
hidden_layer_sizes_list = [
    (128, 128),
    (128, 64, 64, ),
    (128, 64, ),
    (64, 64, 64, ),
    (64, 64, ),
    (64, 32, 32, ),
]

alpha_list = [
    0.00001, 0.0001, 0.001, 0.01,
]

  (64, 64, )


TypeError: 'tuple' object is not callable

In [None]:
metrics_grid = {
    'hidden_layer_sizes': [],
    'alpha': [],
    'precision': [],
    'recall': [],
}

for hidden_layer_sizes_ in hidden_layer_sizes_list:
    for alpha_ in alpha_list:
        model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_, alpha=alpha_, max_iter=1000)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        precision = precision_score(y_test, predictions)
        recall = recall_score(y_test, predictions)

        metrics_grid['hidden_layer_sizes'].append(hidden_layer_sizes_)
        metrics_grid['alpha'].append(alpha_)
        metrics_grid['precision'].append(precision)
        metrics_grid['recall'].append(recall)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
pd.DataFrame.from_dict(metrics_grid)

Unnamed: 0,hidden_layer_sizes,alpha,precision,recall
0,"(50, 50)",0.0001,0.84965,0.84669
1,"(50, 50)",0.001,0.842294,0.818815
2,"(50, 50)",0.01,0.79078,0.777003
3,"(50, 50)",0.1,0.833333,0.087108
4,"(128, 64)",0.0001,0.935897,0.763066
5,"(128, 64)",0.001,0.840678,0.864111
6,"(128, 64)",0.01,0.910256,0.74216
7,"(128, 64)",0.1,0.0,0.0
8,"(64, 32, 32)",0.0001,0.921659,0.696864
9,"(64, 32, 32)",0.001,0.789969,0.878049


# My final mark is $\min(mark, 10)$