In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from collections import Counter

# Data

This data is actually related to the elastic strain engineering task: given the (modified) deformation tensor, predict, whether the tiny diamond crystal under this deformation is a direct-bandgap semiconductor, or not.

I will save you some time and process the data for you.

In [2]:
df = pd.read_csv('c_gw_direct.csv')

In [3]:
df.shape

(9766, 7)

In [4]:
df

Unnamed: 0,exx,exy,exz,eyz,eyy,ezz,is_direct
0,-0.049931,0.019480,-0.012549,0.071484,0.000141,0.036242,False
1,0.057604,0.070671,-0.010140,-0.063918,0.032494,-0.078462,False
2,-0.020174,-0.059848,0.017878,0.077713,-0.087613,-0.079651,False
3,-0.023523,-0.026120,0.045459,-0.035286,-0.000705,0.038353,False
4,-0.050925,-0.006037,-0.042681,-0.016254,-0.019372,-0.037559,False
...,...,...,...,...,...,...,...
9761,-0.096821,0.049679,-0.065829,0.060488,0.008918,0.006889,False
9762,-0.074071,0.027148,-0.056629,-0.068732,0.070384,-0.008722,False
9763,-0.061505,0.015496,0.077352,-0.037589,-0.047513,-0.045182,False
9764,-0.037760,0.037480,0.002881,-0.059958,-0.047096,0.060803,False


In [5]:
df.is_direct.sum()

1376

In [6]:
# Getting arrays from table
X = df[['exx', 'exy', 'exz', 'eyz', 'eyy', 'ezz']].values
y = df['is_direct'].ravel()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=124)
Counter(y_train)

Counter({False: 6723, True: 1089})

# (3 pts) Train and evaluate a simple neural network classifier

- Train a model
- Calculate its precision, recall, f1-score
- Take a look at the confusion matrix

### Train a model

In [25]:
# your code starts here...
model = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

### Look at the metrics

In [27]:
predictions = model.predict(X_test)

precision = precision_score(y_test, predictions)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions)
print(f'Recall: {recall}')

Precision: 0.8384879725085911
Recall: 0.8501742160278746


### Print the confusion matrix

In [28]:
confusion_matrix(y_test, predictions)

array([[1620,   47],
       [  43,  244]])

# (3 pts) Use an undersampling technique to balance classes
- Balance training sample
- Train new model with the same hyperparameters as before
- Evaluate its metrics. Is is better than the original model?

In [33]:
idx_for_larger_class = np.nonzero(y_train == False)[0]
idx_for_smaller_class = np.nonzero(y_train == True)[0]

less_idx_for_larger_class = np.random.choice(idx_for_larger_class,
                                            len(idx_for_smaller_class),
                                            replace=False)

new_idx = list(less_idx_for_larger_class) + list(idx_for_smaller_class)

X_train_new = X_train[new_idx, :]
y_train_new = y_train[new_idx]

model_balanced = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced.fit(X_train_new, y_train_new)


MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [34]:
predictions_balanced = model_balanced.predict(X_test)

precision = precision_score(y_test, predictions_balanced)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced)
print(f'Recall: {recall}')

Precision: 0.5227272727272727
Recall: 0.9616724738675958


In [35]:
confusion_matrix(y_test, predictions_balanced)

array([[1415,  252],
       [  11,  276]])

## Result:
The random-balanced classes doesn't give us better model.

# (3 pts) Try the imblearn package and its undersampling methods
https://imbalanced-learn.org/stable/under_sampling.html
- Try NearMiss, NeighbourhoodCleaningRule, and EditedNearestNeighbours methods
- Do they perform better than the random undersampling?

### You may need to reinstall sklearn if it is old

In [None]:
# !pip uninstall -v scikit-learn -y

In [None]:
# !pip install -v scikit-learn

In [None]:
# !pip install imblearn

In [36]:
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours, NeighbourhoodCleaningRule

In [37]:
nm1 = NearMiss(version=1)
X_resampled_nm1, y_resampled = nm1.fit_resample(X_train, y_train)

In [38]:
model_balanced_nearmiss = MLPClassifier(hidden_layer_sizes=(50, 50, ), max_iter=1000)
model_balanced_nearmiss.fit(X_train_new, y_train_new)

MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000)

In [39]:
predictions_balanced_nearmiss = model_balanced_nearmiss.predict(X_test)

precision = precision_score(y_test, predictions_balanced_nearmiss)
print(f'Precision: {precision}')

recall = recall_score(y_test, predictions_balanced_nearmiss)
print(f'Recall: {recall}')

Precision: 0.28398058252427183
Recall: 0.8153310104529616


# (3 pts) Perform hyperparameter tuning
- Run a cycle through some hyperparater settings in order to find the best ones
- E.g. hidden layer sizes, alpha regularization ...
- You need to be able to outperform your initial model
- I do not care whether you use balanced or imbalanced training set

In [None]:
# (some cycle through archs)

# Your final mark is $max(mark, 10)$