In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from tqdm import tqdm

The main purpose of this notebook is to find a direction vector in latent space from a paired dataset of (dlatents, image, label) samples.

# Loading data

In [None]:
# Folder that holds the per-sample latent files (read from its `dlatents/` subfolder).
results_folder_path = 'results'

# Tab-separated table; the `key` and `label` columns are used below.
df = pd.read_csv('data.tsv', delimiter='\t')

# Binary target for the classifier: 1 where the label is 'male', 0 otherwise.
df['attribute'] = df['label'].eq('male').astype(int)
# Placeholder column; the latent vectors are loaded into it in the next cell.
df['dlatents'] = None

In [None]:
# Load the latent array of every row from `<results_folder_path>/dlatents/<key>.npy`.
#
# The values are collected in a standalone object array and written to the frame
# in a single column assignment. The previous chained form
# `df.dlatents.iloc[i] = ...` assigns into a temporary Series and is not
# guaranteed to update `df` (it is a no-op under pandas copy-on-write).
dlatents = np.empty(len(df), dtype=object)  # object arrays start out filled with None
skipped_keys = []

for i, key in enumerate(tqdm(df['key'])):
    try:
        dlatents[i] = np.load(f'{results_folder_path}/dlatents/{key}.npy')
    except (OSError, ValueError):
        # Missing or unreadable file: keep None so the row can be dropped later.
        # Unrelated failures (typos, KeyboardInterrupt, ...) are no longer hidden.
        skipped_keys.append(key)

df['dlatents'] = dlatents
print(f'Loaded {len(df) - len(skipped_keys)} of {len(df)} dlatents ({len(skipped_keys)} skipped)')

In [None]:
# Keep only the rows whose `dlatents` entry is not null.
has_dlatents = df['dlatents'].notna()
df = df.loc[has_dlatents]

# Class balance of the remaining samples.
df['attribute'].value_counts()

In [None]:
# Features: the per-sample latent arrays stacked vertically into one matrix.
X_data = np.vstack(df['dlatents'].tolist())
# Targets: the 0/1 `attribute` column as a NumPy array.
y_data = df['attribute'].values

### Evaluating model performance

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

In [None]:
%%time
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old name, so pick whichever the installed version understands.
_sklearn_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name = 'log_loss' if _sklearn_version >= (1, 1) else 'log'

# Linear classifier trained with SGD on the logistic loss. The fixed seed makes
# the reported scores reproducible between runs.
clf = SGDClassifier(loss=log_loss_name, random_state=42)
# 5-fold cross-validated accuracy.
scores = cross_val_score(clf, X_data, y_data, scoring='accuracy', cv=5)
# Fit once more on the full dataset so `clf` is left in a trained state.
clf.fit(X_data, y_data)

print(scores)
print('Mean: ', np.mean(scores))

#### Dependency of accuracy on training data size

In [None]:
%%time

# Learning curve: for every CV fold, train on growing prefixes of the (shuffled)
# training fold and record the accuracy on the held-out fold.
nb_folds = 5
splits = 20
scores = np.zeros((splits, nb_folds))  # rows: training-size step, columns: fold
dataset_size = list()

# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old name, so pick whichever the installed version understands.
_sklearn_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name = 'log_loss' if _sklearn_version >= (1, 1) else 'log'

# Dedicated seeded generator so the shuffling below is reproducible.
rng = np.random.RandomState(42)
# `shuffle` and `random_state` must be passed by keyword on scikit-learn >= 1.0.
cv = StratifiedKFold(n_splits=nb_folds, shuffle=True, random_state=42)

for fold_id, (train_idx, test_idx) in enumerate(cv.split(X_data, y_data)):
    # Shuffle so that every prefix of the training fold is a random subset.
    # (The test fold is left as is: accuracy does not depend on sample order.)
    rng.shuffle(train_idx)

    X_train, X_test = X_data[train_idx], X_data[test_idx]
    y_train, y_test = y_data[train_idx], y_data[test_idx]

    for split_id in range(splits):
        # Integer arithmetic: the last step is exactly the full training fold,
        # with no float truncation.
        nb_samples = len(X_train) * (split_id + 1) // splits
        dataset_size.append(nb_samples)
        clf = SGDClassifier(loss=log_loss_name, class_weight='balanced', random_state=42)
        clf.fit(X_train[:nb_samples], y_train[:nb_samples])
        scores[split_id, fold_id] = accuracy_score(y_test, clf.predict(X_test))

# `dataset_size` holds one entry per (fold, step); the first `splits` entries are
# the sizes of the first fold and are used as the x axis.
fig, ax = plt.subplots()
ax.plot(dataset_size[:splits], scores.mean(axis=1))
ax.set_title('Dependency of accuracy on training data size')
ax.set_xlabel('Dataset size')
ax.set_ylabel('Accuracy')
plt.show()

# Learning and saving the gender direction

In [None]:
# Final model: logistic regression with balanced class weights, fitted on all
# samples. Its coefficients are used as the direction in the next cell.
# NOTE(review): an earlier variant flattened the input first with
# X_data.reshape((-1, 18*512)); presumably for latents stored per layer - confirm.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X=X_data, y=y_data)

In [None]:
# Stack 18 copies of the fitted coefficient rows on top of each other.
# NOTE(review): 18 presumably matches the number of style layers in the
# dlatents the direction will be applied to - confirm.
n_copies = 18
direction = np.tile(clf.coef_, reps=(n_copies, 1))

In [None]:
# Persist the direction array to disk.
np.save(file='gender_direction.npy', arr=direction)

More details: https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb