In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

SCALING_WEIGHTS = [100/15, 100/8, 100/100]

In [2]:
!gdown 1IAZw8lzHB7BiENiJxBRYkvgXEPmeS0Gu

Downloading...
From: https://drive.google.com/uc?id=1IAZw8lzHB7BiENiJxBRYkvgXEPmeS0Gu
To: /content/ml_data_onsite_start.pickle
100% 15.7M/15.7M [00:00<00:00, 35.0MB/s]


In [3]:
data = pd.read_pickle('ml_data_onsite_start.pickle')

X_train = data['X']['train']
y_train = data['y']['train']

X_val = data['X']['val']
y_val = data['y']['val']

X_test = data['X']['live_test']

In [None]:
def vis(arr):
  plt.figure(figsize=(8, 8))

  cnt = 1
  for z in range(5):
    for q in range(6):
      plt.subplot(5, 6, cnt)
      plt.imshow(arr[:, :, z, q], vmin=-40, vmax=40, cmap='hsv')
      plt.grid()
      plt.axis('off')
      cnt += 1
  plt.tight_layout()
vis(X_train[0])

In [11]:
def test_solution(X_train, y_train, X_val, y_val, feature_num=0):
    assert X_train.shape[-1] <= 300, "Too many features! Should be less than 300"
    assert X_val.shape[-1] <= 300, "Too many features! Should be less than 300"

    model =  LinearRegression().fit(
        X_train,
        y_train[:, feature_num]
    )
    predictions = model.predict(X_val)
    rmse = mean_squared_error(
        predictions,
        y_val[:, feature_num]
    )**.5
    normalized_rmse = rmse * SCALING_WEIGHTS[feature_num]
    print(f"Property #{feature_num}:    raw RMSE={rmse:.6f}")
    print(f"Property #{feature_num}: scaled RMSE={normalized_rmse:.6f}")
    return round(normalized_rmse, 6)

In [5]:
def generate_predictions(X_train, y_train, X_test, feature_num=0):
    assert X_train.shape[-1] <= 300
    assert X_test.shape[-1] <= 300

    model =  LinearRegression().fit(
        X_train,
        y_train[:, feature_num]
    )
    predictions = model.predict(X_test)
    return predictions

In [6]:
# load the test dataset
!gdown 1K_KNlfIuusgQjeN3gWJ2htutbO2iX7YE
loaded = pd.read_pickle("ml_data_onsite_final_test.pickle")
X_test_final = loaded['X']['final_test']

Downloading...
From: https://drive.google.com/uc?id=1K_KNlfIuusgQjeN3gWJ2htutbO2iX7YE
To: /content/ml_data_onsite_final_test.pickle
100% 14.3M/14.3M [00:00<00:00, 68.4MB/s]


In [7]:
def construct_features(Xs):
  from itertools import combinations

  all_features = []
  for x in Xs:
    cur_features = []
    for q in range(6):
      features_q = [
          x[:,:,:, q].mean(),
          x[:,:,:, q].max(),
          x[:,:,:, q].min(),
          x[:,:,:, q].std(),
          x[:,:,:, q].max() - x[:,:,:, q].min()
      ]
      cur_features.append(features_q)
    all_features.append(np.stack(cur_features).ravel())

  features = np.stack(all_features)

  comb_features = []

  for comb in combinations(range(30),r=2):
    cur_features = np.concatenate([
        (features[:, comb[0]] - features[:, comb[1]]).reshape(-1,1),
        (features[:, comb[0]] + features[:, comb[1]]).reshape(-1,1),
        (features[:, comb[0]] * features[:, comb[1]]).reshape(-1,1),
        (features[:, comb[0]] / features[:, comb[1]]).reshape(-1,1)
    ],axis=1)
    comb_features.append(cur_features)

  comb_features = np.concatenate(comb_features,axis=1)

  all_features = np.concatenate([all_features, comb_features],axis=1)

  all_features[np.isnan(all_features)] = 0

  all_features[all_features == np.inf] = 1000
  all_features[all_features == -np.inf] = -1000

  return all_features

In [8]:
X_train_c = construct_features(X_train)
X_val_c = construct_features(X_val)

  (features[:, comb[0]] / features[:, comb[1]]).reshape(-1,1)


In [12]:
features_results = []


for feature_num in range(3):
  features_results_cur = {}
  for i in range(200):
    idxs = np.random.choice(range(1770), size=(100, ), replace=False)
    X_train_cur = X_train_c[:, idxs]
    X_val_cur = X_val_c[:, idxs]

    score = test_solution(
        X_train_cur,
        y_train,
        X_val_cur,
        y_val,
        feature_num=feature_num
    )
    print('='*16)
    print(f"feature num {feature_num}, score = {score:.6f}")
    features_results_cur[score] = idxs
  features_results.append(features_results_cur)

Property #0:    raw RMSE=0.334912
Property #0: scaled RMSE=2.232748
feature num 0, score = 2.232748
Property #0:    raw RMSE=0.250723
Property #0: scaled RMSE=1.671488
feature num 0, score = 1.671488
Property #0:    raw RMSE=0.271950
Property #0: scaled RMSE=1.812998
feature num 0, score = 1.812998
Property #0:    raw RMSE=0.259486
Property #0: scaled RMSE=1.729908
feature num 0, score = 1.729908
Property #0:    raw RMSE=0.257143
Property #0: scaled RMSE=1.714285
feature num 0, score = 1.714285
Property #0:    raw RMSE=0.320449
Property #0: scaled RMSE=2.136324
feature num 0, score = 2.136324
Property #0:    raw RMSE=0.319993
Property #0: scaled RMSE=2.133288
feature num 0, score = 2.133288
Property #0:    raw RMSE=0.369567
Property #0: scaled RMSE=2.463781
feature num 0, score = 2.463781
Property #0:    raw RMSE=0.364235
Property #0: scaled RMSE=2.428231
feature num 0, score = 2.428231
Property #0:    raw RMSE=0.275980
Property #0: scaled RMSE=1.839865
feature num 0, score = 1.839865


In [13]:
X_train_c_0 = X_train_c[:, features_results[0][min(features_results[0])]]
X_train_c_1 = X_train_c[:, features_results[1][min(features_results[1])]]
X_train_c_2 = X_train_c[:, features_results[2][min(features_results[2])]]

X_val_c_0 = X_val_c[:, features_results[0][min(features_results[0])]]
X_val_c_1 = X_val_c[:, features_results[1][min(features_results[1])]]
X_val_c_2 = X_val_c[:, features_results[2][min(features_results[2])]]

In [14]:
total_score = 0
for feature_number in range(3):
  total_score += test_solution(
      [X_train_c_0, X_train_c_1, X_train_c_2][feature_number],
      y_train,
      [X_val_c_0, X_val_c_1, X_val_c_2][feature_number],
      y_val,
      feature_num=feature_number
  )
  print()
total_score /= 3
print('='*16)
print(f"Total score = {total_score:.6f}")

Property #0:    raw RMSE=0.205094
Property #0: scaled RMSE=1.367290

Property #1:    raw RMSE=0.099386
Property #1: scaled RMSE=1.242330

Property #2:    raw RMSE=1.103868
Property #2: scaled RMSE=1.103868

Total score = 1.237829


# Author solution

- Replicate the data 5 more times, swapping axes (number of permutations for three axes)

- (Optionally) Delete duplicate columns

- Use PCA, it helps

- (Author's knowledge) One of the features is from the previous task, gives biggest improvement, so home-task knowledge definitely helps

Most successful teams combined some replication(=augmentation), features from home task, and PCA. Validation set should be a good indicator whether particular feature is a good idea to add.

In [15]:
def symmetrize_x(X_tr, y_tr):
    xxx = [
        X_tr,
        X_tr.swapaxes(1, 2),
        X_tr.swapaxes(1, 3),
        X_tr.swapaxes(2, 3),
        X_tr.swapaxes(1, 3).swapaxes(1, 2),
        X_tr.swapaxes(1, 3).swapaxes(2, 3),
    ]
    return np.concatenate(xxx), np.vstack([y_tr]*6)

In [16]:
def ravelize(X):
    return X.reshape((X.shape[0], -1))

In [17]:
def make_bg(X):
    bgs = []
    for _x in X:
        bg = _x[:,:,:,3].ravel().min() - _x[:,:,:,2].ravel().max()
        bgs.append(bg)
    bgs = np.array(bgs)
    return bgs[:, None]

In [None]:
X_train_symm, y_train_symm = symmetrize_x(X_train, y_train)

In [None]:
# X_train_bg = make_bg(X_train_symm)
# X_val_bg = make_bg(X_val)
X_train_c = construct_features(X_train_symm)
X_val_c = construct_features(X_val)
best_idxs = features_results[min(features_results)]
X_train_c = X_train_c[:, best_idxs]
X_val_c = X_val_c[:, best_idxs]

  (features[:, comb[0]] / features[:, comb[1]]).reshape(-1,1)


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(ravelize(X_train_symm))
X_val_pca = pca.transform(ravelize(X_val))
# X_train_pca = pca.fit_transform(X_train_c)
# X_val_pca = pca.transform(X_val_c)

In [None]:
%%time
total_score = 0
for feature_number in range(3):
  total_score += test_solution(
      X_train_pca,
      y_train_symm,
      X_val_pca,
      y_val,
      feature_num=feature_number
  )
  print()
total_score /= 3
print('='*16)
print(f"Total score = {total_score:.6f}")

Property #0:    raw RMSE=0.305542
Property #0: scaled RMSE=2.036950

Property #1:    raw RMSE=0.596131
Property #1: scaled RMSE=7.451637

Property #2:    raw RMSE=1.631475
Property #2: scaled RMSE=1.631475

Total score = 3.666667
CPU times: user 3.01 s, sys: 20.7 ms, total: 3.03 s
Wall time: 2.59 s


In [None]:
%%time
total_score = 0
# something close to the best possible solution
for feature_number in range(3):
  total_score += test_solution(
      np.concatenate([X_train_pca, X_train_c], axis=-1),
      # X_train_bg,
      y_train_symm,
      # X_val_bg,
      np.concatenate([X_val_pca, X_val_c], axis=-1),
      y_val,
      feature_num=feature_number
  )
  print()
total_score /= 3
print('='*16)
print(f"Total score = {total_score:.6f}")

Property #0:    raw RMSE=0.145149
Property #0: scaled RMSE=0.967662

Property #1:    raw RMSE=0.101555
Property #1: scaled RMSE=1.269434

Property #2:    raw RMSE=3.093128
Property #2: scaled RMSE=3.093128

Total score = 1.666667
CPU times: user 3.68 s, sys: 91.1 ms, total: 3.77 s
Wall time: 3.05 s


In [None]:
np.concatenate([X_train_pca, X_train_c], axis=-1).shape

(12144, 1870)