# COSMO Project
By Mathilde Raynal, Etienne Bonvin and Xavier Pantet

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import random
from regressions import *
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
DATA_FOLDER = "data/"
X = np.load(DATA_FOLDER + "feature_mat_radial_compression.npy")
y = np.load(DATA_FOLDER + "CSD500-r_train-H_total.npy")

In [4]:
print("X: " + str(X.shape))
print("y: " + str(y.shape))

X: (30049, 15961)
y: (30049,)


In [5]:
x_df = pd.DataFrame(X)

In [6]:
x_df = x_df.drop_duplicates()
print("X: " + str(x_df.shape))

X: (30049, 15961)


In [7]:
def split(x_df, y, perc):
    train_set_size = int(x_df.shape[0] * 0.75)
    x_tr = x_df.head(train_set_size)
    x_te = x_df.tail(int(x_df.shape[0] - train_set_size))
    y_tr = y[: train_set_size]
    y_te = y[train_set_size :]
    return x_tr, y_tr, x_te, y_te

In [8]:
#To remove intercept
def add_cte_col(df):
    df_tmp = df.copy()
    df_tmp[df_tmp.shape[1]] = pd.Series(np.ones(df_tmp.shape[0]), index=df_tmp.index)
    return df_tmp

In [9]:
def test_quality(x_df, y):
    x_df = add_cte_col(x_df)
    test_perc = 0.75
    x_train, y_train, x_test, y_test = split(x_df, y, test_perc)
    best = 100
    for lambda_ in [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9]:
        err = rmse(y_test, x_test, ridge_regression(y_train, x_train, lambda_))
        best = err if err < best else best
    return best

In [13]:
test_quality(x_df, y)

0.7373143070214498

We can see that a simple ridge regression with polynomial expansion gives us an RMSE of 0.73. This will be the value that we consider as the upper bound for the upcoming tests on the techniques we try.

### Data augmentation

In [10]:
#to have more samples
def add_jitter(df, y, perc_sample, perc_col):
    means = [df[j].mean() for j in df]
    df_tmp = df.copy()
    y_tmp = y.copy()
    ids = random.sample(range(df_tmp.shape[0]), int(df_tmp.shape[0]*perc_sample))
    for id_ in ids:
        new_sample = df_tmp.iloc[[id_]].copy()
        col = random.sample(range(df_tmp.shape[1]), int(df_tmp.shape[1]*perc_col))
        for j in col:
            new_sample[j] = new_sample[j] + 0.01*means[j]
        df_tmp = df_tmp.append(new_sample, ignore_index=True)
        y_tmp = np.append(y_tmp, y_tmp[id_])
    return df_tmp, y_tmp        

In [15]:
x_df_aug, y_aug = add_jitter(x_df, y, 0.01, 0.01)

In [16]:
test_quality(x_df_aug, y_aug)

0.719821608051507

We can see that adding 1% of samples generated from existing ones lowers the RMSE to 0.72. This is some improvement !

### Normalization:

In [11]:
x_df=(x_df-x_df.mean())/x_df.std()
x_df = x_df.drop(15960, axis=1)
x_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15950,15951,15952,15953,15954,15955,15956,15957,15958,15959
0,-1.117537,0.130346,0.013696,-0.103711,0.037901,0.269922,0.399752,0.415404,0.358259,0.303114,...,-0.393410,-0.301521,-0.367537,-0.394536,-0.370814,-0.407677,-0.416760,-0.401391,-0.409848,-0.426182
1,-1.141154,0.360481,0.266127,0.079885,0.149372,0.394671,0.586697,0.697098,0.761406,0.658471,...,-0.439485,-0.473423,-0.539600,-0.589093,-0.570107,-0.601153,-0.609641,-0.615344,-0.617178,-0.622111
2,-1.177216,-0.267615,-0.349863,-0.416966,-0.393240,-0.265429,-0.145420,-0.109582,-0.151950,-0.193078,...,-0.430591,-0.422903,-0.480570,-0.536433,-0.519772,-0.540244,-0.550040,-0.558432,-0.558878,-0.562048
3,-1.125873,0.708498,0.511915,0.345341,0.678018,0.960304,0.994918,0.879390,0.731289,0.780614,...,-0.458453,-0.547680,-0.616144,-0.673036,-0.654511,-0.685598,-0.695242,-0.707149,-0.706268,-0.709202
4,-1.301681,0.035031,-0.075285,-0.190359,-0.102609,0.108780,0.216456,0.411433,0.163525,0.113291,...,-0.494304,-0.691341,-0.765498,-0.832934,-0.815301,-0.849757,-0.864321,-0.889805,-0.880755,-0.883626
5,-1.289225,0.618264,0.440396,0.333169,0.531440,0.795734,0.815010,0.934647,0.583534,0.673116,...,-0.493536,-0.691717,-0.767147,-0.832168,-0.814264,-0.850489,-0.864632,-0.889108,-0.880674,-0.884081
6,-1.280896,-0.263859,-0.355730,-0.418794,-0.397849,-0.229321,-0.186046,-0.037326,-0.213264,-0.233464,...,-0.490703,-0.694423,-0.769091,-0.828314,-0.812143,-0.851598,-0.864812,-0.884870,-0.880759,-0.885678
7,-1.244046,0.480159,0.363812,0.202020,0.349861,0.564196,0.670118,0.849204,0.574173,0.503616,...,-0.484862,-0.684433,-0.766651,-0.814271,-0.794596,-0.843836,-0.857055,-0.869302,-0.864939,-0.877402
8,-1.121709,0.117122,0.008087,-0.111065,0.043313,0.242077,0.372759,0.408476,0.373977,0.232905,...,-0.388380,-0.487897,-0.678482,-0.671614,-0.536393,-0.580206,-0.699196,-0.727652,-0.622976,-0.591967
9,-1.120282,0.673164,0.520629,0.353339,0.604866,0.836481,0.911321,0.919284,0.833911,0.680132,...,-0.430117,-0.472262,-0.542216,-0.584586,-0.566216,-0.587505,-0.606463,-0.627249,-0.609860,-0.606361


### Correlation:

In [18]:
to_rm = []
for i in [i for i in x_df if i < 50]:
    for j in [j for j in x_df if j > i]:
        if x_df[i].corr(x_df[j]) > 0.95:
            to_rm.append(j)
x_df_uncorr = x_df.drop(to_rm, axis=1)

In [19]:
print(x_df_uncorr.shape)
test_quality(x_df_uncorr, y)

(30049, 15915)


1.1669052808672136

Those results are surprising as we were expecting the error to go down by removing some noise, but we can see that correlation actually make the results worse. We won't use it.

### PCA

In [20]:
rmses = []
ys = range(500, 5501, 500)
for i in ys:
    pca = PCA(n_components=i, whiten=True)
    principalComponents = pca.fit_transform(x_df, x_df.shape)
    principalDf = pd.DataFrame(data = principalComponents
                 , columns = range(principalComponents.shape[1]))
    err = test_quality(principalDf, y)
    rmses.append(err)
    print(i, err)

500 1.216608284379181
1000 1.0752640301349632
1500 1.0299223794959569
2000 0.9976506897201758
2500 0.986252942323835
3000 1.0100353677409406
3500 0.9924759766051255
4000 0.9531749678020832
4500 0.9460157221396037


KeyboardInterrupt: 

In [None]:
plt.xlabel("Number of components")
plt.ylabel("RMSE")
plt.plot(ys, rmses)
plt.show()

The error plotted is relatively high when reducing the number of components below 6000. We are not sure of the conlusion to give here, as we were expecting results less good then the full matrix but not to that extend. We can argue that our `test_quality` function is absolutely not optimized, but after a look at the scientific side, 1000 features should be enough to get good predictions. We then have to choose by default the best one of the results.

In [None]:
i_star = 3003
pca = PCA(n_components=i_star, whiten=True)
principalComponents = pca.fit_transform(x_df, x_df.shape)
principalDf_star = pd.DataFrame(data = principalComponents
             , columns = range(principalComponents.shape[1]))
test_quality(principalDf_star, y)

In [None]:
i_star = 3003
pca = PCA(n_components=i_star)
principalComponents_no_w = pca.fit_transform(x_df, x_df.shape)
principalDf_star_no_w = pd.DataFrame(data = principalComponents_no_w
             , columns = range(principalComponents_no_w.shape[1]))
test_quality(principalDf_star_no_w, y)

Whitening does not improve anything, even worsen the results.

In [18]:
#ValueError: math domain error is known bug : https://github.com/scikit-learn/scikit-learn/issues/10217: Cannot use MLE

The disapointement is real not to be able to use Maximum Likelyhood Estimator.

### Preparing the data for Machine Learning

In [12]:
DATA_FOLDER = "data/"
X = np.load(DATA_FOLDER + "feature_mat_radial_compression.npy")
y = np.load(DATA_FOLDER + "CSD500-r_train-H_total.npy")
x_df = pd.DataFrame(X)

In [13]:
#Apply PCA
i_star = 4500
pca = PCA(n_components=i_star)
principalComponents = pca.fit_transform(x_df, x_df.shape)
x_pca_df = pd.DataFrame(data = principalComponents
             , columns = range(principalComponents.shape[1]))
x_pca_df.columns = range(x_pca_df.shape[1])

In [14]:
#Add jitter
x_with_jitter_df, y_with_jitter = add_jitter(x_pca_df, y, 0.01, 0.01)

In [15]:
#Normalize
x_with_jitter_df=(x_with_jitter_df-x_with_jitter_df.mean())/x_with_jitter_df.std()

In [17]:
np.save("data/ML/x_train.npy", x_with_jitter_df)
np.save("data/ML/y.npy", y_with_jitter)

### Preparing the data for Deep Learning

In [None]:
DATA_FOLDER = "data/"
X = np.load(DATA_FOLDER + "feature_mat_radial_compression.npy")
y = np.load(DATA_FOLDER + "CSD500-r_train-H_total.npy")
x_df = pd.DataFrame(X)

In [None]:
#Apply PCA
i_star = 3004
pca = PCA(n_components=i_star)
principalComponents = pca.fit_transform(x_df, x_df.shape)
x_pca_df = pd.DataFrame(data = principalComponents
             , columns = range(principalComponents.shape[1]))
x_pca_df.columns = range(x_pca_df.shape[1])

In [None]:
#Split the data 75-25
x_train_df, y_train, x_test_df, y_test = split(x_pca_df, y, 0.75)

In [None]:
#Add jitter
train_df_with_jitter, y_with_jitter = add_jitter(x_train_df, y_train, 0.01, 0.01)

In [None]:
#Normalize
train_df_with_jitter=(train_df_with_jitter-train_df_with_jitter.mean())/train_df_with_jitter.std()
test_df =(x_test_df-x_test_df.mean())/x_test_df.std()

In [None]:
np.save("data/DL/x_train.npy", train_df_with_jitter)
np.save("data/DL/x_test.npy", x_test_df)
np.save("data/DL/y_train.npy", y_with_jitter)
np.save("data/DL/y_test.npy", y_test)