In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from cycler import cycler

In [2]:
line_cycler   = (cycler(color=["#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7", "#F0E442"]) +
                 cycler(linestyle=["-", "--", "-.", ":", "-", "--", "-."]))
marker_cycler = (cycler(color=["#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7", "#F0E442"]) +
                 cycler(linestyle=["none", "none", "none", "none", "none", "none", "none"]) +
                 cycler(marker=["4", "2", "3", "1", "+", "x", "."]))
# matplotlib's standard cycler
standard_cycler = cycler("color", ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"])
plt.rc("axes", prop_cycle=line_cycler)
plt.rc("text", usetex=True)
plt.rc("text.latex", preamble=r"\usepackage{amssymb}\usepackage{mathtools}")
plt.rc("font", family="serif", size=18.)
plt.rc("savefig", dpi=500)
plt.rc("legend", loc="best", fontsize="medium", fancybox=True, framealpha=0.5)
plt.rc("lines", linewidth=2.5, markersize=4, markeredgewidth=2.5)

In [12]:
x_imp = pd.read_csv("train_healthy_knn_5_imp.csv", index_col=0)

In [13]:
x_imp.shape

(6266, 31992)

In [17]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x_imp)
print("scaled data")
pca = PCA(n_components=0.9)  # n_components=0.9 to keep 90% of the variance
pca.fit(scaled_data)

scaled data


In [21]:
pca.components_.shape

(196, 31992)

In [2]:
cols = pd.read_csv("train_healthy_knn_5_imp.csv", index_col = 0, nrows=1).columns.values

In [29]:
cols

array(['cg00688681', 'cg02295369', 'cg04494122', ..., 'ch.22.740407F',
       'ch.22.1008279F', 'ch.22.47579720R'], dtype=object)

In [30]:
del x_imp

In [3]:
merged_df = pd.read_csv("merged.csv")

In [33]:
del scaled_data

In [38]:
buch_cols = merged_df[cols]

In [39]:
del merged_df

In [41]:
buch_cols_s = scaler.transform(buch_cols)

In [43]:
buch_cols_p = pca.transform(buch_cols_s)

In [45]:
pd.DataFrame(buch_cols_p).to_csv("buchinger_pca.csv")

In [46]:
buch_cols

Unnamed: 0,cg00688681,cg02295369,cg04494122,cg09513996,cg10856819,cg13192155,cg14318199,cg17214408,cg17268992,cg17456701,...,ch.22.26167205F,ch.22.317144R,ch.22.26442001R,ch.22.467397R,ch.22.31817810F,ch.22.33863861F,ch.22.38010425R,ch.22.740407F,ch.22.1008279F,ch.22.47579720R
0,0.237289,0.117823,0.638707,0.284063,0.002886,0.115399,0.007550,0.409340,0.557108,0.921108,...,0.045937,0.074015,0.065803,0.028266,0.060408,0.147614,0.064248,0.058718,0.030048,0.108357
1,0.268158,0.136739,0.622015,0.233920,0.031529,0.115317,0.023588,0.418656,0.568253,0.916995,...,0.030603,0.052335,0.092850,0.042892,0.049225,0.141858,0.065348,0.036819,0.013334,0.133657
2,0.263166,0.121512,0.591656,0.225106,0.020310,0.130974,0.018188,0.445719,0.534823,0.915566,...,0.039739,0.021785,0.050854,0.037396,0.052993,0.130030,0.073616,0.046135,0.019975,0.118176
3,0.274452,0.153136,0.595906,0.412987,0.015838,0.161443,0.015465,0.504099,0.498049,0.908437,...,0.035305,0.064450,0.080036,0.020052,0.039950,0.141063,0.068990,0.057666,0.031082,0.125805
4,0.248359,0.146066,0.600211,0.465820,0.021285,0.132667,0.019179,0.512268,0.463389,0.922879,...,0.052127,0.052625,0.058023,0.041580,0.062061,0.151300,0.073034,0.073549,0.021390,0.102216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.152081,0.081744,0.634388,0.516038,0.026557,0.201629,0.032332,0.488463,0.463261,0.913700,...,0.033585,0.040150,0.081158,0.032957,0.047990,0.141675,0.068255,0.056270,0.021130,0.119665
92,0.172127,0.110526,0.656162,0.494210,0.030341,0.193665,0.029415,0.486244,0.454376,0.922508,...,0.043855,0.077456,0.061816,0.042460,0.058689,0.136713,0.075841,0.062887,0.020745,0.134128
93,0.167169,0.124010,0.633662,0.376902,0.033654,0.121437,0.021885,0.630039,0.446144,0.915309,...,0.040457,0.099893,0.097123,0.036767,0.053061,0.155454,0.084384,0.071154,0.016274,0.135590
94,0.115589,0.089906,0.618149,0.420153,0.023806,0.140588,0.018740,0.593071,0.467666,0.914239,...,0.064405,0.077612,0.102684,0.050756,0.064050,0.143063,0.076182,0.066282,0.021774,0.130048


In [47]:
buch_cols

Unnamed: 0,cg00688681,cg02295369,cg04494122,cg09513996,cg10856819,cg13192155,cg14318199,cg17214408,cg17268992,cg17456701,...,ch.22.26167205F,ch.22.317144R,ch.22.26442001R,ch.22.467397R,ch.22.31817810F,ch.22.33863861F,ch.22.38010425R,ch.22.740407F,ch.22.1008279F,ch.22.47579720R
0,0.237289,0.117823,0.638707,0.284063,0.002886,0.115399,0.007550,0.409340,0.557108,0.921108,...,0.045937,0.074015,0.065803,0.028266,0.060408,0.147614,0.064248,0.058718,0.030048,0.108357
1,0.268158,0.136739,0.622015,0.233920,0.031529,0.115317,0.023588,0.418656,0.568253,0.916995,...,0.030603,0.052335,0.092850,0.042892,0.049225,0.141858,0.065348,0.036819,0.013334,0.133657
2,0.263166,0.121512,0.591656,0.225106,0.020310,0.130974,0.018188,0.445719,0.534823,0.915566,...,0.039739,0.021785,0.050854,0.037396,0.052993,0.130030,0.073616,0.046135,0.019975,0.118176
3,0.274452,0.153136,0.595906,0.412987,0.015838,0.161443,0.015465,0.504099,0.498049,0.908437,...,0.035305,0.064450,0.080036,0.020052,0.039950,0.141063,0.068990,0.057666,0.031082,0.125805
4,0.248359,0.146066,0.600211,0.465820,0.021285,0.132667,0.019179,0.512268,0.463389,0.922879,...,0.052127,0.052625,0.058023,0.041580,0.062061,0.151300,0.073034,0.073549,0.021390,0.102216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.152081,0.081744,0.634388,0.516038,0.026557,0.201629,0.032332,0.488463,0.463261,0.913700,...,0.033585,0.040150,0.081158,0.032957,0.047990,0.141675,0.068255,0.056270,0.021130,0.119665
92,0.172127,0.110526,0.656162,0.494210,0.030341,0.193665,0.029415,0.486244,0.454376,0.922508,...,0.043855,0.077456,0.061816,0.042460,0.058689,0.136713,0.075841,0.062887,0.020745,0.134128
93,0.167169,0.124010,0.633662,0.376902,0.033654,0.121437,0.021885,0.630039,0.446144,0.915309,...,0.040457,0.099893,0.097123,0.036767,0.053061,0.155454,0.084384,0.071154,0.016274,0.135590
94,0.115589,0.089906,0.618149,0.420153,0.023806,0.140588,0.018740,0.593071,0.467666,0.914239,...,0.064405,0.077612,0.102684,0.050756,0.064050,0.143063,0.076182,0.066282,0.021774,0.130048


In [84]:
def beta_to_m(x):
    res = np.log(x/(1-x))
    if res == -np.inf:
        return np.log((x + 0.0000001)/(1-x + 0.0000001))
    return res

In [85]:
buch_cols_m = buch_cols.applymap(beta_to_m)

  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x/(1-x))
  res = np.log(x

In [86]:
(buch_cols_m == -np.inf).sum().sum()

0

In [87]:
buch_cols_m.to_csv("buch_cols_m.csv")

In [88]:
buch_cols_s = scaler.transform(buch_cols_m)
buch_cols_p = pca.transform(buch_cols_s)

In [89]:
pd.DataFrame(buch_cols_p).to_csv("buch_cols_p.csv")