In [14]:
import pandas as pd
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import scipy.stats as spstats
from sklearn.preprocessing import PolynomialFeatures
import re
import gc

In [2]:
# filename_measures = 'data/IMPROVE_2015_measures_cs433.csv'
filename_spectra = 'data/IMPROVE_2015_raw_spectra_cs433.csv'
# filename_tts = 'data/IMPROVE_2015_train_test_split_cs433.csv'
# filename_sec_deriv = 'data/IMPROVE_2015_2nd-derivative_spectra_cs433.csv'

df_spectra_raw = pd.read_csv(filename_spectra)
# df_measures_raw = pd.read_csv(filename_measures)
# df_train_test_split_raw = pd.read_csv(filename_tts)
# df_second_derivative = pd.read_csv(filename_sec_deriv, index_col=0)

In [None]:
meta_cols = ['SiteCode','Date','flag','Latitude','Longitude','DUSTf:Unc']
y_col = ['DUSTf:Value']

## Preparation

In [3]:
df_measures = df_measures_raw.set_index('site')
df_measures = df_measures[meta_cols + y_col]
df_measures.index = pd.Index(df_measures.index, name="")
df_measures.head()

NameError: name 'df_measures_raw' is not defined

In [86]:
df_spectra = df_spectra_raw.T
df_spectra.columns = pd.Float64Index(df_spectra.loc['wavenumber',:], name="")
df_spectra = df_spectra.drop('wavenumber')

sample_names = pd.DataFrame(df_spectra.index.values, index=df_spectra.index, columns=['sample_name'])
sample_names['site_code'] = sample_names.sample_name.replace(to_replace="_.+$",value="", regex=True)
sample_names['sample_type'] = sample_names.sample_name.replace(to_replace=".+_([A-Z]{2}\d?)_.+$", 
                                                               value="\\1", regex=True)
no_type = sample_names[~sample_names.sample_type.isin(['NM','QC','FB','NM2','QC2', 'QD', 'QD2'])].index
sample_names

Unnamed: 0,sample_name,site_code,sample_type
ACAD1_01_06_2015_NM_0_csv,ACAD1_01_06_2015_NM_0_csv,ACAD1,NM
ACAD1_01_09_2015_NM_0_csv,ACAD1_01_09_2015_NM_0_csv,ACAD1,NM
ACAD1_01_12_2015_NM_0_csv,ACAD1_01_12_2015_NM_0_csv,ACAD1,NM
ACAD1_01_15_2015_NM_0_csv,ACAD1_01_15_2015_NM_0_csv,ACAD1,NM
ACAD1_01_18_2015_NM_0_csv,ACAD1_01_18_2015_NM_0_csv,ACAD1,NM
ACAD1_01_21_2015_NM_0_csv,ACAD1_01_21_2015_NM_0_csv,ACAD1,NM
ACAD1_01_24_2015_NM_0_csv,ACAD1_01_24_2015_NM_0_csv,ACAD1,NM
ACAD1_01_27_2015_NM_0_csv,ACAD1_01_27_2015_NM_0_csv,ACAD1,NM
ACAD1_01_30_2015_NM_0_csv,ACAD1_01_30_2015_NM_0_csv,ACAD1,NM
ACAD1_02_02_2015_NM_0_csv,ACAD1_02_02_2015_NM_0_csv,ACAD1,NM


In [87]:
np.logspace(-7, -3, 10)

array([1.00000000e-07, 2.78255940e-07, 7.74263683e-07, 2.15443469e-06,
       5.99484250e-06, 1.66810054e-05, 4.64158883e-05, 1.29154967e-04,
       3.59381366e-04, 1.00000000e-03])

## Dataframes merging

In [None]:
merged = pd.merge(df_spectra, df_measures, left_index=True, right_index=True)
merged.head()

Note that this merging has as incident effect to ensure only data with measures and vice-versa are kept. This removes unwanted rows (about 2k measure rows).

## Memory cleaning

At this point, the original dataframes are not useful anymore. Indeed, everything is contained in the `merged` dataframe. We can safely delete the former ones.

In [None]:
%xdel df_measures
%xdel df_measures_raw
%xdel df_spectra
%xdel df_spectra_raw
gc.collect()
gc.collect()

## Exploration

In [None]:
merged[merged['DUSTf:Value'].isnull()]

There are 7 NaN values in the dust values. We remove them as they are totally useless.

In [None]:
nan_indices = merged['DUSTf:Value'].index[merged['DUSTf:Value'].apply(np.isnan)]
nan_indices

In [None]:
merged.drop(nan_indices, inplace=True)

## Test/train separation

In [None]:
merged.isna().any().any()

In [None]:
train = df_train_test_split_raw[df_train_test_split_raw.usage == "calibration"].site
test = df_train_test_split_raw[df_train_test_split_raw.usage == "test"].site
merged_train = merged.loc[np.isin(merged.index, train)]
merged_test = merged.loc[np.isin(merged.index, test)]
%xdel merged
%xdel train
%xdel test

## X,y creation

In [None]:
X = merged_train.loc[:, [x for x in merged_train.columns if x not in y_col and x not in meta_cols]]
y = merged_train[y_col]

## Features selection

In [None]:
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression, SelectKBest

In [None]:
test = SelectKBest(score_func=f_regression, k=30)
test.fit(X,np.ravel(y))
selected_cols = X.columns[test.get_support()]

In [None]:
pf = PolynomialFeatures(degree=4, interaction_only=False, include_bias=False)
new_features = pd.DataFrame(pf.fit_transform(X[selected_cols]), index=X.index)


In [None]:
print(new_features.shape, X.shape)

In [None]:
# X[X.columns[~np.isin(X.columns, selected_cols)]]

In [None]:
X = pd.concat([X[X.columns[~np.isin(X.columns, selected_cols)]], new_features], axis=1)
%xdel new_features

In [None]:
# import sys

# These are the usual ipython objects, including this one you are creating
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [None]:
print(X.shape, y.shape)

From here, we can use `X` as the data matrix and `y` as the validation vector.