In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as spstats
from sklearn.preprocessing import PolynomialFeatures
%matplotlib notebook

In [2]:
filename_measures = 'data/IMPROVE_2015_measures_cs433.csv'
filename_spectra = 'data/IMPROVE_2015_raw_spectra_cs433.csv'
filename_tts = 'data/IMPROVE_2015_train_test_split_cs433.csv'
# filename_sec_deriv = 'data/IMPROVE_2015_2nd-derivative_spectra_cs433.csv'

df_measures = pd.read_csv(filename_measures)
df_spectra = pd.read_csv(filename_spectra)
df_train_test_split = pd.read_csv(filename_tts)
# df_second_derivative = pd.read_csv(filename_sec_deriv, index_col=0)

In [21]:
measures_useful_cols = ['site','SiteCode','Date','flag','Latitude','Longitude','DUSTf:Value','DUSTf:Unc']

## Exploration

In [24]:
df_measures['DUSTf:Value'].isnull().values.sum()

7

There are 7 NaN values in the dust values. We remove them.

In [30]:
nan_indices = df_measures['DUSTf:Value'].index[df_measures['DUSTf:Value'].apply(np.isnan)]

In [33]:
df_measures = df_measures.drop(df_measures.index[nan_indices])
df_spectra = df_spectra.drop(df_spectra.index[nan_indices])

## Dataframes merging

In [38]:
df_measures[measures_useful_cols]

Unnamed: 0,site,SiteCode,Date,flag,Latitude,Longitude,DUSTf:Value,DUSTf:Unc
0,CRLA1_04_21_2015_NM_0_csv,CRLA1,20150421,NM,42.89580,-122.13610,8.725530,0.758188
1,TONT1_05_09_2015_NM_0_csv,TONT1,20150509,NM,33.65480,-111.10680,4.686666,0.445060
2,BADL1_11_26_2015_NM_0_csv,BADL1,20151126,NM,43.74350,-101.94120,1.007311,0.121622
3,BRID1_10_27_2015_NM_0_csv,BRID1,20151027,NM,42.97490,-109.75790,0.337955,0.107560
4,SEQU1_04_03_2015_NM_0_csv,SEQU1,20150403,NM,36.48940,-118.82910,8.532945,0.735361
5,PINN1_08_04_2015_NM_QC_0_csv,PINN1,20150804,NM,36.48330,-121.15680,1.418231,0.169362
6,PINN1_08_04_2015_NM_0_csv,PINN1,20150804,NM,36.48330,-121.15680,1.418231,0.169362
7,BRIS1_12_17_2015_NM_0_csv,BRIS1,20151217,NM,30.10863,-89.76168,1.297668,0.138210
8,BRCA1_04_15_2015_NM_0_csv,BRCA1,20150415,NM,37.61840,-112.17360,18.571365,1.426632
9,WHIT1_12_14_2015_NM_0_csv,WHIT1,20151214,NM,33.46870,-105.53490,2.865876,0.259473


In [55]:
df_spectra.loc['YOSEX_07_23_2015_NM_0_csv']

0       0.456300
1       0.456100
2       0.455890
3       0.455680
4       0.455460
5       0.455240
6       0.455030
7       0.454820
8       0.454610
9       0.454400
10      0.454200
11      0.453990
12      0.453790
13      0.453590
14      0.453370
15      0.453160
16      0.452940
17      0.452730
18      0.452510
19      0.452290
20      0.452080
21      0.451870
22      0.451670
23      0.451460
24      0.451240
25      0.451030
26      0.450820
27      0.450610
28      0.450390
29      0.450180
          ...   
2754   -0.081014
2755   -0.082169
2756   -0.083278
2757   -0.084338
2758   -0.085344
2759   -0.086289
2760   -0.087171
2761   -0.087994
2762   -0.088767
2763   -0.089482
2764   -0.090129
2765   -0.090705
2766   -0.091219
2767   -0.091679
2768   -0.092088
2769   -0.092448
2770   -0.092750
2771   -0.092987
2772   -0.093164
2773   -0.093276
2774   -0.093322
2775   -0.093300
2776   -0.093208
2777   -0.093020
2778   -0.092702
2779   -0.092248
2780   -0.091657
2781   -0.0909

## Features selection

In [34]:
from sklearn.feature_selection import chi2, SelectKBest

In [35]:
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(df_spectra, df_measures['DUSTf:Value'])


ValueError: Found input variables with inconsistent numbers of samples: [19932, 18001]

In [22]:
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
pf.fit_transform(df_spectra)

Index(['wavenumber', 'ACAD1_01_06_2015_NM_0_csv', 'ACAD1_01_09_2015_NM_0_csv',
       'ACAD1_01_12_2015_NM_0_csv', 'ACAD1_01_15_2015_NM_0_csv',
       'ACAD1_01_18_2015_NM_0_csv', 'ACAD1_01_21_2015_NM_0_csv',
       'ACAD1_01_24_2015_NM_0_csv', 'ACAD1_01_27_2015_NM_0_csv',
       'ACAD1_01_30_2015_NM_0_csv',
       ...
       'YOSE1_07_02_2015_FB_0_csv', 'YOSEX_01_15_2015_FB_0_csv',
       'YOSEX_02_05_2015_FB_0_csv', 'YOSEX_05_21_2015_FB_0_csv',
       'YOSEX_06_11_2015_FB_0_csv', 'YOSEX_09_03_2015_FB_0_csv',
       'YOSEX_10_15_2015_FB_0_csv', 'YOSEX_11_05_2015_FB_0_csv',
       'ZICA1_08_13_2015_FB_0_csv', 'ZICA1_09_03_2015_FB_0_csv'],
      dtype='object', length=19939)

In [23]:
# df_spectra[df_spectra.index == [df_train_test_split[df_train_test_split.usage == "calibration"].site]]

ValueError: Arrays were different lengths: 19939 vs 1