In [2]:
import calendar
from functools import lru_cache
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import sobol_seq
import types

### Package requirements for reproducibility

In [3]:
def get_imports():
    """Yield the top-level package name behind each module or class in the notebook globals.

    Modules contribute the first component of their ``__name__``; classes
    contribute the first component of their ``__module__``. Import names
    listed in ``poorly_named_packages`` are translated to the name the
    package is distributed under. Every other global (plain variables,
    functions, data frames, ...) is skipped, because its variable name says
    nothing about which package it came from.

    The same package may be yielded several times; callers that need unique
    names should collect the result into a ``set``.
    """
    # Import names that differ from the name the package is distributed under.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: the generator is lazy, so new globals bound
    # while it is being consumed must not invalidate the iteration.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Not an import: do not report the variable's own name as a package.
            continue

        yield poorly_named_packages.get(name, name)
# Unique package names referenced by the notebook's globals.
imports = list(set(get_imports()))

# Installed distributions may spell their name differently from the import
# name (dash vs. underscore, letter case), so both sides are compared in a
# common form.
# NOTE(review): `sobol_seq` is imported but absent from the recorded output;
# presumably pkg_resources reports it as `sobol-seq` - confirm on a re-run.
wanted = {name.replace("_", "-").lower() for name in imports}

# (project name, version) for every installed distribution the notebook uses.
requirements = []
for dist in pkg_resources.working_set:
    key = dist.project_name.replace("_", "-").lower()
    if key in wanted and key != "pip":
        requirements.append((dist.project_name, dist.version))

# Print the pins in requirements.txt format.
for requirement in requirements:
    print("{}=={}".format(*requirement))

scikit-learn==0.21.2
pandas==0.23.4
numpy==1.11.3
matplotlib==2.2.2


#### Define the initial dataset you'll be working on

In [4]:
# Workbook that holds every sheet used in this notebook (path relative to the notebook).
DATA_FILE = 'Data.xlsx'

# Row labels discarded from every sheet right after loading.
# NOTE(review): presumably a second group of samples excluded from this study - confirm.
DROPPED_ROWS = list(range(12, 24))


def _load_sheet(workbook, sheet_name, usecols=None):
    """Read one sheet, drop the rows in ``DROPPED_ROWS`` and renumber the index from 0.

    Parameters
    ----------
    workbook : pandas.ExcelFile
        Already opened workbook to read from.
    sheet_name : str
        Name of the sheet to load.
    usecols : list of int, optional
        Positions of the columns to keep; ``None`` keeps every column.

    Returns
    -------
    pandas.DataFrame
    """
    return (pd.read_excel(workbook, sheet_name=sheet_name, usecols=usecols)
              .drop(DROPPED_ROWS)
              .reset_index(drop=True))


# Open the workbook once instead of re-parsing the file for every sheet.
with pd.ExcelFile(DATA_FILE) as workbook:
    SatelliteJuly = _load_sheet(workbook, 'July_sat')
    SatelliteJune = _load_sheet(workbook, 'June_sat')
    SatelliteMay = _load_sheet(workbook, 'May_sat')
    SatelliteApril = _load_sheet(workbook, 'April_satellite')
    Seed = _load_sheet(workbook, 'Chem_comp_wheat', usecols=[1, 2, 3, 4, 5, 6, 7, 8])
    Dough = _load_sheet(workbook, 'Dough', usecols=[2, 3])
    Bread = _load_sheet(workbook, 'Bread', usecols=[2, 3, 4, 5, 6, 9])

# Seed2 is a copy of Seed in which the four protein-fraction columns are
# multiplied by the fourth loaded column (position 3) and divided by 100.
# NOTE(review): presumably this turns percentages of total protein into
# absolute amounts - confirm that position 3 is the total-protein column.
protein_fractions = ['Prot sol acq', 'Prot nacl', 'Prot etoh', 'Prot ac ac']
Seed2 = Seed.copy()
Seed2[protein_fractions] = Seed[protein_fractions].mul(Seed.iloc[:, 3], axis=0) / 100

In [5]:
# Monthly satellite tables, ordered April through July.
SD = [
    SatelliteApril,
    SatelliteMay,
    SatelliteJune,
    SatelliteJuly,
]

# Put the months side by side, then discard every column that contains a
# missing value.
SatelliteData = pd.concat(SD, axis=1, sort=False)
SatelliteData = SatelliteData.dropna(axis=1)

In [6]:
# Stage labels and their tables, kept in matching order.
names = ['Seed', 'Dough', 'Bread']
db = [Seed, Dough, Bread]

# One (stage, column) pair per column, in the order the tables are concatenated.
Fc = [(stage, column) for stage, table in zip(names, db) for column in table]

# All target variables side by side under a two-level (Stage, Feature) header.
Features = pd.concat(db, axis=1)
Features.columns = pd.MultiIndex.from_tuples(Fc, names=('Stage', 'Feature'))

### Perform the Random-Forest regression

In [None]:
# 1000 two-dimensional Sobol points, scaled by (number of columns, number of
# rows) of SatelliteData and truncated to integers. Each row of `seed` is a
# (column position, row position) pair.
# NOTE(review): assumes the generated points lie in [0, 1) so the truncated
# values are valid positions - confirm against sobol_seq.
unit_points = sobol_seq.i4_sobol_generate(2, 1000)
table_extent = np.array([len(SatelliteData.columns), len(SatelliteData)])
seed = (unit_points * table_extent).astype(int)

In [None]:
# Random forest with 500 trees, trained on all available cores (n_jobs=-1).
# random_state is fixed so that a fresh "Restart & Run All" reproduces the
# same forests, and therefore the same importances, scores and predictions.
regr = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0)

In [None]:
# Per-draw results: feature importances, training R2 and the prediction for
# the held-out sample.
l_features = []
R2_list = []
l_predictions = []

for col_pos, row_pos in seed:
    # Each draw removes one predictor column and holds one sample out.
    # Build the matrices once and reuse them for fit, score and predict.
    X_reduced = SatelliteData.drop(SatelliteData.columns[col_pos], axis=1)
    X_train = X_reduced.drop(SatelliteData.index[row_pos], axis=0)
    Y_train = Features.drop(Features.index[row_pos], axis=0)

    regr.fit(X_train, Y_train)

    # Importances are labelled with the predictor columns that were kept.
    l_features.append(pd.Series(regr.feature_importances_, index=X_train.columns))

    # R2 is evaluated on the same rows the forest was fitted on (in-sample),
    # and stored under the position of the held-out sample.
    R2_list.append(pd.Series(regr.score(X_train, Y_train), index=[row_pos]))

    # Predict the held-out sample, passed as a single-row 2-D array.
    held_out = X_reduced.iloc[[row_pos]].values
    l_predictions.append(pd.DataFrame(regr.predict(held_out), index=[row_pos]))

# One row per draw, one column per predictor (NaN where it was removed).
df_features = pd.concat(l_features, axis=1, sort=True).T

df_predictions = pd.concat(l_predictions).sort_index()
df_predictions.columns = Features.columns

# Mean training R2 per held-out sample, rounded to two decimals.
R2_df = pd.concat(R2_list).sort_index()
R2_metrics = R2_df.groupby(R2_df.index).mean().round(2)

# Relative error of the mean held-out prediction against the measured value.
# Dividing by the magnitude keeps the error non-negative even if a measured
# value is negative.
mean_predictions = df_predictions.groupby(df_predictions.index).mean()
Err = (mean_predictions - Features).abs() / Features.abs()

# Same table formatted as percentage strings with one decimal.
Err2 = Err.applymap(lambda x: "{0:.1f}%".format(x*100))

In [47]:
# Save the relative-error table, rounded to two decimals, as LaTeX source.
with open('mytable.tex', 'w') as latex_file:
    rounded_errors = Err.round(2)
    latex_file.write(rounded_errors.to_latex())

### Increase the number of trees to 1,000

In [None]:
# Random forest with 1,000 trees, trained on all available cores (n_jobs=-1).
# random_state is fixed so that a fresh "Restart & Run All" reproduces the
# same forests.
regr2 = RandomForestRegressor(n_estimators=1_000, n_jobs=-1, random_state=0)

# Per-draw results. These names are reused, so this cell overwrites the
# results of the 500-tree run.
l_features = []
R2_list = []
l_predictions = []

for col_pos, row_pos in seed:
    # Each draw removes one predictor column and holds one sample out.
    # Build the matrices once and reuse them for fit, score and predict.
    X_reduced = SatelliteData.drop(SatelliteData.columns[col_pos], axis=1)
    X_train = X_reduced.drop(SatelliteData.index[row_pos], axis=0)
    Y_train = Features.drop(Features.index[row_pos], axis=0)

    regr2.fit(X_train, Y_train)

    # Importances are labelled with the predictor columns that were kept.
    l_features.append(pd.Series(regr2.feature_importances_, index=X_train.columns))

    # R2 is evaluated on the same rows the forest was fitted on (in-sample),
    # and stored under the position of the held-out sample.
    R2_list.append(pd.Series(regr2.score(X_train, Y_train), index=[row_pos]))

    # Predict the held-out sample, passed as a single-row 2-D array.
    held_out = X_reduced.iloc[[row_pos]].values
    l_predictions.append(pd.DataFrame(regr2.predict(held_out), index=[row_pos]))

# One row per draw, one column per predictor (NaN where it was removed).
df_features = pd.concat(l_features, axis=1, sort=True).T

df_predictions = pd.concat(l_predictions).sort_index()
df_predictions.columns = Features.columns

# Mean training R2 per held-out sample, rounded to two decimals.
R2_df = pd.concat(R2_list).sort_index()
R2_metrics = R2_df.groupby(R2_df.index).mean().round(2)

# Relative error of the mean held-out prediction against the measured value.
# Dividing by the magnitude keeps the error non-negative even if a measured
# value is negative.
mean_predictions = df_predictions.groupby(df_predictions.index).mean()
Err = (mean_predictions - Features).abs() / Features.abs()

# Same table formatted as percentage strings with one decimal.
Err2 = Err.applymap(lambda x: "{0:.1f}%".format(x*100))