In [None]:
import calendar
from functools import lru_cache
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import sobol_seq
import types

### Package requirements for reproducibility

In [None]:
def get_imports():
    """Yield the pip-style package name behind every import in ``globals()``.

    Two kinds of global values are treated as imports:

    * modules (``import numpy as np``) -> root of the module's ``__name__``
    * classes (``from sklearn.decomposition import PCA``) -> root of the
      class's ``__module__``

    Every other global (plain variables, functions, instances) is skipped;
    yielding their variable names, as was done before, could accidentally
    match an unrelated installed distribution.

    Yields:
        str: the root package name, translated through
        ``poorly_named_packages`` when the import name differs from the
        distribution name. Names may repeat; callers de-duplicate.
    """
    # Import name -> distribution name, for packages where the two differ.
    # There is no systematic way to derive one from the other, so extend
    # this table by hand when needed.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: a caller that binds a new global while
    # consuming this generator would otherwise change the dict's size
    # mid-iteration and raise RuntimeError.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Keep only the root package ("matplotlib.pyplot" -> "matplotlib").
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            continue

        yield poorly_named_packages.get(name, name)
imports = list(set(get_imports()))

# Compare names in a canonical spelling (lower-case, '-' in place of '_' and
# '.'). pkg_resources normalises distribution names, so a module imported as
# `sobol_seq` is expected to be reported as "sobol-seq"; an exact string
# comparison silently leaves such packages out of the requirements list.
wanted = {name.lower().replace("_", "-").replace(".", "-") for name in imports}

# (distribution name, version) for every installed distribution that backs
# one of the imports; pip itself is never listed.
requirements = []
for m in pkg_resources.working_set:
    canonical = m.project_name.lower().replace("_", "-").replace(".", "-")
    if canonical in wanted and canonical != "pip":
        requirements.append((m.project_name, m.version))

# Print in requirements.txt format.
for r in requirements:
    print("{}=={}".format(*r))

#### Define the initial dataset you'll be working on

In [None]:
# Load the four monthly satellite sheets and two columns of the dough sheet
# from the same workbook. Rows labelled 12..23 are removed from each frame.
SatelliteJuly = pd.read_excel(
    'Data.xlsx', sheet_name='July_sat'
).drop(index=list(range(12, 24)))
SatelliteJune = pd.read_excel(
    'Data.xlsx', sheet_name='June_sat'
).drop(index=list(range(12, 24)))
SatelliteMay = pd.read_excel(
    'Data.xlsx', sheet_name='May_sat'
).drop(index=list(range(12, 24)))
SatelliteApril = pd.read_excel(
    'Data.xlsx', sheet_name='April_satellite'
).drop(index=list(range(12, 24)))
Dough = pd.read_excel(
    'Data.xlsx', sheet_name='Dough', usecols=[2, 3]
).drop(index=list(range(12, 24)))

In [None]:
# Monthly frames, April through July.
SD = [SatelliteApril, SatelliteMay, SatelliteJune, SatelliteJuly]

# Put the months side by side (aligned on the row labels) and discard every
# column that contains a missing value.
SatelliteData = (
    pd.concat(SD, axis='columns', sort=False)
    .dropna(axis='columns')
)
# Shift every row label up by one.
# NOTE(review): presumably to match a 1-based sample numbering - confirm.
SatelliteData.index += 1

In [None]:
# Dough measurements: the second spreadsheet row is the header, the first of
# the five columns read becomes the index, and rows labelled 13..24 are removed.
DoughDistributions = pd.read_excel(
    'alvrografosensit.xls',
    header=1,
    index_col=0,
    usecols=list(range(5)),
).drop(index=list(range(13, 25)))

### Generate a quasi-random (Sobol) sample from the distribution of the measured features

In [None]:
# Sobol quasi-random points generated with arguments (6, 1000), wrapped in a
# DataFrame so individual columns can be selected as seed[0], seed[1], ...
seed = pd.DataFrame(data=sobol_seq.i4_sobol_generate(6, 1000))

In [None]:
# Per-sample ('campione') minimum and range (max - min) of each dough feature.
dough_by_sample = DoughDistributions[['P','L','W']].groupby('campione')
feature_min = dough_by_sample.min()
feature_range = dough_by_sample.max() - feature_min

# For every sample, scale one Sobol column by that sample's feature range
# (truncated to int); the blocks for successive samples are stacked vertically.
P_dist = pd.concat([seed[0]*pm for pm in feature_range.P]).astype(int)
L_dist = pd.concat([seed[1]*pm for pm in feature_range.L]).astype(int)
# Bug fix: the W draws used to be scaled by the P range (copy-paste slip),
# so the simulated W values did not follow the measured W spread.
W_dist = pd.concat([seed[2]*pm for pm in feature_range.W]).astype(int)
Features_dist = pd.concat([P_dist,L_dist,W_dist],axis=1)

# Label every generated row with the sample it belongs to. The labels are
# taken from the grouped data (same order as the stacked blocks above)
# instead of hard-coding the ranges 1..12 and 25..48.
rng = [dd for dd in feature_range.index for se in range(len(seed))]

Features_dist.index = rng
Features_dist = Features_dist.rename(columns={0:'P',1:'L',2:'W'})
# Offset the scaled draws by the per-sample minimum so they fall inside
# [min, max] of the measured values (aligned on the sample label).
Features_dist = Features_dist + feature_min
Features_dist['P_L'] = Features_dist.P/Features_dist.L
# Regression targets: the last two columns, i.e. W and P_L.
Fd = Features_dist.iloc[:,-2:]

# Repeat each sample's satellite row once per generated target row so that
# predictors and targets line up row by row.
# NOTE(review): this rebinds SatelliteData; re-running this cell without
# rebuilding SatelliteData first operates on the already expanded frame.
SatelliteData=SatelliteData.reindex(Fd.index)

### Perform the random-forest regression

In [None]:
# 500 trees, trained on all available cores. The fixed random_state makes the
# bootstrap/feature sampling, and therefore every downstream prediction and
# plot, reproducible across kernel restarts.
regr = RandomForestRegressor(n_estimators=500,n_jobs=-1,random_state=0)

In [None]:
# Leave-one-sample-out evaluation: for each distinct sample label, train on
# all rows of the other samples and predict the targets of the held-out one.
sample_labels = np.unique(SatelliteData.index)

l_predictions = []
for row in sample_labels:
    X_train = SatelliteData.drop(row)
    y_train = Fd.drop(row)
    regr.fit(X_train, y_train)

    # Only the first row carrying this label is used as the query point,
    # reshaped to a single-row 2-D array as predict() expects.
    x_query = SatelliteData.loc[row].iloc[0].values.reshape(1, -1)
    l_predictions.append(regr.predict(x_query)[0])

# One prediction row per sample, with the same columns as the targets.
df_predictions = pd.DataFrame(l_predictions, columns=Fd.columns, index=sample_labels)

# Residual of every generated target row against its sample's prediction
# (aligned on the sample label).
distance = Fd - df_predictions

In [112]:
# Reshape the residuals so that each sample becomes one column: group by the
# sample label, renumber the rows inside each group from 0, and place the
# groups side by side.
residuals_by_sample = distance.groupby(by=distance.index)
column_labels = np.unique(SatelliteData.index)

distance_w = pd.concat(
    [grp.reset_index(drop=True) for _, grp in residuals_by_sample.W],
    axis=1,
)
distance_pl = pd.concat(
    [grp.reset_index(drop=True) for _, grp in residuals_by_sample.P_L],
    axis=1,
)
distance_w.columns = column_labels
distance_pl.columns = column_labels

In [120]:
# One box plot per target: distribution of the residuals for every sample,
# written to a PNG file and closed so that nothing is rendered inline.
for frame, title, filename in (
    (distance_w, 'W_distance_across_samples', 'W_distance_across_samples.png'),
    (distance_pl, 'P/L_distance_across_samples', 'P_L_distance_across_samples.png'),
):
    fig, ax = plt.subplots(figsize=(20, 10))
    frame.boxplot(ax=ax)
    ax.set_title(title)
    fig.savefig(filename)
    plt.close(fig)