In [None]:
import calendar
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pkg_resources
from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import types

### Package requirements for reproducibility

In [None]:
def get_imports():
    """Yield the top-level package name behind every import found in ``globals()``.

    Modules are reported by the first component of their ``__name__``.
    Classes, functions and other callables (e.g. objects brought in with
    ``from package import thing``) are reported by the first component of
    their ``__module__``.  Globals that are neither (plain data such as
    numbers, strings, lists or DataFrames) carry no import information and
    are skipped instead of being reported under their variable name.

    Import names that differ from the distribution name are translated
    (``PIL`` -> ``Pillow``, ``sklearn`` -> ``scikit-learn``).

    Yields:
        str: one package name per matching global; duplicates are possible.
    """
    # Import name -> distribution name, for packages where the two differ.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot so that globals created while the generator is
    # suspended cannot invalidate the iteration.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            name = val.__name__
        elif callable(val):
            # Covers classes as well as functions imported by name.
            name = getattr(val, "__module__", None)
        else:
            continue

        # Some callables expose no usable module name; nothing to report.
        if not isinstance(name, str) or not name:
            continue

        name = name.split(".")[0]
        yield poorly_named_packages.get(name, name)
# Deduplicate the candidate package names produced by get_imports().
imports = list(set(get_imports()))

# Pair every installed distribution whose project name is among the detected
# imports with its installed version; pip itself is left out.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name in imports and dist.project_name != "pip"
]

# Print one pip-style pin per requirement.
for project, version in requirements:
    print("{}=={}".format(project, version))

### Define the initial dataset you'll be working on

In [None]:
# Monthly satellite sheets, listed chronologically (April -> July).
satellite_sheets = ['April_satellite', 'May_sat', 'June_sat', 'July_sat']

# Passing a list of sheet names returns a dict {sheet name: DataFrame}.
satellite_frames = pd.read_excel('Data.xlsx', sheet_name=satellite_sheets)
SatelliteApril, SatelliteMay, SatelliteJune, SatelliteJuly = (
    satellite_frames[sheet] for sheet in satellite_sheets
)

# Per-month frames kept as a list; the PCA cell iterates over it.
SD = [SatelliteApril, SatelliteMay, SatelliteJune, SatelliteJuly]

# All monthly columns side by side, aligned on the row index.
SatelliteData = pd.concat(SD, axis=1, sort=False)

# Dough measurements: only the 'W' and 'P/L' columns are read.
Dough = pd.read_excel('Data.xlsx', sheet_name='Dough', usecols=['W', 'P/L'])

### Let's build a first regression between reflectance data and the features we are examining

In [None]:
# R^2 of a simple linear regression of every dough property on every
# satellite column; one output row per satellite column.
r2_rows = []
row_labels = []
for column in SatelliteData:
    # The last character of the column name is read as the month number and
    # the first four characters as the index label.
    row_labels.append((calendar.month_abbr[int(column[-1])], column[:4]))
    # Element 2 of the linregress result is the correlation coefficient r.
    r2_rows.append([
        linregress(SatelliteData[column], Dough[target])[2] ** 2
        for target in Dough
    ])

Regression_df = pd.DataFrame(
    r2_rows,
    index=pd.MultiIndex.from_tuples(row_labels, names=('Month', 'Index')),
    columns=Dough.columns,
)

### Let us evaluate whether regressing on the principal components of the indices gives a better inference

In [None]:
# For several explained-variance targets: standardise each month's indices,
# project them onto the principal components needed to reach the target, and
# regress every dough property on every component score.
scaler = StandardScaler()

# Fractions of the total variance that the retained components must explain.
explained_variance = [.95,.99,.995,.999]

PCA_v = []  # one R^2 table per explained-variance target

PCA_f = []  # one component-coefficient table per (target, month)

for ev in explained_variance:

    PCA_l = []             # component scores, one frame per month
    component_labels = []  # (month, 'PCA<k>') per score column, in PCA_l order

    for sd in SD:

        # Drop every column that contains a missing value before scaling.
        sd = sd.dropna(axis='columns')

        # The last character of the first column name is read as the month number.
        month = calendar.month_abbr[int(sd.columns[0][-1])]

        scaler.fit(sd)
        scaledSatellite = pd.DataFrame(scaler.transform(sd))

        # A float n_components asks PCA for as many components as are needed
        # to explain at least that fraction of the variance.
        pca = PCA(ev)
        pca.fit(scaledSatellite)
        PCA_Satellite = pd.DataFrame(pca.transform(scaledSatellite))

        # Build the row labels from the integer component numbers, before the
        # suffix is attached.  Taking the first character of the suffixed
        # column name would map components 10, 11, ... onto 'PCA1'.
        component_labels.extend(
            (month, 'PCA' + str(k)) for k in PCA_Satellite.columns
        )

        # Score columns become '<component><last 7 chars of the first column name>'.
        PCA_Satellite = PCA_Satellite.add_suffix(str(sd.columns[0][-7:]))
        PCA_l.append(PCA_Satellite)

        # One row per retained component: its coefficients on the scaled
        # columns plus the share of variance it explains.
        PCA_stat = pd.DataFrame(pca.components_)
        PCA_stat['Explained_variance'] = pca.explained_variance_ratio_
        PCA_stat['month'] = month
        PCA_stat['explained_variance'] = ev
        PCA_f.append(PCA_stat)

    # NOTE: PCA_df is overwritten on every pass; after the loop it holds the
    # scores for the last explained-variance target only.
    PCA_df = pd.concat(PCA_l,axis=1,sort=False)

    # R^2 of each dough property against each component score
    # (element 2 of the linregress result is the correlation coefficient r).
    r2_rows = [
        [linregress(PCA_df[ib],Dough[c])[2]**2 for c in Dough]
        for ib in PCA_df
    ]
    Regression_PCA = pd.DataFrame(
        r2_rows,
        index=pd.MultiIndex.from_tuples(component_labels,names=('Month','Index')),
        columns=Dough.columns,
    )
    Regression_PCA['explained_variance'] = ev
    Regression_PCA = Regression_PCA.set_index('explained_variance', append=True)
    PCA_v.append(Regression_PCA)

# Stack the per-target tables and sort on the first index level (month).
PCA_variance = pd.concat(PCA_v).sort_index(level=0)

PCA_components = pd.concat(PCA_f).set_index(['month','explained_variance'])
PCA_components = PCA_components.sort_index(level=0)
# NOTE(review): the selection hard-codes coefficient columns 0-6, i.e. it
# assumes seven index columns per month — confirm against the data.
PCA_components = PCA_components[list(range(7))+['Explained_variance']]

### One can isolate the most promising components on the basis of the $R^2$ value

In [None]:
# R^2 of each dough property against every principal-component score,
# indexed by month, component and explained-variance target.
PCA_variance

### And finally draw the resulting scatter plots

In [None]:
# Select the score columns named '1_Mese_5' and '1_Mese_6' (component number 1
# with the month-5 and month-6 suffixes).
# NOTE: PCA_df is the frame left over from the last iteration of the
# explained-variance loop, i.e. the one computed for the final target in the list.
Plots = PCA_df[['1_Mese_5','1_Mese_6']]

In [None]:
# One scatter plot per (dough property, selected component) pair; the legend
# shows the R^2 of the corresponding simple linear regression.
for c in Dough:
    for c1 in Plots:
        # np.round replaces the alias np.round_, which was removed in NumPy 2.0.
        r_squared = np.round(linregress(Plots[c1],Dough[c])[2]**2,2)
        plt.scatter(Plots[c1],Dough[c],s=1,label='$R^2$ ='+str(r_squared))
        plt.xlabel('Principal_Component_value')
        plt.ylabel('Deformation energy, W $10^{-4}$ J' if c == 'W' else 'Curve configuration ratio, P/L')
        # The last character of the column name is read as the month number.
        plt.title(calendar.month_abbr[int(c1[-1])])
        plt.legend()
        plt.show()

In [None]:
# Coefficients (columns 0-6) and explained-variance ratio of every retained
# component, indexed by month and explained-variance target.
PCA_components

In [None]:
# Display the June satellite table as read from the 'June_sat' sheet.
SatelliteJune