In [23]:
# import statements
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import sklearn
import glob
import random
from sklearn import decomposition

def train_test_splitting(xarray_file, n_train=9):
    """
    Split one model's ensemble members into training and testing sets.

    ``n_train`` ensemble members are drawn at random (without replacement)
    for training so that no single model is over-weighted. All remaining
    ensemble members are used for testing.

    Parameters
    ----------
    xarray_file : dataset-like
        Object exposing ``ensemble_member`` and ``to_array()``. The first
        variable returned by ``to_array()`` is indexed as
        ``[ensemble, trend_type, ...]`` where index 0 on the second axis is
        read as the natural trend and index 1 as the forced trend.
        NOTE(review): the axis meaning is taken from the variable names
        used here -- confirm against the file layout.
    n_train : int, optional
        Number of ensemble members used for training. Defaults to 9, the
        value that was previously hard-coded (the earlier docstring said 10,
        which did not match the code).

    Returns
    -------
    tuple
        ``(NatTrendsTrain, NatTrendsTest, ForTrendsTrain, ForTrendsTest)``
        as numpy arrays.

    Raises
    ------
    ValueError
        If the dataset has fewer than ``n_train`` ensemble members.
    """
    # Find number of ensembles and random indices of training versus testing ensembles
    Nensembles = len(xarray_file.ensemble_member)
    if n_train > Nensembles:
        raise ValueError(
            f"need at least {n_train} ensemble members for training, "
            f"found {Nensembles}"
        )
    ensemble_train_indices = random.sample(range(0, Nensembles), n_train)

    # Everything not drawn for training is a test member; built in ascending
    # order so the result does not depend on set iteration order.
    train_lookup = set(ensemble_train_indices)
    ensemble_test_indices = [i for i in range(Nensembles) if i not in train_lookup]

    # Select natural and forced trends as well as the training and testing data
    trend_data = xarray_file.to_array()[0]
    NatTrendsTrain = trend_data[ensemble_train_indices, 0].to_numpy()
    NatTrendsTest = trend_data[ensemble_test_indices, 0].to_numpy()
    ForTrendsTrain = trend_data[ensemble_train_indices, 1].to_numpy()
    ForTrendsTest = trend_data[ensemble_test_indices, 1].to_numpy()

    return (NatTrendsTrain, NatTrendsTest, ForTrendsTrain, ForTrendsTest)

def model_ensemble_reshaper(trends, grid_shape=(72, 144)):
    """
    Merge the ensemble and time-period axes of one model's trend maps.

    Takes a given model's testing or training data and collapses its first
    two axes into one, so that time periods from different ensemble members
    of that model are treated equally.

    Parameters
    ----------
    trends : array-like
        Array whose first two axes are merged and whose remaining elements
        form one map of shape ``grid_shape`` per sample.
    grid_shape : tuple of int, optional
        Shape of a single map. Defaults to ``(72, 144)``, the grid size that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(shape[0] * shape[1],) + grid_shape``.
    """
    n_first, n_second = np.shape(trends)[:2]
    reshaped_trends = np.reshape(trends, (n_first * n_second,) + tuple(grid_shape))
    return reshaped_trends

def predictor_reshaper(trends):
    """
    Flatten every trend map into a single vector of grid points.

    The first axis is kept as-is; the second and third axes are merged, so
    an input of shape (n, a, b) is returned with shape (n, a*b).
    """
    dims = np.shape(trends)
    points_per_map = dims[1] * dims[2]
    flattened = np.reshape(trends, (dims[0], points_per_map))
    return flattened

def tropical_mean_trend(trends, lat_slice=slice(24, 48)):
    """
    Average each trend map over a band of latitude rows.

    Parameters
    ----------
    trends : numpy.ndarray
        Array indexed as ``[sample, lat, lon]``.
    lat_slice : slice, optional
        Rows of the latitude axis to average over. Defaults to
        ``slice(24, 48)``, the rows that were previously hard-coded and that
        the original docstring describes as the 30S-30N region.
        NOTE(review): that correspondence depends on the latitude grid
        (presumably 72 rows at 2.5 degree spacing) -- confirm.

    Returns
    -------
    numpy.ndarray
        One mean value per sample (1-D, length ``trends.shape[0]``).

    Notes
    -----
    This is a plain arithmetic mean of the values passed in. If the maps
    have already been multiplied by latitude weights, the result is not
    divided by the sum of those weights.
    """
    band = trends[:, lat_slice, :]
    band_shape = np.shape(band)
    # Flatten lat/lon using the actual band size rather than a fixed 24*144.
    ReshapedTrends = np.reshape(band, (band_shape[0], band_shape[1] * band_shape[2]))
    TropicalAverageTrend = np.average(ReshapedTrends, axis=1)
    return TropicalAverageTrend

def eof_and_pc_finder(mat_structure_X_sample, full_matrices=True):
    """
    Compute EOFs, principal components and explained-variance fractions
    from the SVD of a (structure x sample) matrix.

    Open question (for Mu-Ting): should we be using the EOFs as returned
    here, or the amplitude-corrected EOFs (u @ S / sqrt(n_samples))?

    Parameters
    ----------
    mat_structure_X_sample : numpy.ndarray
        2-D matrix with one row per structure element (grid point) and one
        column per sample.
    full_matrices : bool, optional
        Passed to ``np.linalg.svd``. The default ``True`` keeps the original
        return shapes, which requires building the square
        ``n_structure x n_structure`` matrix ``u``. Pass ``False`` to get
        only the ``len(s)`` leading modes, which is much cheaper when
        ``n_structure`` is large.

    Returns
    -------
    eofs : numpy.ndarray
        Transposed left singular vectors; row ``i`` is the i-th EOF.
    pcs : numpy.ndarray
        Singular values times the rows of ``vh``. With
        ``full_matrices=True`` the array has shape
        ``(n_structure, n_sample)`` and rows beyond ``len(s)`` are zero;
        otherwise it has ``len(s)`` rows.
    eigenvalues_as_frac_of_cov : numpy.ndarray
        Squared singular values normalised so they sum to one.
    """
    u, s, vh = np.linalg.svd(mat_structure_X_sample, full_matrices=full_matrices)
    eofs = u.T
    n_modes = len(s)

    # get eigenvalues
    eigenvalues = np.multiply(s, s) / n_modes
    eigenvalues_as_frac_of_cov = eigenvalues / np.sum(eigenvalues)

    # Principal components: diag(s) @ vh, done as a row-wise scaling so no
    # dense (n_structure x n_sample) singular-value matrix has to be built.
    # The float64 cast keeps the product in double precision regardless of
    # the input dtype.
    scaled_rows = np.real(s[:, np.newaxis].astype(np.float64) * vh[:n_modes])
    if full_matrices:
        n_structure, n_sample = np.shape(mat_structure_X_sample)
        pcs = np.zeros((n_structure, n_sample))
        pcs[:n_modes] = scaled_rows
    else:
        pcs = scaled_rows

    return (eofs, pcs, eigenvalues_as_frac_of_cov)


# Driver: for every model file, split ensembles into train/test, weight the
# trend maps by cos(latitude), project them onto the leading EOFs of that
# model's training maps, and collect predictors/targets across all models.
path_to_data = '/home/disk/pna2/aodhan/SurfaceTrendLearning/*.nc'
ModelDataFiles = glob.glob(path_to_data)

# Accumulators filled across all model files.
# NOTE: the spelling `TraingPredictorData` is kept because it is a
# module-level name that other cells may reference.
TraingPredictorData = []
TrainingTargetData = []
TestingPredictorData = []
TestingTargetData = []
for datafile in ModelDataFiles:
    # Open the dataset in a context manager so the file handle is released
    # once the numpy arrays have been extracted.
    with xr.open_dataset(datafile) as xarray_file:
        # find training and testing data for natural and forced trends
        NatTrendsTrain, NatTrendsTest, ForTrendsTrain, ForTrendsTest = train_test_splitting(xarray_file)
        # cosine-of-latitude weights, used below to weight the predictors
        weights = np.cos(np.deg2rad(xarray_file.Lat.to_numpy()))

    # reshape trends so that trend maps from different time periods and ensembles are treated equal
    NatTrendsTrain = model_ensemble_reshaper(NatTrendsTrain)
    NatTrendsTest = model_ensemble_reshaper(NatTrendsTest)
    ForTrendsTrain = model_ensemble_reshaper(ForTrendsTrain)
    ForTrendsTest = model_ensemble_reshaper(ForTrendsTest)

    # weight trend maps by cosine of latitude (broadcast along the latitude axis)
    lat_weights = weights[np.newaxis, :, np.newaxis]
    NatTrendsTrain_weighted = np.multiply(NatTrendsTrain, lat_weights)
    NatTrendsTest_weighted = np.multiply(NatTrendsTest, lat_weights)
    ForTrendsTrain_weighted = np.multiply(ForTrendsTrain, lat_weights)
    ForTrendsTest_weighted = np.multiply(ForTrendsTest, lat_weights)

    # true trend maps are sum of natural and forced trends
    # find true trends for training data
    TrueTrendsTrain = NatTrendsTrain_weighted + ForTrendsTrain_weighted
    TrainShape = np.shape(TrueTrendsTrain)
    TrueTrendsTrain_sample_X_structure = np.reshape(TrueTrendsTrain, (TrainShape[0], TrainShape[1]*TrainShape[2]))
    TrueTrendsTrain_structure_X_sample = TrueTrendsTrain_sample_X_structure.T

    # find true trends for testing data
    TrueTrendsTest = NatTrendsTest_weighted + ForTrendsTest_weighted
    TestShape = np.shape(TrueTrendsTest)
    TrueTrendsTest_sample_X_structure = np.reshape(TrueTrendsTest, (TestShape[0], TestShape[1]*TestShape[2]))
    TrueTrendsTest_structure_X_sample = TrueTrendsTest_sample_X_structure.T

    # Get the principal components from the training data
    # NOTE(review): the EOF basis is recomputed for every model file, so the
    # features stacked below come from a different basis per model --
    # confirm this is intended.
    eofs, pcs, eigenvalues = eof_and_pc_finder(TrueTrendsTrain_structure_X_sample)
    N_important = 35 # chosen to match the amount of PCs needed to represent 80% of variability

    # ImportantPrincipleComponents holds the first N_important rows of pcs
    # (one column per training trend map)
    ImportantPrincipleComponents = pcs[:N_important]
    # project maps onto the leading eofs
    ImportantEOFS = eofs[:N_important]
    TrainingFeatures = ImportantEOFS@TrueTrendsTrain_structure_X_sample
    TestingFeatures = ImportantEOFS@TrueTrendsTest_structure_X_sample

    print(np.shape(TrainingFeatures), np.shape(TestingFeatures))

    # one feature vector per sample (rows = samples, columns = EOF coefficients)
    TrainingTrends_vectors = TrainingFeatures.T
    TestingTrends_vectors = TestingFeatures.T

    # find tropical mean trend value
    NatTrendsTrainTropicalMean = tropical_mean_trend(NatTrendsTrain_weighted)
    NatTrendsTestTropicalMean = tropical_mean_trend(NatTrendsTest_weighted)
    ForTrendsTrainTropicalMean = tropical_mean_trend(ForTrendsTrain_weighted)
    ForTrendsTestTropicalMean = tropical_mean_trend(ForTrendsTest_weighted)

    # Collect this model's samples; targets are [natural, forced] pairs.
    TraingPredictorData.extend(TrainingTrends_vectors)
    TrainingTargetData.extend(
        [nat, forced]
        for nat, forced in zip(NatTrendsTrainTropicalMean, ForTrendsTrainTropicalMean)
    )
    TestingPredictorData.extend(TestingTrends_vectors)
    TestingTargetData.extend(
        [nat, forced]
        for nat, forced in zip(NatTrendsTestTropicalMean, ForTrendsTestTropicalMean)
    )

TraingPredictorData = np.array(TraingPredictorData)
TrainingTargetData = np.array(TrainingTargetData)
TestingPredictorData = np.array(TestingPredictorData)
TestingTargetData = np.array(TestingTargetData)

(35, 270) (35, 930)
(35, 270) (35, 60)
(35, 270) (35, 60)


KeyboardInterrupt: 

## It may make sense to find PCs for just the natural variability, and then train the model on that.