In [None]:
import inspect
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Investment stages used to slice every dataset (one box / one summary per stage).
investmentStages = [
    "Initial",
    "Long Term Planning",
    "Short Term Planning",
    "Executing",
    "Completed",
]

# Substring that marks a dataset name as the training set (matched case-insensitively).
keywordTraining = 'training'
# Column-name suffixes selecting the duration or the cost variant of a column.
keywordDuration = '-Duration'
keywordCost = '-Total'

# Feature columns are summarized for every dataset; label columns are summarized
# in addition for the training dataset.
testingColumns = ['Feature-Duration', 'Feature-Total']
trainingColumns = [*testingColumns, 'Label-Duration', 'Label-Total']

In [None]:
# Prerequisites (complete both before running this cell):
# 1. Clean the dataset with the script 'Portfolio Timeline Testing - Data Cleaning' or
#    'Portfolio Timeline Training - Data Cleaning', depending on the dataset type.
# 2. Run the 'Create Cost and Duration columns' script on the cleaned output to
#    prepare the dataset used below.

# Display name -> path of the prepared Excel workbook produced by the two steps above.
trainingDataSet = "Training Data"
inputNamesAndFiles = {
    trainingDataSet :  r'#input file path#',
    "Testing Data 2019" :  r'#input file path#',
    "Testing Data 2020" :  r'#input file path#',
    "Testing Data 2021" :  r'#input file path#'
}

# Load the 'Sheet1' worksheet of every workbook into a dictionary keyed by display name.
data = {}
for inputNameAndFile, inputFile in inputNamesAndFiles.items():
    # The context manager closes the workbook as soon as the sheet has been read;
    # a bare pd.ExcelFile(...) would keep one file handle open per dataset.
    with pd.ExcelFile(inputFile) as excelFile:
        data[inputNameAndFile] = pd.read_excel(excelFile, 'Sheet1')

In [None]:
# Remove redundant rows for the same investment by keeping only its latest
# (last-listed) row in each dataset.
#
# drop_duplicates(keep='last') selects whole rows, exactly like taking iloc[-1]
# of each group, but without calling a Python function once per investment.
# (groupby(...).last() would NOT be equivalent: it takes the last non-null value
# of each column separately and can therefore mix values from different rows.)
#
# - dropna: rows without an InvestmentCode are discarded, as a groupby on that
#   column would discard them.
# - reset_index: each frame gets a fresh 0..n-1 index.
# - Row order follows the source file rather than being sorted by code; the
#   summaries, stage filters and box plots in this notebook do not depend on it.
for inputData in data:
    data[inputData] = (
        data[inputData]
        .dropna(subset=['InvestmentCode'])
        .drop_duplicates(subset=['InvestmentCode'], keep='last')
        .reset_index(drop=True)
    )

In [None]:
# Summary statistics per dataset: feature and label columns for the training
# dataset, feature columns only for every other (testing) dataset.
for datasetName, datasetFrame in data.items():
    print("=========== Data: " + datasetName + " ===========")
    # A dataset is treated as training data when its name contains the training keyword.
    isTrainingSet = keywordTraining in datasetName.lower()
    columnsToDescribe = trainingColumns if isTrainingSet else testingColumns
    print(datasetFrame[columnsToDescribe].describe())
    print()

In [None]:
# Partition every dataset by investment stage:
# dataPerStage[<dataset name>][<stage>] -> rows of that dataset whose
# 'Investment_Core_InvestmentStage' equals <stage>.
dataPerStage = {
    datasetName: {
        stageName: datasetFrame[datasetFrame['Investment_Core_InvestmentStage'] == stageName]
        for stageName in investmentStages
    }
    for datasetName, datasetFrame in data.items()
}

In [None]:
# Summary statistics for every (dataset, investment stage) combination.
for datasetName, framesByStage in dataPerStage.items():  # Training / Testing 2019 / Testing 2020 / Testing 2021
    # The training dataset is summarized on features and labels, the others on features only.
    if keywordTraining in datasetName.lower():
        columnsToDescribe = trainingColumns
    else:
        columnsToDescribe = testingColumns

    for stageName in investmentStages:
        print("=========== Data: " + datasetName + ", Stage: " + stageName + " ===========")
        print(framesByStage[stageName][columnsToDescribe].describe())
        print()

In [None]:
def DurationOrCost(displayDuration, dataPerStage, yUpperCap=0.25 * 10**8):
    """Draw per-stage box plots of duration or cost for every dataset and save them as a PDF.

    The first subplot shows the training labels; each following subplot shows the
    feature values of one dataset in ``dataPerStage``. Every subplot has one box
    per investment stage plus an 'Overall' box, and all subplots share one Y range.

    Besides its arguments, the function reads the notebook-level names ``data``,
    ``trainingDataSet``, ``investmentStages``, ``keywordDuration`` and ``keywordCost``.

    Parameters
    ----------
    displayDuration : bool
        Truthy plots the '-Duration' columns, falsy plots the '-Total' (cost)
        columns. Any truthy value is accepted (including ``numpy.bool_``).
    dataPerStage : dict
        ``{dataset name: {stage: DataFrame}}``. Must contain ``trainingDataSet``
        and, for each dataset, every stage listed in ``investmentStages``.
    yUpperCap : float, optional
        Upper bound applied to the shared Y axis so that extreme outliers do not
        flatten the boxes. Defaults to the previously hard-coded 0.25 * 10**8.

    Returns
    -------
    None
        The figure is written to 'Duration.pdf' or 'Total.pdf' (relative path).
    """
    # 1. Column suffix selecting either Duration or Cost.
    postfixKeyword = keywordDuration if displayDuration else keywordCost

    # 2. Shared Y range over all feature (and, where present, label) values.
    yLimit = _computeYLimit(postfixKeyword, yUpperCap)

    # 3. One subplot for the training labels plus one per dataset, instead of a
    #    fixed 5, so a different number of datasets no longer raises IndexError.
    #    Four inches per row keeps the original 10x20 figure for four datasets.
    rowCount = len(dataPerStage) + 1
    fig, axs = plt.subplots(rowCount, 1, figsize=(10, 4 * rowCount))
    # Matplotlib rejects NaN/Inf axis limits; keep autoscaling when no finite
    # value was found (e.g. every column is empty or all-NaN).
    if np.isfinite(yLimit).all():
        plt.setp(axs, ylim=yLimit)

    # 3a. Box plot of the training labels.
    _drawStageBoxPlot(axs[0],
                      dataPerStage[trainingDataSet],
                      data[trainingDataSet]['Label' + postfixKeyword],
                      'Label' + postfixKeyword,
                      trainingDataSet + '-Label' + postfixKeyword)

    # 3b. Box plots of the features, one subplot per dataset.
    for ax, inputData in zip(axs[1:], dataPerStage):
        _drawStageBoxPlot(ax,
                          dataPerStage[inputData],
                          data[inputData]['Feature' + postfixKeyword],
                          'Feature' + postfixKeyword,
                          inputData + '-Feature' + postfixKeyword)

    # 4. Save the charts as a PDF named after the suffix without its leading '-'.
    filename = postfixKeyword[1:] + '.pdf'
    fig.savefig(filename)


def _computeYLimit(postfixKeyword, yUpperCap):
    """Return ``(lowest, highest)`` over all Feature/Label columns in ``data``, capped at ``yUpperCap``."""
    # Start from -inf/+inf. The previous seed for the maximum, sys.float_info.min,
    # is the smallest *positive* float, so it silently won whenever every value
    # was zero or negative.
    lowest, highest = float('inf'), float('-inf')
    for frame in data.values():
        columns = ['Feature' + postfixKeyword]
        # Only the training data set has label values.
        if 'Label' + postfixKeyword in frame.columns:
            columns.append('Label' + postfixKeyword)
        for column in columns:
            columnMin, columnMax = frame[column].min(), frame[column].max()
            # min()/max() return NaN for an empty or all-NaN column; skip those
            # so NaN cannot leak into the axis limits.
            if pd.notna(columnMin):
                lowest = min(lowest, columnMin)
            if pd.notna(columnMax):
                highest = max(highest, columnMax)
    # Cap the upper limit so that most outliers are excluded from view.
    return lowest, min(highest, yUpperCap)


def _drawStageBoxPlot(ax, framesByStage, overallSeries, column, title):
    """Draw one box per investment stage plus an 'Overall' box for ``column`` on ``ax``."""
    # A fresh dict per subplot, so no series can carry over from a previous panel.
    seriesByBox = {stage: framesByStage[stage][column] for stage in investmentStages}
    seriesByBox['Overall'] = overallSeries
    pd.DataFrame(data=seriesByBox).boxplot(ax=ax)
    ax.set_title(title)


In [None]:
# Choose which quantity of the datasets to plot:
#   True  -> duration
#   False -> cost
displayDuration = True

DurationOrCost(displayDuration=displayDuration, dataPerStage=dataPerStage)

In [None]:
# Columns inspected when looking at unusually long feature durations in the training data.
interests = ['Scenario__FirstMonth', 'Scenario__LastMonth', 'Actuals_FirstMonth', 'Actuals_LastMonth',
             'Feature-Duration', 'Label-Duration', 'InvestmentCode', 'Investment_Core_InvestmentStage']
d = data[trainingDataSet][interests]

# Rows whose Feature-Duration exceeds this value are reported as outliers.
durationOutlierThreshold = 200

# `datetime_is_numeric` exists only in some pandas releases: it was removed in
# pandas 2.0, where passing it raises TypeError. Pass it only when the installed
# DataFrame.describe still accepts it.
describeOptions = {'include': 'all'}
if 'datetime_is_numeric' in inspect.signature(pd.DataFrame.describe).parameters:
    describeOptions['datetime_is_numeric'] = True

print('Outlier Analysis for Feature Duration')
print(d[d['Feature-Duration'] > durationOutlierThreshold].describe(**describeOptions))

print('Analysis for Feature Duration')
print(d.describe(**describeOptions))