In [None]:
import pandas as pd
import os

# Input configuration: location of the Excel workbook and the cut-off date
# that the date-based filters later in the notebook compare against.
path = '#input file path#'
dateLimit = '2021-10-01'

# Load the 'Sheet1' worksheet of the Excel workbook (.xlsx).
# The ExcelFile handle is opened in a `with` block so the underlying file is
# closed as soon as the sheet has been parsed, instead of staying open for the
# lifetime of the kernel. `df` stays bound to the (closed) ExcelFile object.
with pd.ExcelFile(path) as df:
    df_sheet1 = pd.read_excel(df, sheet_name='Sheet1')

# Baseline figures, for comparison with the filtered datasets produced later.
print('Unique investment (Original): ',df_sheet1['InvestmentCode'].nunique())
print('Shape of the dataset (Original): ',df_sheet1.shape)

In [None]:
# Keep only the investments for which at least one row carries a MilestoneName;
# investments whose MilestoneName is empty on every row are dropped entirely.
# Rows without an InvestmentCode are dropped as well.
# NOTE(review): any non-null MilestoneName qualifies here, not specifically
# 'Physical Completion' -- confirm that is the only milestone present in the data.
codes_with_milestone = df_sheet1.loc[
    df_sheet1['MilestoneName'].notnull(), 'InvestmentCode'
].dropna()
df_WithMilestone = df_sheet1[df_sheet1['InvestmentCode'].isin(codes_with_milestone)]
del codes_with_milestone

print(
    'Unique investment (Keep the ones with Milestones): ',
    df_WithMilestone['InvestmentCode'].nunique(),
)
print(
    'Shape of the dataset (Keep the ones with Milestones): ',
    df_WithMilestone.shape,
)

In [None]:
# Keep only the investments that have at least one row whose milestone date is
# earlier than dateLimit.
# The milestone date of a row is the first one available, checked in this order:
# (1) ActualDate, (2) ForecastDate, (3) TargetDate.
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the
# column assignments performed further down in this cell.
pd.options.mode.chained_assignment = None

def CompletedMilestone(df, limit=None):
    """Return the rows of ``df`` whose effective milestone date is before ``limit``.

    The effective date of a row is the first non-null value among
    ``Milestone_Core_ActualDate``, ``Milestone_Core_ForecastDate`` and
    ``Milestone_Core_TargetDate``, in that order of preference.

    Side effect: the effective date is written to ``df`` itself as a ``date``
    column (created or overwritten). The calling cell relies on this and drops
    the column again afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three ``Milestone_Core_*Date`` columns.
    limit : optional
        Exclusive upper bound for the effective date. When omitted, the
        notebook-level ``dateLimit`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``date < limit``. Rows that have no date in any
        of the three columns never satisfy the comparison and are left out.
    """
    if limit is None:
        limit = dateLimit

    # Vectorised coalesce. The previous row-by-row `df['date'][i] = ...` was a
    # chained assignment, which pandas does not guarantee to write through to
    # `df` (and never does under Copy-on-Write), besides being slow.
    df['date'] = (
        df['Milestone_Core_ActualDate']
        .fillna(df['Milestone_Core_ForecastDate'])
        .fillna(df['Milestone_Core_TargetDate'])
    )
    return df[df['date'] < limit]

# Investment codes owning at least one row whose effective milestone date is
# before dateLimit. The call also adds the helper 'date' column to
# df_WithMilestone.
valid_codes = CompletedMilestone(df_WithMilestone)['InvestmentCode']

# Keep every row of those investments, then discard the helper 'date' column.
# NOTE(review): an investment is kept as soon as ANY of its rows is before the
# limit, even if other rows carry later dates -- confirm this matches the intent
# of removing investments with a milestone on or after the limit.
keep_mask = df_WithMilestone['InvestmentCode'].isin(valid_codes)
df_NoFutureMilestone = df_WithMilestone[keep_mask].drop(columns=['date'])

# Release the temporaries so they do not linger in the kernel namespace.
del valid_codes, keep_mask

print(
    'Unique investment (Keep the ones that do not have milestone in the future): ',
    df_NoFutureMilestone['InvestmentCode'].nunique(),
)
print(
    'Shape of the dataset (Keep the ones that do not have milestone in the future): ',
    df_NoFutureMilestone.shape,
)

In [None]:
# Remove individual rows that
# - have neither a forecast nor an actual date, or
# - have a scenario forecast in the future.
# This is a row-level filter, so no groupby per InvestmentCode is needed.


def CompletedScenarioLastMonth(df, limit=None):
    """Return the rows of ``df`` that have a milestone date and no future scenario.

    A row is kept when both conditions hold:

    * at least one of ``Milestone_Core_ForecastDate`` and
      ``Milestone_Core_ActualDate`` is non-null, and
    * ``Scenario__LastMonth`` is on or before ``limit``.

    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three columns named above.
    limit : optional
        Inclusive upper bound for ``Scenario__LastMonth``. When omitted, the
        notebook-level ``dateLimit`` is used (previously a hard-coded copy of
        the same value).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` satisfying both conditions. Rows with a missing
        ``Scenario__LastMonth`` never satisfy the comparison and are left out.
    """
    if limit is None:
        limit = dateLimit

    # The original computed this mask's selection but discarded the result, so
    # rows lacking both dates were never removed; the mask is now applied.
    has_milestone_date = ~(
        df['Milestone_Core_ForecastDate'].isnull()
        & df['Milestone_Core_ActualDate'].isnull()
    )
    not_in_future = df['Scenario__LastMonth'] <= limit
    return df[has_milestone_date & not_in_future]

# Apply the row-level scenario filter to the milestone-filtered dataset and
# show the resulting frame.
df_NoFutureForecast = CompletedScenarioLastMonth(df_NoFutureMilestone)
print(df_NoFutureForecast)

# Summary figures after this filtering step.
print(
    'Unique investment (Keep the ones that do not have forecast in the future): ',
    df_NoFutureForecast['InvestmentCode'].nunique(),
)
print(
    'Shape of the dataset (Keep the ones that do not have forecast in the future): ',
    df_NoFutureForecast.shape,
)

In [None]:

# Write the filtered dataset (after Step 3, Version 2) next to the input
# workbook: the input path without its extension, plus the suffix below.
operation = '_no Physical Completition2.xlsx'
root_ext = os.path.splitext(path)
outfile = root_ext[0] + operation
# `encoding` is intentionally not passed: to_excel never used it for .xlsx
# output, and pandas >= 2.0 removed the keyword, so passing it raises TypeError.
df_NoFutureForecast.to_excel(outfile, index=False)