In [None]:
from RDSDatabaseConnector import RDSDatabaseConnector
from Plotter import Plotter
from DataTransform import DataTransform
from DataFrameTransform import DataFrameTransform
from DataFrameInfo import DataFrameInfo
from functions import display_suggested_drops, column_drop
import pandas as pd
import numpy as np

from datetime import datetime

# Shared helper instances from the project modules, used throughout the script.
RDSD = RDSDatabaseConnector()
PL = Plotter()
DT = DataTransform()
DFT = DataFrameTransform()
DFI = DataFrameInfo()

print("BEGINNING OF PROGRAM\n")
print("ASSESSING")

# Load data.csv and keep a running list of the columns it contains.
datacsv_df = pd.read_csv("data.csv")
current_all_columns = list(datacsv_df.columns)

# Columns grouped by the dtype they are converted to in initial_general_cleanup.
list_of_to_categorical = [
    "term",
    "grade",
    "sub_grade",
    "employment_length",
    "home_ownership",
    "verification_status",
    "loan_status",
    "payment_plan",
    "purpose",
    "delinq_2yrs",
    "application_type",
]
list_of_to_boolean = ["policy_code"]
list_of_to_float = []
list_of_to_int = []
date_columns = [
    "issue_date",
    "earliest_credit_line",
    "last_payment_date",
    "next_payment_date",
    "last_credit_pull_date",
]

# Threshold handed to initial_general_cleanup for the null-column drop step.
acceptable_null_percentage: float = 80.0
print("ASSESSING COMPLETE")

def initial_general_cleanup(dataframe: pd.core.frame.DataFrame, acceptable_null_percentage):
    """Convert column dtypes, drop columns by null threshold, then impute.

    Steps, in order:

    1. Convert the module-level ``date_columns`` using the '%b-%Y' format
       (%b is the abbreviated month name: Jan/Feb/Mar etc).
    2. Apply the categorical, boolean, float and int conversions to the
       module-level ``list_of_to_*`` column lists.
    3. Hand the frame and threshold to ``display_suggested_drops`` and use the
       threshold it returns when calling ``column_drop``.
    4. Impute the remaining columns with ``DFT.impute``.

    args:
    dataframe (type: pd.core.frame.DataFrame)
    acceptable_null_percentage (type: float)

    returns:
    (dataframe, dropped_columns) - the processed dataframe and the dropped
    columns exactly as reported by ``column_drop``.
    """
    # Date parsing goes first, followed by the remaining dtype conversions.
    converted = DT.convert_dates_to_proper_format(dataframe, date_columns, format='%b-%Y')
    conversion_steps = (
        (DT.to_categorical, list_of_to_categorical),
        (DT.to_boolean, list_of_to_boolean),
        (DT.to_float, list_of_to_float),
        (DT.to_int, list_of_to_int),
    )
    for convert, target_columns in conversion_steps:
        converted = convert(converted, target_columns)

    # display_suggested_drops may hand back a different threshold; that value
    # is the one actually used for the drop.
    threshold = display_suggested_drops(converted, acceptable_null_percentage)
    reduced, dropped_columns = column_drop(converted, threshold)

    # Fill in whatever is still missing in the surviving columns.
    imputed = DFT.impute(reduced)
    return imputed, dropped_columns

# Run the general cleanup on the raw frame. The call returns the processed
# dataframe together with the columns that column_drop removed.
print("\n")
print("Perform cleanup, return dataframe with converted dtypes, no null values.")
clean_dataframe, dropped_columns = initial_general_cleanup(datacsv_df, acceptable_null_percentage)
print("CLEANUP COMPLETE")

# Keep current_all_columns in sync with the dataframe by removing every
# dropped column from it (list.remove raises ValueError if a name is absent).
print("\n")
print("Modify current_all_columns to reflect the removal of the previously stated columns from the dataframe")
for i in dropped_columns:
    current_all_columns.remove(i)
print("MODIFICATIONS COMPLETE")
### UP TO HERE YIELDS A DATAFRAME CALLED clean_dataframe.

###OPTIONAL CODE FOR SKEW CORRECTION BEGIN
'''#THE PART WHERE SKEW MAY BE IMPORTANT
#Check for skew in the dataframe
#Visualise skew in the dataframe
#Apply skew correction for each column if (on a per column basis) it is necessary
#PL.skew_correction(clean_dataframe)

#dataframe = apply_skew_correction(dataframe)
'''
###OPTIONAL CODE FOR SKEW CORRECTION END

def remove_columns_for_analysis(dataframe, columns):
    """Run ``DT.excess_symbol_removal`` and record what it reports as removed.

    args:
    dataframe - the frame handed to ``DT.excess_symbol_removal``
    columns - wrapped in a single-element list before being passed on

    returns:
    the dataframe returned by ``DT.excess_symbol_removal``

    Side effect: every item in the second value returned by
    ``DT.excess_symbol_removal`` is appended to the module-level
    ``dropped_columns`` list.
    """
    # NOTE(review): ``columns`` is nested inside another list here; presumably
    # excess_symbol_removal expects that shape - confirm against DataTransform.
    trimmed_dataframe, removed = DT.excess_symbol_removal(dataframe, [columns])
    dropped_columns.extend(removed)
    return trimmed_dataframe

# Column names handed to remove_columns_for_analysis, which forwards them to
# DT.excess_symbol_removal.
symbols = ['id', 'member_id', 'term', 'int_rate', 'grade', 'sub_grade', 'employment_length',
           'verification_status', 'issue_date', 'payment_plan', 'purpose', 'dti', 'delinq_2yrs', 'home_ownership',
           'earliest_credit_line', 'inq_last_6mths', 'open_accounts', 'total_accounts', 'last_payment_date',
           'last_credit_pull_date', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'annual_inc']

dataframe_for_MS4_T2_analysis = remove_columns_for_analysis(clean_dataframe, symbols)
print("SYMBOLS REMOVED")
### UP TO HERE YIELDS A DATAFRAME CALLED dataframe_for_MS4_T2_analysis.
print("\n")


def milestone_4_task_2(dataframe: pd.core.frame.DataFrame) -> tuple:
    """Milestone 4 Task 2

    The company wants to check what percentage of loans have been a loss to the company:

    Loans marked as Charged Off in the loan_status column represent a loss to the company.

    Calculate the percentage of charged off loans historically and the total amount that
    was paid towards these loans before being charged off.

    Prints the number of charged off loans and their share of all rows in
    ``dataframe``, then builds a separate frame holding only the charged off
    rows with an extra 'tot_amount_paid_before_chgd_off' column
    (total_payment + total_rec_int).

    args:
    dataframe (type: pd.core.frame.DataFrame) - must contain the columns
        'loan_status', 'total_payment' and 'total_rec_int'. It is only read,
        never modified.

    returns:
    (dataframe, charged_off_df) - the input dataframe as passed in, and a copy
    of its charged off rows including the new total column.
    """
    # Work only from the argument (not from module-level frames) so the
    # function runs on a fresh interpreter and on any frame passed in.
    is_charged_off = dataframe['loan_status'] == 'Charged Off'
    charged_off_count = int(is_charged_off.sum())
    total_loans = len(dataframe)
    # Guard against an empty frame so the percentage never divides by zero.
    charged_off_percentage = (charged_off_count / total_loans) * 100 if total_loans else 0.0

    print("Number of individual loans charged off:\n", charged_off_count)
    print("Charged off loans as a percentage of all loans:\n", charged_off_percentage, "%")

    # Copy the selection so that adding a column neither touches the caller's
    # frame nor triggers pandas' chained-assignment warning.
    charged_off_df = dataframe.loc[is_charged_off].copy()

    # The total amount that was paid towards these loans before being charged off.
    # - **total_payment**: Payments received to date for total amount funded
    # - **total_rec_int**: Interest received to date
    # NOTE(review): if total_payment already includes received interest this
    # sum counts interest twice - confirm against the data dictionary.
    charged_off_df['tot_amount_paid_before_chgd_off'] = (
        charged_off_df['total_payment'] + charged_off_df['total_rec_int']
    )
    return dataframe, charged_off_df
# Run Task 2 on the column-reduced frame. The first returned value is kept
# under the Task 3 name, the second holds the charged off rows.
dataframe_for_MS4_T3_analysis, charged_off_df = milestone_4_task_2(dataframe_for_MS4_T2_analysis)

# Report the sum of the 'tot_amount_paid_before_chgd_off' column over all charged off rows.
print("Total Amount Paid Off Before Being Charged Off:\n", charged_off_df['tot_amount_paid_before_chgd_off'].sum())

def milestone_4_task_3_step_1():
    """Calculate the loss in revenue these loans would have generated for the company if they had finished their term.

    Reads the module-level ``charged_off_df``, subtracts the summed
    'tot_amount_paid_before_chgd_off' column from the summed 'loan_amount'
    column, prints the difference and returns ``charged_off_df`` unchanged.

    NOTE(review): the figure printed is amount loaned minus amount paid; it
    does not project interest over the remaining term - confirm this matches
    what the task asks for.
    """
    amount_paid = charged_off_df['tot_amount_paid_before_chgd_off'].sum()
    amount_loaned = charged_off_df['loan_amount'].sum()
    total_loss = amount_loaned - amount_paid
    print(f"The loss of revenue due to the loans marked as 'Charged Off':\n {total_loss}")
    return charged_off_df
# Print the Task 3 step 1 figure; the function hands back charged_off_df itself.
charged_off_df = milestone_4_task_3_step_1()
'''Calculate the projected loss of the loans marked as Charged Off.
Calculate the loss in revenue these loans would have generated for the company if they had finished their term. 
Visualise the loss projected over the remaining term of these loans.
'''
def milestone_4_task_3_step_2():
    """Placeholder for Task 3 step 2; not implemented yet, returns None."""
    return None
# Console marker separating the Task 3 output from whatever runs next.
print("\nMILESTONE 4 TASK 3 ENDS HERE\n")


In [None]:
'''#MILESTONE 4 TASK 4
There are customers who are currently behind with their loan payments.
This subset of customers represent a risk to company revenue.


What percentage of all loans do the users in this bracket currently represent?
Calculate the total number of customers in this bracket and how much loss the company would
incur if their status was changed to Charged Off.

What is the projected loss of these loans if the customers were to finish the full loan term?


If customers late on payments converted to Charged Off, what percentage of total expected revenue
do these customers and the customers who have already defaulted on their loan represent?
'''

'''Step1
There are customers who are currently behind with their loan payments.
This subset of customers represent a risk to company revenue.

What percentage of all loans do the users in this bracket currently represent?
'''

###
# How do we programmatically define customers who are currently behind with their loan payments?
# - There will likely be a column that represents this.
# Identify that column:
# - Column candidates:
# I might not be able to use charged_off_df for this, so I must go back and
# identify which dataframe I can use instead.
# - Cannot use dataframe_for_MS4_T3_analysis because create_new_column and remove_non_charged_off_rows
# - have been used which alter the dataframe significantly...
#
# What percentage of all loans do the users in this bracket currently represent?
###

'''Step2
Calculate the total number of customers in this bracket and how much loss the company would
incur if their status was changed to Charged Off.
'''

###
###

'''Step3
What is the projected loss of these loans if the customers were to finish the full loan term?
'''

###
###

'''Step4
If customers late on payments converted to Charged Off, what percentage of total expected revenue
do these customers and the customers who have already defaulted on their loan represent?
'''

###
###