In [17]:
import pyodbc
import pandas as pd 
import numpy as np 
import configparser
import traceback


#Display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [19]:
config = configparser.ConfigParser()
config_path = '../config.ini'

try:
    config.read(config_path)
    db_config = config['database']
    
    server = db_config['server']
    database = db_config['database']
    driver = db_config['driver']
    trusted_connection = db_config.getboolean('trusted_connection', fallback=False)
    
    conn_str_parts = [
        f"Driver={{{driver}}}",
        f"Server={{{server}}}",
        f"Database={{{database}}}"
    ]
    
    # ... (inside the try block for config parsing)
    trust_server_cert_str = db_config.get('trust_server_certificate', 'no') # Default to 'no'
    trust_server_certificate = configparser.ConfigParser.BOOLEAN_STATES.get(trust_server_cert_str.lower(), False)
    print(f"Trust Server Certificate: {trust_server_certificate} (from string '{trust_server_cert_str}')")

    if trust_server_certificate:
        conn_str_parts.append("TrustServerCertificate=yes")
    # ...
    # Debug print the final connection string (mask password if necessary)
    log_conn_str_parts = list(conn_str_parts)
    for i, part in enumerate(log_conn_str_parts):
        if part.lower().startswith("pwd="):
            log_conn_str_parts[i] = "PWD=********"
        print(f"Final Connection String Being Used (Password Masked): {';'.join(log_conn_str_parts)}")
    else:
        uid = db_config.get('uid')
        pwd = db_config.get('pwd')
        if not uid or not pwd:
            raise ValueError("UID and PWD must be provided in config.ini if Trusted_Connection = 'no'")
        conn_str_parts.append(f"UID={uid}")
        conn_str_parts.append(f"PWD={pwd}")
    
    if db_config.getboolean('trust_server_certificate', fallback=False):
        conn_str_parts.append("TrustedServerCertificate=yes")
    
    print(f"DEBUG: conn_str_parts before join: {conn_str_parts}")
    conn_str = ";".join(conn_str_parts)
    print("Connection string configured (password hidden if applicable).")

except Exception as e:
    print(f"Error loading configuration or Building connection string: {e}")
    conn_str = None


Trust Server Certificate: True (from string 'yes')
Final Connection String Being Used (Password Masked): Driver={ODBC Driver 18 for SQL Server};Server={192.168.100.30};Database={QuickPesaDB};TrustServerCertificate=yes
Final Connection String Being Used (Password Masked): Driver={ODBC Driver 18 for SQL Server};Server={192.168.100.30};Database={QuickPesaDB};TrustServerCertificate=yes
Final Connection String Being Used (Password Masked): Driver={ODBC Driver 18 for SQL Server};Server={192.168.100.30};Database={QuickPesaDB};TrustServerCertificate=yes
Final Connection String Being Used (Password Masked): Driver={ODBC Driver 18 for SQL Server};Server={192.168.100.30};Database={QuickPesaDB};TrustServerCertificate=yes
DEBUG: conn_str_parts before join: ['Driver={ODBC Driver 18 for SQL Server}', 'Server={192.168.100.30}', 'Database={QuickPesaDB}', 'TrustServerCertificate=yes', 'UID=sa', 'PWD=BLOOMberg411**', 'TrustedServerCertificate=yes']
Connection string configured (password hidden if applica

In [21]:
sql_query = ""
try:
    with open('../sql_queries/main_data_extraction.sql', 'r') as file:
        sql_query = file.read()
except FileNotFoundError:
    print("Error: SQL query file not found. Make sure 'main_data_extraction.sql' is in the 'sql_queries' directory.")
    sql_query = None
    
df = None
if conn_str and sql_query:
    try:
        conn = pyodbc.connect(conn_str)
        print("Database connection successful")
        
        df = pd.read_sql(sql_query, conn)
        conn.close()
        print(f"Data extracted successfully. Shape: {df.shape}")
        print(df.head())
    
    except pyodbc.Error as ec:
        sqlstate = ec.args[0]
        print(f"Database execution error: {sqlstate}")
        print(ec)
    except Exception as e:
        print(f"An error occurred during pd.read_sql or connection closing: {type(e).__name__} - {e}")
        traceback.print_exc()

else:
    if not conn_str:
        print("Database connection string not configured. Skipping data extraction.")
    if not sql_query:
        print("SQL query is empty or file not read. Skipping data extraction.")
        
if df is not None:
    try: 
        df.to_csv('../data/raw_loan_data.csv', index=False)
        print("Raw data saved to data/raw_loan_data.csv")
    except Exception as e:
        print(f"Error saving raw data: {e}")
else:
    print("DataFrame is None. Cannot save.")


Database connection successful


  df = pd.read_sql(sql_query, conn)


Data extracted successfully. Shape: (565165, 47)
   LoanID  LoanPrincipal  TotalRepayable LoanStatus        DisbursementDate  \
0  160733          200.0          226.33  Defaulted 2024-02-23 01:09:08.657   
1  160739          600.0          679.00       Paid 2024-02-23 05:33:27.657   
2  160742          400.0          452.67       Paid 2024-02-23 02:03:09.657   
3  160746          600.0          679.00  Defaulted 2024-02-23 03:41:32.657   
4  160749          600.0          679.00       Paid 2024-02-23 02:45:08.657   

                  DueDate  LoanTermActualDays  DaysDelayed  \
0 2024-03-01 01:09:08.657                   7          449   
1 2024-03-01 05:33:27.657                   7          -11   
2 2024-03-01 02:03:09.657                   7           -1   
3 2024-03-01 03:41:32.657                   7          449   
4 2024-03-01 02:45:08.657                   7            1   

          ApplicationDate  AmountRequested  LoanTermRequestedDays  \
0 2024-02-22 23:54:08.657         

In [25]:
if df is not None:
    print("Data Info:")
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nDescriptive Statistics:")
    print(df.describe(include='all'))
else:
    print("Dataframe is not loaded. Cannot perform inspection.")

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565165 entries, 0 to 565164
Data columns (total 47 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   LoanID                       565165 non-null  int64         
 1   LoanPrincipal                565165 non-null  float64       
 2   TotalRepayable               565165 non-null  float64       
 3   LoanStatus                   565165 non-null  object        
 4   DisbursementDate             565165 non-null  datetime64[ns]
 5   DueDate                      565165 non-null  datetime64[ns]
 6   LoanTermActualDays           565165 non-null  int64         
 7   DaysDelayed                  565165 non-null  int64         
 8   ApplicationDate              565165 non-null  datetime64[ns]
 9   AmountRequested              565165 non-null  float64       
 10  LoanTermRequestedDays        565165 non-null  int64         
 11  LoanPurpose    

In [None]:
if df is not None:
    # Convert 'Loan Status' to a binary column 'IsDefault' with 'Defaulted' as 1 and 'Paid' as 0
    df['IsDefault'] = df['LoanStatus'].apply(lambda x: 1 if x == 'Defaulted' else 0)
    print("\nValue counts for IsDefault:")
    print(df['IsDefault'].value_counts(normalize=True))


Value counts for IsDefault:
IsDefault
1    0.757243
0    0.242757
Name: proportion, dtype: float64


In [27]:
if df is not None:
    if 'AvgDepositLast6Months' in df.columns:
        if df['AvgDepositLast6Months'].isnull().any():
            df['AvgDepositLast6Months_ImputedFlag'] = df['AvgDepositLast6Months'].isnull().astype(int)
            df['AvgDepositLast6Months'].fillna(0, inplace=True)
            print(f"\nMissing 'AvgDepositLast6Months' after imputation: {df['AvgDepositLast6Months'].isnull().sum()}")
            print(f"Number of imputations for 'AvgDepositLast6Months': {df['AvgDepositLast6Months_ImputedFlag'].sum()}")
        else:
            print("\nNo missing values found in 'AvgDepositLast6Months'.")
    else:
        print("\nWarning: 'AvgDepositLast6Months' column not found in Dataframe.")
    


Missing 'AvgDepositLast6Months' after imputation: 0
Number of imputations for 'AvgDepositLast6Months': 466017


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['AvgDepositLast6Months'].fillna(0, inplace=True)


In [29]:
if df is not None:
    #Debt-To-Income Ratio(DTI)
    df['DTI'] = np.where(df['MonthlyIncome'] > 0, df['AmountRequested'] / df['MonthlyIncome'], np.nan)
    df['DTI'].fillna(df['DTI'].median(), inplace=True)
    
    #Loan Amount to Max ELigible Amount Ratio
    df['LoanToMaxEligibleRatio'] = np.where(df['MaxEligibleLoanAmount'] > 0, df['AmountRequested']/ df['MaxEligibleLoanAmount'], np.nan)
    df['LoanToMaxEligibleRatio'].fillna(df['LoanToMaxEligibleRatio'].median(), inplace=True)
    
    # Time since last default (in days, at time of application)
    # Convert date columns to datetime objects first
    df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'])
    df['PreviousLastDefaultDate'] = pd.to_datetime(df['PreviousLastDefaultDate'])
    df['DaysSinceLastDefault'] = (df['ApplicationDate'] - df['PreviousLastDefaultDate']).dt.days
    df['DaysSinceLastDefault'].fillna(365*10, inplace=True)
    
    #Customer Tenure (in days, at time of application)
    df['CustomerRegistrationDate'] = pd.to_datetime(df['CustomerRegistrationDate'])
    df['CustomerTenureDays'] = (df['ApplicationDate'] - df['CustomerRegistrationDate']).dt.days
    df['CustomerTenureDays'].fillna(0, inplace=True) #if registration date is missing
    
    print("\nEngineered features created. df.head():")
    print(df[['DTI', 'LoanToMaxEligibleRatio', 'DaysSinceLastDefault', 'CustomerTenureDays']].head())
    
    
    


Engineered features created. df.head():
        DTI  LoanToMaxEligibleRatio  DaysSinceLastDefault  CustomerTenureDays
0  0.028920                    0.20                 44144                -356
1  0.018875                    0.60                 45203                 557
2  0.023443                    0.40                 44667                 362
3  0.083641                    0.60                 43272                -132
4  0.003428                    0.24                 43806                -251


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DTI'].fillna(df['DTI'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LoanToMaxEligibleRatio'].fillna(df['LoanToMaxEligibleRatio'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

In [30]:
if df is not None:
    df.to_csv('../data/processed_loan_data.csv', index=False)
    print("Processed data saved to /processed_loan_data.csv")

Processed data saved to /processed_loan_data.csv
