# Predicting Serious Delinquency in 2 years - Test Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

In [2]:
# Load the test CSV and discard its 'Unnamed: 0' column
# (presumably the row index written out when the file was saved - verify against the raw file).
test_csv_path = r"/Users/emmanuel/Documents/Portfolio Projects/GiveMeSomeCredit/cs-test.csv"
test = pd.read_csv(test_csv_path).drop(columns=['Unnamed: 0'])

# Preview the first rows
test.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


## Data Cleaning & Feature Engineering

#### Age

In [3]:
# Bounds of the age range treated as valid (inclusive on both ends)
valid_age_min = 18
valid_age_max = 100

# Rows whose age falls below the minimum or above the maximum
age_out_of_range = test['age'].lt(valid_age_min) | test['age'].gt(valid_age_max)

# 0/1 indicator of those rows.
# Note: this column is not among the features selected for prediction later in the notebook.
test['Age_Outlier'] = age_out_of_range.astype(int)

# Blank out the out-of-range ages so the imputation below fills them
test.loc[age_out_of_range, 'age'] = np.nan

# Fill the blanks with the median of the remaining ages in this test file
age_median = test['age'].median()
test['age_imputed'] = test['age'].fillna(age_median)

#### Monthly Income

In [4]:
# Cleaning Monthly Income outliers
# Threshold = 95th percentile of MonthlyIncome among rows with income above 50,000
high_income = test.loc[test['MonthlyIncome'] > 50000]
p95_hi = high_income['MonthlyIncome'].quantile(0.95)

# 0/1 flag for incomes above that threshold (rows with a missing income are never flagged)
is_extreme_income = test['MonthlyIncome'] > p95_hi
test['ExtremeIncomeFlag'] = is_extreme_income.astype('int64')

# Replace only the flagged incomes with the mean of MonthlyIncome.
# The mean is taken over every non-missing income, flagged rows included.
# Rows whose income is missing are passed through unchanged, so they stay NaN in the new column.
# NOTE(review): the column name suggests missing values are imputed too - confirm this matches
# the preprocessing the model was trained with.
impute_value = test['MonthlyIncome'].mean()
test['Missing_MonthlyIncome_Imputed'] = np.where(is_extreme_income, impute_value, test['MonthlyIncome'])

#### Revolving Utilisation of Unsecured Lines

In [5]:
# Utilisation values above 2 are treated as extreme and replaced by the mean
# of the values that are at most 2.
revolving_utilization = test['RevolvingUtilizationOfUnsecuredLines']
mean_2 = revolving_utilization[revolving_utilization <= 2].mean()

test['RevolvingUtilization_cleaned'] = np.where(revolving_utilization > 2, mean_2, revolving_utilization)

#### Debt Ratio

In [6]:
# Outlier handling: any debt ratio above the cap is set to the cap (5)
debt_ratio_cap = 5
test['DebtRatio_cleaned'] = test['DebtRatio'].clip(upper=debt_ratio_cap)

#### Number Of Times 90 Days Late Distribution

In [7]:
# Cap values at 6 (the upper bound actually passed to clip), overwriting the column in place.
# This is the same cap applied to the 30-59 and 60-89 days past-due counts in this notebook.
# NOTE(review): this comment previously said 10 - confirm that 6 is the cap used when the model was trained.
test['NumberOfTimes90DaysLate'] = test['NumberOfTimes90DaysLate'].clip(upper=6)

#### Number Of Time 60-89 Days Past Due Not Worse Distribution

In [8]:
# Limit the 60-89 days past-due count to at most 6; the column is overwritten in place
col_60_89 = 'NumberOfTime60-89DaysPastDueNotWorse'
test[col_60_89] = test[col_60_89].clip(upper=6)

#### NumberOfTime30-59DaysPastDueNotWorse

In [9]:
# Limit the 30-59 days past-due count to at most 6; the column is overwritten in place
col_30_59 = 'NumberOfTime30-59DaysPastDueNotWorse'
test[col_30_59] = test[col_30_59].clip(upper=6)

#### Number of Open Credit Lines and Loans

In [10]:
# Counts of open credit lines and loans above 30 are set to 30; the result goes to a new column
open_credit_lines_cap = 30
test['NumberOfOpenCreditLinesAndLoans_cleaned'] = test['NumberOfOpenCreditLinesAndLoans'].clip(upper=open_credit_lines_cap)

#### Number Real Estate Loans Or Lines Distribution

In [11]:
# Counts of real-estate loans or lines above 5 are set to 5; the result goes to a new column
real_estate_loans_cap = 5
test['NumberRealEstateLoansOrLines_cleaned'] = test['NumberRealEstateLoansOrLines'].clip(upper=real_estate_loans_cap)

#### Number Of Dependents Distribution

In [12]:
# Dependents counts above 5 are set to 5; the result goes to a new column.
# NOTE(review): clip only bounds values - any missing NumberOfDependents stays NaN here,
# despite the "Missing" in the column name. Confirm this matches the training preprocessing.
dependents_cap = 5
test['NumberOfDependentsMissing_cleaned'] = test['NumberOfDependents'].clip(upper=dependents_cap)

#### Total Delinquencies 

In [13]:
# Combined delinquency score: row-wise sum of the three past-due counters.
# All three columns were capped in the cells above, so the capped values are what get summed.
past_due_30_59 = test['NumberOfTime30-59DaysPastDueNotWorse']
past_due_60_89 = test['NumberOfTime60-89DaysPastDueNotWorse']
late_90_days = test['NumberOfTimes90DaysLate']

test['TotalDelinquencies'] = past_due_30_59.add(past_due_60_89).add(late_90_days)

#### Dependents Per Income

In [14]:
# Dependents divided by (income + 1); the +1 keeps a zero income from causing a division by zero.
# Both inputs are the raw columns (not the cleaned/capped versions), so a missing value in
# either one yields NaN for that row.
dependents = test['NumberOfDependents']
income_plus_one = test['MonthlyIncome'].add(1)
test['DependentsPerIncome'] = dependents.div(income_plus_one)

In [15]:
# Inspect column dtypes and non-null counts after the cleaning and feature-engineering cells above
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101503 entries, 0 to 101502
Data columns (total 22 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   SeriousDlqin2yrs                         0 non-null       float64
 1   RevolvingUtilizationOfUnsecuredLines     101503 non-null  float64
 2   age                                      101500 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse     101503 non-null  int64  
 4   DebtRatio                                101503 non-null  float64
 5   MonthlyIncome                            81400 non-null   float64
 6   NumberOfOpenCreditLinesAndLoans          101503 non-null  int64  
 7   NumberOfTimes90DaysLate                  101503 non-null  int64  
 8   NumberRealEstateLoansOrLines             101503 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse     101503 non-null  int64  
 10  NumberOfDependents              

In [16]:
# Final features selected for prediction on Test Data.
# SeriousDlqin2yrs is kept here and dropped just before prediction.
# NOTE(review): presumably these columns (and their order) must match what the model was
# trained on - verify against the training notebook.
feature_columns = [
    'SeriousDlqin2yrs', 'age_imputed', 'Missing_MonthlyIncome_Imputed', 'RevolvingUtilization_cleaned',
    'DebtRatio_cleaned', 'NumberOfOpenCreditLinesAndLoans_cleaned', 'NumberRealEstateLoansOrLines_cleaned',
    'NumberOfDependentsMissing_cleaned', 'DependentsPerIncome', 'TotalDelinquencies',
]

# .copy() makes the selection an independent DataFrame. Without it pandas marks the result as a
# possible copy of the original frame, and adding a column to it later in the notebook
# (the "Id" column) can raise SettingWithCopyWarning.
test = test[feature_columns].copy()

### Get Predictions

In [17]:
# Directory holding the project artefacts, and the pickled model inside it
path = '/Users/emmanuel/Documents/Portfolio Projects/GiveMeSomeCredit/'
model_file = path + 'xgb_random.pickle'

# Deserialize the fitted model used for predictions.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open(model_file, 'rb') as pickle_handle:
    model = pickle.load(pickle_handle)

In [21]:
# Add an Id column equal to the row index plus one
# (1..N when the index is the default 0..N-1 range, as the info() output above shows).
test["Id"] = test.index + 1

# Feature frame: everything except the SeriousDlqin2yrs column; Id is still included at this point
X_features = test.drop("SeriousDlqin2yrs", axis=1)

In [23]:
# The Id column is only an identifier, so remove it from the model input
# (errors="ignore" makes this a no-op if Id is absent)
X_no_id = X_features.drop(columns=["Id"], errors="ignore")

# Predict per-class probabilities and keep the second column
# (presumably the probability of the positive class, SeriousDlqin2yrs = 1 - verify against model.classes_)
class_probabilities = model.predict_proba(X_no_id)
y_prob = class_probabilities[:, 1]

In [26]:
# Two-column submission frame: the Id of each test row alongside its predicted probability
submission = X_features[["Id"]].assign(Probability=y_prob)

# Write the submission file without the DataFrame index
submission_path = "/Users/emmanuel/Documents/Portfolio Projects/GiveMeSomeCredit/submission.csv"
submission.to_csv(submission_path, index=False)

In [27]:
# Quick sanity summary of the submission: row counts, duplicate Ids, missing probabilities
summary_checks = [
    ("Rows in test:", len(X_features)),
    ("Rows in submission:", len(submission)),
    ("Duplicate Ids:", submission["Id"].duplicated().sum()),
    ("Missing Probabilities:", submission["Probability"].isna().sum()),
]
for label, value in summary_checks:
    print(label, value)

Rows in test: 101503
Rows in submission: 101503
Duplicate Ids: 0
Missing Probabilities: 0


In [30]:
# Structure: exactly the two expected columns, in this order
expected_columns = ["Id", "Probability"]
assert submission.columns.tolist() == expected_columns, "Columns must be exactly ['Id','Probability']"

# Row count: one submission row per test row
n_submission_rows = len(submission)
n_test_rows = len(test)
assert n_submission_rows == n_test_rows, f"Row count mismatch: sub={len(submission)} vs test={len(test)}"

# Ids: no repeats, smallest is 1 and largest equals the number of rows
submission_ids = submission["Id"]
assert submission_ids.is_unique, "Duplicate Ids found"
assert submission_ids.min() == 1 and submission_ids.max() == len(submission), "Id must be 1..N sequential"

# Probabilities: none missing, all inside the closed interval [0, 1]
probabilities = submission["Probability"]
assert probabilities.isna().sum() == 0, "Missing probabilities detected"
assert probabilities.between(0, 1).all(), "Probabilities must be within [0,1]"