# Project-2 Scoring Function

# Name: Akhilender Kaukuntla
Net-ID: AXK220104

In [1]:
def project_2_scoring(input_df, model_path, encoders_path):
    """Score SBA loan records with a saved H2O model and a fitted encoder.

    Parameters
    ----------
    input_df : str, path-like or pandas.DataFrame
        Path to a CSV file holding the loan records, or an already loaded
        DataFrame (it is copied, the caller's frame is never modified).
        The data must contain an ``index`` column, the categorical columns
        ``City, State, Bank, BankState, RevLineCr, LowDoc``, the numeric
        columns ``Zip, NAICS, NoEmp, NewExist, CreateJob, RetainedJob,
        FranchiseCode, UrbanRural`` and ``DisbursementGross, GrAppv,
        SBA_Appv``.
    model_path : str
        Location of the saved H2O model, passed to ``h2o.load_model``.
    encoders_path : str
        Location of the joblib-pickled encoder applied to the categorical
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per input record with the columns ``index``,
        ``class_probability_1`` (the model's ``predict`` output) and
        ``class_probability_0`` (``1 - predict``).

    Notes
    -----
    * The group means and the state count are computed from the batch being
      scored, so the same record can receive different feature values
      depending on which other records are scored with it.
    * A per-column null count is printed after imputation.
    """
    # Imports are kept local so the function stays self-contained.
    # category_encoders is not referenced directly; presumably the pickled
    # encoder is a category_encoders object, so importing it here surfaces a
    # missing dependency before unpickling -- confirm against the artifact.
    import category_encoders as ce  # noqa: F401
    import h2o
    import joblib
    import numpy as np
    import pandas as pd

    cat_cols = ['City', 'State', 'Bank', 'BankState', 'RevLineCr', 'LowDoc']
    num_cols = ['Zip', 'NAICS', 'NoEmp', 'NewExist', 'CreateJob',
                'RetainedJob', 'FranchiseCode', 'UrbanRural']

    h2o.init()
    h2o_model = h2o.load_model(model_path)

    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only point encoders_path at artifacts you trust.
    encoder = joblib.load(encoders_path)

    # Accept either a CSV path (original behaviour) or a DataFrame.
    if isinstance(input_df, pd.DataFrame):
        loans = input_df.copy()
    else:
        loans = pd.read_csv(input_df)

    # Impute missing values: a sentinel label for categoricals, 0 for numerics.
    loans[cat_cols] = loans[cat_cols].fillna("missing")
    loans[num_cols] = loans[num_cols].fillna(0)
    print(loans.isnull().sum())

    # Capture the raw 'Y' flags BEFORE encoding. The encoded *_trg columns no
    # longer hold the raw strings (assuming the encoder emits numeric values),
    # so comparing them with 'Y' would always yield 0.
    rev_line_cr_is_yes = (loans['RevLineCr'] == 'Y').astype(int)
    low_doc_is_yes = (loans['LowDoc'] == 'Y').astype(int)

    # Replace the raw categorical columns with their encoded '<name>_trg' versions.
    encoded = encoder.transform(loans[cat_cols]).add_suffix('_trg')
    loans = pd.concat([loans.drop(cat_cols, axis=1), encoded], axis=1)

    # ---- Engineered features (created in the same order as before) ----
    # 1. Interaction feature
    loans['Noemp_createJob'] = loans['NoEmp'] * loans['CreateJob']
    # 2. Logarithmic transformation (+1 so that zero maps to 0 instead of -inf)
    loans['LogGrAppv'] = np.log(loans['GrAppv'] + 1)
    # 3. Ratio feature
    # NOTE(review): rows with GrAppv == 0 produce inf/NaN here -- confirm how
    # the training pipeline treated them before changing this.
    loans['SBA_Appv_to_GrAppv_Ratio'] = loans['SBA_Appv'] / loans['GrAppv']
    # 4. Mean 'DisbursementGross' per encoded city value (batch statistic)
    mean_disbursement_by_city = loans.groupby('City_trg')['DisbursementGross'].mean()
    loans['MeanDisbursementByCity'] = loans['City_trg'].map(mean_disbursement_by_city)
    # 5. Combined feature
    loans['Emp_Job_Combined'] = loans['NoEmp'] + loans['CreateJob']
    # 6. Difference feature
    loans['GrAppv_minus_SBA_Appv'] = loans['GrAppv'] - loans['SBA_Appv']
    # 7. Squared features
    for column in ['NoEmp', 'CreateJob', 'DisbursementGross', 'GrAppv']:
        loans[f'{column}_squared'] = loans[column] ** 2
    # 8. Binary indicators derived from the raw 'RevLineCr' / 'LowDoc' values
    loans['RevLineCr_Indicator'] = rev_line_cr_is_yes
    loans['LowDoc_Indicator'] = low_doc_is_yes
    # 9. Mean 'GrAppv' per NAICS code (batch statistic)
    mean_grappv_by_naics = loans.groupby('NAICS')['GrAppv'].mean()
    loans['MeanGrAppvByNAICS'] = loans['NAICS'].map(mean_grappv_by_naics)
    # 10. Number of rows sharing the same encoded state value (batch statistic)
    state_count = loans['State_trg'].value_counts()
    loans['State_Count'] = loans['State_trg'].map(state_count)

    # ---- Scoring ----
    X_h2o = h2o.H2OFrame(loans)
    predicted = h2o_model.predict(X_h2o)
    # The 'predict' column is used as the class-1 probability; class 0 is its
    # complement.
    predicted['class_probability_0'] = 1 - predicted['predict']

    # Attach the record identifier and give the columns their final names.
    predict_class = X_h2o['index'].cbind(predicted)
    predict_class = predict_class.set_names(
        ['index', 'class_probability_1', 'class_probability_0']
    )
    return predict_class.as_data_frame()


    

# Scoring Function on SBA_loans_project_2.csv

In [2]:
# Score the full project data set; the returned frame is the cell's output.
project_2_scoring(
    input_df='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/SBA_loans_project_2.csv',
    model_path='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/Akhilender_Kaukuntla_AXK220104_Deliverables_Project2/Artifacts/Grid_GBM_Key_Frame__upload_91dc93645f3a6cef7b0cca725aeb4788.hex_model_python_1702255793028_5_model_51',
    encoders_path='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/Akhilender_Kaukuntla_AXK220104_Deliverables_Project2/Artifacts/encoder.pkl',
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,1 hour 22 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,4 months and 15 days
H2O_cluster_name:,H2O_from_python_akhilenderk_7ptbr2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,935 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


index                0
City                 0
State                0
Zip                  0
Bank                 0
BankState            0
NAICS                0
NoEmp                0
NewExist             0
CreateJob            0
RetainedJob          0
FranchiseCode        0
UrbanRural           0
RevLineCr            0
LowDoc               0
DisbursementGross    0
BalanceGross         0
GrAppv               0
SBA_Appv             0
MIS_Status           0
dtype: int64


  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,index,class_probability_1,class_probability_0
0,0,0.167026,0.832974
1,1,0.311623,0.688377
2,2,0.092169,0.907831
3,3,0.260068,0.739932
4,4,0.391468,0.608532
...,...,...,...
800250,800250,0.027149,0.972851
800251,800251,0.112420,0.887580
800252,800252,0.224912,0.775088
800253,800253,0.180215,0.819785


# Scoring Function on SBA_loans_project_2_holdout_students_valid.csv

In [5]:
# Score the holdout validation file; the returned frame is the cell's output.
project_2_scoring(
    input_df='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/SBA_loans_project_2_holdout_students_valid.csv',
    model_path='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/Akhilender_Kaukuntla_AXK220104_Deliverables_Project2/Artifacts/Grid_GBM_Key_Frame__upload_91dc93645f3a6cef7b0cca725aeb4788.hex_model_python_1702255793028_5_model_51',
    encoders_path='/Users/akhilenderk/Desktop/Applied_Machine_learning/Projects/Project-2/Akhilender_Kaukuntla_AXK220104_Deliverables_Project2/Artifacts/encoder.pkl',
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,1 hour 26 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,4 months and 15 days
H2O_cluster_name:,H2O_from_python_akhilenderk_7ptbr2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,781 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


index                0
City                 0
State                0
Zip                  0
Bank                 0
BankState            0
NAICS                0
NoEmp                0
NewExist             0
CreateJob            0
RetainedJob          0
FranchiseCode        0
UrbanRural           0
RevLineCr            0
LowDoc               0
DisbursementGross    0
BalanceGross         0
GrAppv               0
SBA_Appv             0
dtype: int64


  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,index,class_probability_1,class_probability_0
0,0,0.035884,0.964116
1,1,0.157841,0.842159
2,2,0.022246,0.977754
3,3,0.067282,0.932718
4,4,0.184813,0.815187
...,...,...,...
98904,98904,0.131777,0.868223
98905,98905,0.399848,0.600152
98906,98906,0.070868,0.929132
98907,98907,0.086452,0.913548
