In [1]:
import pandas as pd

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, countDistinct,sum
import pyspark.sql.functions as F

In [7]:
spark = SparkSession.builder.appName("LoanDataQuality").getOrCreate()

In [9]:
df_loans = spark.read.option('header','true').csv("LCloan_raw_data.csv")

In [11]:
mapping_file = pd.read_csv('new_loans_mapping.csv')

In [86]:
mapping_file.columns


Index(['column_name', 'Description', 'source_sheet', 'type_status',
       'column_type', 'is_required', 'actual_column_in_loans',
       'business_category'],
      dtype='object')

In [28]:
required_cols =mapping_file[mapping_file['is_required'] == True]

In [30]:
required_cols['business_category'].unique()

array(['credit_info', 'dates', 'loan_info', 'financial_metrics',
       'payment_info', 'other', 'employment_info', 'special_cases'],
      dtype=object)

### Mapping tests functions

In [89]:
from PySpark_testing import *

In [91]:
def map_test_functions(business_category, column_type):
    """
    mapping tetsing functions acoording each column.
    
    :param business_category: קטגוריה עסקית (למשל, 'dates', 'financial_metrics', 'loan_info', 'credit_info', 
                              'employment_info', 'special_cases', 'other')
    :param column_type: סוג העמודה (למשל, 'string', 'int', 'double')
    :return: רשימת שמות בדיקות רלוונטיות
    """
    tests = []
    
    # בדיקות כלליות שחלות על כל העמודות
    tests.append('check_nulls')
    tests.append('check_consistency')
    tests.append('dtype_consistency')
    tests.append('count_records')
    
    # בדיקות לפי סוג העמודה
    if column_type in ['int', 'double']:
        tests.append('validate_numeric_range')
        tests.append('validate_sum')
        tests.append('check_duplicates')
    elif column_type == 'string':
        tests.append('validate_allowed_values')
        tests.append('validate_values_length')
    
    # בדיקות שמבוססות על הקטגוריה העסקית
    if business_category == 'dates':
        tests.append('validate_date_format')
        tests.append('validate_date_range')
        tests.append('file_timeliness')
    elif business_category == 'financial_metrics':
        tests.append('moving_average')
    elif business_category == 'loan_info':
        tests.append('check_foreign_key')
    elif business_category == 'credit_info':
        # לדוגמה, ניתן להוסיף בדיקות ספציפיות לאיכות מידע בתחום האשראי
        tests.append('check_duplicates')
    elif business_category == 'special_cases':
        tests.append('duplicated_files_by_name_and_size')
        tests.append('find_duplicate_columns')
    
    # ניתן להוסיף עוד התאמות בהתאם לניסיון ולדרישות העסקיות
    
    # הסרת כפילויות (אם קיימות) והחזרת רשימה ייחודית
    return list(set(tests))



In [103]:

# דוגמה לשימוש:
# נניח שיש לך DataFrame בשם loans_mapping שמכיל את העמודות 'business_category' ו-'column_type'
mapping_file['test_function'] = mapping_file.apply(
    lambda row: map_test_functions(row['business_category'], row['column_type']),
    axis=1
)



In [105]:
mapping_file

Unnamed: 0,column_name,Description,source_sheet,type_status,column_type,is_required,actual_column_in_loans,business_category,test_function
0,amount requested,The total amount requested by the borrower,RejectStats,not_in_file,,False,,financial_metrics,"[count_records, check_nulls, dtype_consistency..."
1,application date,The date which the borrower applied,RejectStats,not_in_file,,False,,dates,"[count_records, check_nulls, validate_date_ran..."
2,loan title,The loan title provided by the borrower,RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key..."
3,risk_score,"For applications prior to November 5, 2013 the...",RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key..."
4,debt-to-income ratio,A ratio calculated using the borrower’s total ...,RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key..."
...,...,...,...,...,...,...,...,...,...
275,sec_app_num_rev_accts,Number of revolving accounts at time of appli...,BrowseNotes,found,int,True,sec_app_num_rev_accts,loan_info,"[validate_sum, count_records, check_duplicates..."
276,sec_app_chargeoff_within_12_mths,Number of charge-offs within last 12 months a...,BrowseNotes,found,int,True,sec_app_chargeoff_within_12_mths,loan_info,"[validate_sum, count_records, check_duplicates..."
277,sec_app_collections_12_mths_ex_med,Number of collections within last 12 months e...,BrowseNotes,found,int,True,sec_app_collections_12_mths_ex_med,loan_info,"[validate_sum, count_records, check_duplicates..."
278,sec_app_mths_since_last_major_derog,Months since most recent 90-day or worse rati...,BrowseNotes,found,int,True,sec_app_mths_since_last_major_derog,loan_info,"[validate_sum, count_records, check_duplicates..."


In [109]:
def map_test_types(business_category, column_type):
    tests = []
    
    # general tests (all columns)
    tests.append('stability')
    tests.append('completeness')
    
    #mapping according business category
    if business_category == 'funds':
        if column_type in ['int', 'double']:
            tests.append('uniqueness')
            tests.append('validity')
        else:
            tests.append('validity')
    
    elif business_category == 'dates':
        tests.append('date_format_validation')
        tests.append('timeliness')
    
    elif business_category == 'loan_info':
        tests.append('accuracy')
    
    elif business_category == 'credit_info':
        tests.append('credit_score_consistency')
    
    elif business_category == 'employment_info':
        tests.append('consistency')
    
    elif business_category == 'special_cases':
        tests.append('custom_validation')
    
    elif business_category == 'other':
        tests.append('basic_validation')
    
    
    # לדוגמה, אם העמודה היא מספרית (int/double) נוסיף בדיקות נוספות.
    if column_type in ['int', 'double']:
        tests.append('range_check')
    
    return tests


In [111]:
mapping_file['test_type'] = mapping_file.apply(
    lambda row: map_test_types(row['business_category'], row['column_type']),
    axis=1
)


In [113]:
mapping_file

Unnamed: 0,column_name,Description,source_sheet,type_status,column_type,is_required,actual_column_in_loans,business_category,test_function,test_type
0,amount requested,The total amount requested by the borrower,RejectStats,not_in_file,,False,,financial_metrics,"[count_records, check_nulls, dtype_consistency...","[stability, completeness]"
1,application date,The date which the borrower applied,RejectStats,not_in_file,,False,,dates,"[count_records, check_nulls, validate_date_ran...","[stability, completeness, date_format_validati..."
2,loan title,The loan title provided by the borrower,RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key...","[stability, completeness, accuracy]"
3,risk_score,"For applications prior to November 5, 2013 the...",RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key...","[stability, completeness, accuracy]"
4,debt-to-income ratio,A ratio calculated using the borrower’s total ...,RejectStats,not_in_file,,False,,loan_info,"[count_records, check_nulls, check_foreign_key...","[stability, completeness, accuracy]"
...,...,...,...,...,...,...,...,...,...,...
275,sec_app_num_rev_accts,Number of revolving accounts at time of appli...,BrowseNotes,found,int,True,sec_app_num_rev_accts,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
276,sec_app_chargeoff_within_12_mths,Number of charge-offs within last 12 months a...,BrowseNotes,found,int,True,sec_app_chargeoff_within_12_mths,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
277,sec_app_collections_12_mths_ex_med,Number of collections within last 12 months e...,BrowseNotes,found,int,True,sec_app_collections_12_mths_ex_med,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
278,sec_app_mths_since_last_major_derog,Months since most recent 90-day or worse rati...,BrowseNotes,found,int,True,sec_app_mths_since_last_major_derog,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"


In [117]:
#mapping_file.to_csv("full_mapping.csv", index=False, encoding='utf-8')


In [119]:
focused_mapping = mapping_file.loc[
    mapping_file['is_required'] == True, 
    ['column_name', 'column_type', 'business_category', 'test_function', 'test_type']
]

In [125]:
#focused_mapping.to_csv('focused_mapping.csv', index=False,encoding='utf-8')

In [127]:
focused_mapping

Unnamed: 0,column_name,column_type,business_category,test_function,test_type
9,acc_now_delinq,string,credit_info,"[count_records, validate_values_length, check_...","[stability, completeness, credit_score_consist..."
10,acc_open_past_24mths,int,dates,"[validate_sum, count_records, check_duplicates...","[stability, completeness, date_format_validati..."
11,addr_state,string,loan_info,"[count_records, validate_values_length, check_...","[stability, completeness, accuracy]"
12,all_util,string,credit_info,"[count_records, validate_values_length, check_...","[stability, completeness, credit_score_consist..."
13,annual_inc,string,financial_metrics,"[count_records, validate_values_length, moving...","[stability, completeness]"
...,...,...,...,...,...
275,sec_app_num_rev_accts,int,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
276,sec_app_chargeoff_within_12_mths,int,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
277,sec_app_collections_12_mths_ex_med,int,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"
278,sec_app_mths_since_last_major_derog,int,loan_info,"[validate_sum, count_records, check_duplicates...","[stability, completeness, accuracy, range_check]"


In [167]:
filtered_df = focused_mapping[focused_mapping['test_function'].apply(lambda x: 'check_duplicates' in x)]
print(filtered_df['column_name'])


9                           acc_now_delinq
10                    acc_open_past_24mths
12                                all_util
17                          bc_open_to_buy
18                                 bc_util
                      ...                 
274                    sec_app_open_act_il
275                  sec_app_num_rev_accts
276       sec_app_chargeoff_within_12_mths
277     sec_app_collections_12_mths_ex_med
278    sec_app_mths_since_last_major_derog
Name: column_name, Length: 105, dtype: object


### After mapping tests, checking each test working

In [159]:
check_nulls(df_loans,'sec_app_num_rev_accts')
check_nulls(df_loans,'all_util')

(True, "Column 'all_util' contains 866341 null value(s).")

In [165]:
count_records(df_loans,58888)

(False,
 'Num of records in the file: 2260668Mismatch: the DatatFrame has 2260668 records, but 58888 were expected.')

In [None]:
check_duplicates(df_loans,['sec_app_chargeoff_within_12_mths','bc_open_to_buy'])

In [173]:
find_duplicate_columns(df_loans)

AssertionError: 

In [172]:
spark.stop()

In [None]:
check_consistency(df1, column_1, df2, column_2, key)

In [None]:
check_foreign_key(df_fk, column_fk, df_ref, ref_column)

In [None]:
dtype_consistency(df: DataFrame, column_name: str, expected_type: DataType) 

In [None]:
validate_numeric_range(df, column, min_value, max_value)

In [None]:
 validate_values_length(df,column_name, data_type)

In [None]:
validate_allowed_values(df: DataFrame, column_name: str, allowed_values: list)

In [None]:
validate_date_format(df, column, expected_format)

In [None]:
file_timeliness(file_path, hours_threshold=24)

In [None]:
 validate_date_range(df, column, duration, time_unit)

In [None]:
moving_average(df, column, partitioned_column,date_column, tolerance_deviation, window_size):