In [2]:
import pandas as pd

# Load the data
file_path = 'Loan 2.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
print(data.head())


  Customer ID               Name Gender  Age  Income (USD) Income Stability  \
0     C-36995   Frederica Shealy      F   56       1933.05              Low   
1     C-33999  America Calderone      M   32       4952.91              Low   
2      C-3770      Rosetta Verne      F   65        988.19             High   
3     C-26480         Zoe Chitty      F   65           NaN             High   
4     C-23459       Afton Venema      F   31       2614.77              Low   

  Profession     Type of Employment    Location  Loan Amount Request (USD)  \
0    Working            Sales staff  Semi-Urban                   72809.58   
1    Working                    NaN  Semi-Urban                   46837.47   
2  Pensioner                    NaN  Semi-Urban                   45593.04   
3  Pensioner                    NaN       Rural                   80057.92   
4    Working  High skill tech staff  Semi-Urban                  113858.89   

   ...  Credit Score No. of Defaults Has Active Credit C

In [3]:
# Get basic information about the data
print(data.info())

# Get statistical summary of the data
print(data.describe())

# Check for null values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer ID                  30000 non-null  object 
 1   Name                         30000 non-null  object 
 2   Gender                       29947 non-null  object 
 3   Age                          30000 non-null  int64  
 4   Income (USD)                 25424 non-null  float64
 5   Income Stability             28317 non-null  object 
 6   Profession                   30000 non-null  object 
 7   Type of Employment           22730 non-null  object 
 8   Location                     30000 non-null  object 
 9   Loan Amount Request (USD)    30000 non-null  float64
 10  Current Loan Expenses (USD)  29828 non-null  float64
 11  Expense Type 1               30000 non-null  object 
 12  Expense Type 2               30000 non-null  object 
 13  Dependents      

In [4]:
# Fill null values for numerical columns with median
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    data[column].fillna(data[column].median(), inplace=True)

# Fill null values for categorical columns with mode
for column in data.select_dtypes(include=['object']).columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# Verify null values are handled
print(data.isnull().sum())


Customer ID                    0
Name                           0
Gender                         0
Age                            0
Income (USD)                   0
Income Stability               0
Profession                     0
Type of Employment             0
Location                       0
Loan Amount Request (USD)      0
Current Loan Expenses (USD)    0
Expense Type 1                 0
Expense Type 2                 0
Dependents                     0
Credit Score                   0
No. of Defaults                0
Has Active Credit Card         0
Property ID                    0
Property Age                   0
Property Type                  0
Property Location              0
Co-Applicant                   0
Property Price                 0
Loan Sanction Amount (USD)     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [5]:
# Function to remove outliers using IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Apply the function to each numerical column
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    data = remove_outliers_iqr(data, column)

# Display the data after removing outliers
print(data.describe())


                Age  Income (USD)  Loan Amount Request (USD)  \
count  16968.000000  16968.000000               16968.000000   
mean      39.127416   2162.684262               76320.368761   
std       15.688570    697.519946               44747.602671   
min       18.000000    377.700000                6108.050000   
25%       24.000000   1670.475000               39116.837500   
50%       39.000000   2222.435000               68607.700000   
75%       53.000000   2515.915000              105365.145000   
max       65.000000   4378.180000              228220.460000   

       Current Loan Expenses (USD)    Dependents  Credit Score  \
count                 16968.000000  16968.000000  16968.000000   
mean                    365.290167      2.206742    736.771743   
std                     165.428449      0.853187     67.289045   
min                      33.760000      1.000000    580.850000   
25%                     236.337500      2.000000    684.887500   
50%                     352

In [8]:
# Fill null values for numerical columns with median
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    data[column].fillna(data[column].median(), inplace=True)

# Fill null values for categorical columns with mode
for column in data.select_dtypes(include=['object']).columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# Verify null values are handled
null_values_after = data.isnull().sum()
null_values_after


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Customer ID                    0
Name                           0
Gender                         0
Age                            0
Income (USD)                   0
Income Stability               0
Profession                     0
Type of Employment             0
Location                       0
Loan Amount Request (USD)      0
Current Loan Expenses (USD)    0
Expense Type 1                 0
Expense Type 2                 0
Dependents                     0
Credit Score                   0
No. of Defaults                0
Has Active Credit Card         0
Property ID                    0
Property Age                   0
Property Type                  0
Property Location              0
Co-Applicant                   0
Property Price                 0
Loan Sanction Amount (USD)     0
dtype: int64

In [9]:
# Function to remove outliers using IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Apply the function to each numerical column
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    data = remove_outliers_iqr(data, column)

# Display the data after removing outliers
data_after_outliers = data.describe()
data_after_outliers


Unnamed: 0,Age,Income (USD),Loan Amount Request (USD),Current Loan Expenses (USD),Dependents,Credit Score,No. of Defaults,Property ID,Property Age,Property Type,Co-Applicant,Property Price,Loan Sanction Amount (USD)
count,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0,15890.0
mean,39.138011,2098.318277,72465.174687,351.725171,2.204468,735.915064,0.0,501.888483,2100.010986,2.45387,1.0,106426.180276,41604.798665
std,15.690879,638.253844,41416.205829,155.085601,0.856376,67.248515,0.0,287.161871,634.762799,1.119143,0.0,63767.584119,34232.305441
min,18.0,472.04,6108.05,33.76,1.0,580.85,0.0,1.0,472.04,1.0,1.0,-999.0,-999.0
25%,24.0,1642.635,38449.5225,229.9475,2.0,683.925,0.0,252.0,1647.6225,1.0,1.0,55208.45,14886.685
50%,39.0,2220.215,66282.645,342.47,2.0,739.82,0.0,502.0,2223.25,2.0,1.0,94256.52,35209.395
75%,53.0,2443.1275,99716.7,449.9475,3.0,787.2125,0.0,750.75,2437.7925,3.0,1.0,146937.7825,63991.0975
max,65.0,3626.62,195086.78,787.63,4.0,890.02,0.0,999.0,3626.62,4.0,1.0,287329.98,137638.12


In [10]:
# 'data' is our processed DataFrame
output_file_path = 'processed_loan_data.csv'

# Save the DataFrame to a CSV file
data.to_csv(output_file_path, index=False)

print(f"Processed data has been saved to {output_file_path}")


Processed data has been saved to processed_loan_data.csv
