# Cleaning Customers Data and writing in parquet format


#### 1. create a dataframe with proper datatypes 

In [0]:
customer_schema = 'member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string, zip_code string, country string, grade string, sub_grade string, verification_status string, tot_hi_cred_lim float, application_type string, annual_inc_joint float, verification_status_joint string'

In [0]:
customers_raw_df = spark.read \
.format("csv") \
.option("header",True) \
.schema(customer_schema) \
.load("/mnt/Lendingclub/Lendingclub/Lendingclub/raw/customers_data_csv")

In [0]:
display(customers_raw_df.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10+ years,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10+ years,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10+ years,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10+ years,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3 years,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4 years,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10+ years,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10+ years,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6 years,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10+ years,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,


In [0]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



#### 2. Rename a few columns

In [0]:
customer_df_renamed = customers_raw_df.withColumnRenamed("annual_inc", "annual_income") \
.withColumnRenamed("addr_state", "address_state") \
.withColumnRenamed("zip_code", "address_zipcode") \
.withColumnRenamed("country", "address_country") \
.withColumnRenamed("tot_hi_credit_lim", "total_high_credit_limit") \
.withColumnRenamed("annual_inc_joint", "join_annual_income")

In [0]:
display(customer_df_renamed.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10+ years,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10+ years,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10+ years,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10+ years,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3 years,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4 years,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10+ years,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10+ years,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6 years,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10+ years,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,


In [0]:
from pyspark.sql.functions import current_timestamp

#### 3. insert a new column named as ingestion date(current time)

In [0]:
customers_df_ingestd = customer_df_renamed.withColumn("ingest_date", current_timestamp())

In [0]:
display(customers_df_ingestd.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10+ years,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:35:59.520451Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10+ years,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:35:59.520451Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10+ years,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:35:59.520451Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10+ years,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:35:59.520451Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3 years,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:35:59.520451Z
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4 years,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:35:59.520451Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10+ years,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:35:59.520451Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10+ years,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:35:59.520451Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6 years,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:35:59.520451Z
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10+ years,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:35:59.520451Z


#### 4. Remove complete duplicate rows

In [0]:
customers_df_ingestd.count()

2260701

In [0]:
customers_distinct = customers_df_ingestd.distinct()

In [0]:
customers_distinct.count()

2260638

In [0]:
customers_distinct.createOrReplaceTempView("customers")

In [0]:
display(spark.sql("select * from customers").limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10+ years,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:36:17.11391Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10+ years,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:36:17.11391Z
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10+ years,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:36:17.11391Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10+ years,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:36:17.11391Z
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4 years,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:36:17.11391Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10+ years,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:36:17.11391Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6 years,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:36:17.11391Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10+ years,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:36:17.11391Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3 years,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:36:17.11391Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10+ years,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:36:17.11391Z


#### 5. Remove the rows where annual_income is null

In [0]:
display(spark.sql("select count(*) from customers where annual_income is null"))

count(1)
5


In [0]:
customers_income_filtered = spark.sql("select * from customers where annual_income is not null")

In [0]:
customers_income_filtered.createOrReplaceTempView("customers")

In [0]:
display(spark.sql("select count(*) from customers where annual_income is null"))

count(1)
0


### 6. convert emp_length to integer

In [0]:
display(spark.sql("select distinct(emp_length) from customers"))

emp_length
5 years
9 years
""
1 year
2 years
7 years
8 years
4 years
6 years
3 years


In [0]:
from pyspark.sql.functions import regexp_replace, col

In [0]:
customers_emplength_cleaned = customers_income_filtered.withColumn("emp_length", regexp_replace(col("emp_length"), r"\D+",""))

In [0]:
display(customers_emplength_cleaned.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:36:34.752724Z
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:36:34.752724Z
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:36:34.752724Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:36:34.752724Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:36:34.752724Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:36:34.752724Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:36:34.752724Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:36:34.752724Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:36:34.752724Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:36:34.752724Z


In [0]:
customers_emplength_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [0]:
customers_emplength_casted = customers_emplength_cleaned.withColumn("emp_length", customers_emplength_cleaned.emp_length.try_cast('int'))

In [0]:
display(customers_emplength_casted.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:36:36.337394Z
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:36:36.337394Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:36:36.337394Z
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:36:36.337394Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:36:36.337394Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:36:36.337394Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:36:36.337394Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:36:36.337394Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:36:36.337394Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:36:36.337394Z


In [0]:
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



#### 7. we need to replace all the nulls in emp_length column with average of this column

In [0]:
customers_emplength_casted.filter("emp_length is null").count()

146903

In [0]:
customers_emplength_casted.createOrReplaceTempView("customers")

In [0]:
avg_emp_length = spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers").collect()

In [0]:
display(avg_emp_length)

avg_emp_length
6


In [0]:
avg_emp_duration = avg_emp_length[0][0]

In [0]:
print(avg_emp_duration)

6


In [0]:
customers_emplength_replaced = customers_emplength_casted.na.fill(avg_emp_duration, subset=['emp_length'])

In [0]:
display(customers_emplength_replaced.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:37:00.012953Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:37:00.012953Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:37:00.012953Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:37:00.012953Z
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:37:00.012953Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:37:00.012953Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:37:00.012953Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:37:00.012953Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:37:00.012953Z
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:37:00.012953Z


In [0]:
customers_emplength_replaced.filter("emp_length is null").count()

0

#### 8. Clean the address_state(it should be 2 characters only),replace all others with NA

In [0]:
customers_emplength_replaced.createOrReplaceTempView("customers")

In [0]:
display(spark.sql("select distinct(address_state) from customers"))

address_state
SC
AZ
LA
MN
NJ
DC
OR
VA
RI
WY


In [0]:
display(spark.sql("select count(address_state) from customers where length(address_state)>2"))

count(address_state)
254


In [0]:
from pyspark.sql.functions import when, col, length

In [0]:
customers_state_cleaned = customers_emplength_replaced.withColumn(
    "address_state",
    when(length(col("address_state"))> 2, "NA").otherwise(col("address_state"))
)

In [0]:
display(customers_state_cleaned.limit(10))

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,join_annual_income,verification_status_joint,ingest_date
8aef4bb29d609d8d684fd709af965bd690fc9ef3a2691777f38e0c10d622fe97,Veterinary Tecnician,4,RENT,34000.0,GA,300xx,USA,C,C3,Source Verified,16900.0,Individual,,,2025-08-15T04:37:13.584556Z
f74e401c1ab0adf788a7279dd70a27459fcc47733d35eed71149ff371d2a7283,Contract Specialist,3,MORTGAGE,104433.0,PA,174xx,USA,F,F1,Source Verified,439570.0,Individual,,,2025-08-15T04:37:13.584556Z
b5e7938b0a2da4ceaa75ea2e3c111d1e33ca0f944f954b6d65671ef395d21713,Engineer,10,MORTGAGE,65000.0,SD,577xx,USA,C,C1,Not Verified,314017.0,Individual,,,2025-08-15T04:37:13.584556Z
1035c5401b0ca76d02c334b813302a9b72825e70b1659e8bc8ce5eea6c5d1f52,SERVICE MANAGER,6,RENT,85000.0,PA,160xx,USA,A,A2,Not Verified,61099.0,Individual,,,2025-08-15T04:37:13.584556Z
b24d55f21390533c512f78f822153dc247e984dcd0cfe031c341ccef1c689817,road driver,10,MORTGAGE,85000.0,SC,293xx,USA,B,B1,Not Verified,193390.0,Individual,,,2025-08-15T04:37:13.584556Z
6d5091b3fcaaeb4eac37445042b6b79e8b21f16943b9c32c4f5dd74dcb0f2210,leadman,10,MORTGAGE,55000.0,PA,190xx,USA,C,C4,Not Verified,178050.0,Individual,,,2025-08-15T04:37:13.584556Z
538b4653da3b1e8142b50baacf97a57b721eb6ec565f5fb63a1b71d191e9e934,Vice President of Recruiting Operations,10,MORTGAGE,180000.0,MN,550xx,USA,B,B2,Not Verified,388852.0,Individual,,,2025-08-15T04:37:13.584556Z
cab1fa9f533688b0aab7c6be7b6256d15ad1dd5f3c4ca061a49cfeb26901977d,Information Systems Officer,10,MORTGAGE,110000.0,NJ,076xx,USA,C,C5,Source Verified,381215.0,Individual,,,2025-08-15T04:37:13.584556Z
cb0f1777593e77909d0d359a6d9e633f28568500e17806f815395c92f0cd375d,Vendor liaison,10,MORTGAGE,42000.0,RI,029xx,USA,B,B5,Not Verified,256513.0,Individual,,,2025-08-15T04:37:13.584556Z
91060b858433e8a6107be9b76b324e304b5802631fc081b449188fe4b4e32205,truck driver,10,MORTGAGE,63000.0,IL,605xx,USA,B,B4,Not Verified,218418.0,Joint App,71000.0,Not Verified,2025-08-15T04:37:13.584556Z


In [0]:
display(customers_state_cleaned.select("address_state").distinct())

address_state
SC
AZ
LA
MN
NJ
DC
OR
VA
RI
WY


In [0]:
customers_state_cleaned.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/mnt/Lendingclub/Lendingclub/Lendingclub/cleaned/customers_parquet") \
.save()