Using PySpark to read the dataset and check for missing values.

In [1]:
# import packages
from pyspark.sql import SparkSession
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.sql import functions as fn
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
    MulticlassClassificationEvaluator, \
    RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession


# Obtain the SparkSession used by the rest of the notebook
# (getOrCreate() reuses an already-active session instead of starting a second one).
spark = SparkSession.builder.getOrCreate()
# Keep a handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Expand the pandas display settings so that wide DataFrames are shown in full:
# every column, on one unwrapped line, with untruncated cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no width limit" (pandas >= 1.0). The legacy sentinel -1 was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
# helper that prints the shape (row count and column count) of a DataFrame
def size_shape(df):
    """
    Print the number of rows and the number of columns of a DataFrame.
    :param df: An object exposing ``count()`` and ``columns`` (e.g. a PySpark DataFrame)
    """
    n_rows = df.count()
    print("rows:", n_rows)
    n_cols = len(df.columns)
    print("columns:", n_cols)

In [2]:
# read the dataset
# The CSV location is machine-specific, so it can be overridden through the
# LOAN_CSV_PATH environment variable; the original local path remains the default.
import os

LOAN_CSV_PATH = os.environ.get(
    'LOAN_CSV_PATH',
    'C:\\Users\\tigerman381\\Downloads\\lending-club-loan-data\\loan.csv',
)
# header=True: column names come from the first line of the file.
# inferSchema=True: Spark infers each column's type from the data.
df = spark.read.csv(LOAN_CSV_PATH, header=True, inferSchema=True)

In [4]:
# pull the first 10 rows into a pandas DataFrame, then preview them
# (head() is called without an argument, so only the first 5 of those rows are displayed)
df_pd = df.limit(10).toPandas()
display(df_pd.head())

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [5]:
# count the records per loan status and print the resulting table
df_bystatus = df.groupBy("loan_status").count()
df_bystatus.show()

+--------------------+-------+
|         loan_status|  count|
+--------------------+-------+
|          Fully Paid|1041952|
|             Default|     31|
|     In Grace Period|   8952|
|Does not meet the...|   1988|
|         Charged Off| 261654|
|            Oct-2015|      1|
|  Late (31-120 days)|  21897|
|             Current| 919695|
|Does not meet the...|    761|
|   Late (16-30 days)|   3737|
+--------------------+-------+



In [38]:
# As per our objective, keep only the rows whose loan_status is one of the
# statuses of interest (this filters rows, not columns).
late_statuses = ['Late (31-120 days)', 'Late (16-30 days)']
current_df = df.filter(col('loan_status').isin(late_statuses + ['Charged Off', 'Current']))
# Merge the two "Late (...)" statuses into a single 'Late' label;
# every other status is passed through unchanged.
current_df = current_df.withColumn(
    "loan_status",
    when(col("loan_status").isin(late_statuses), 'Late').otherwise(col("loan_status")),
)
# Our dataset now holds only the desired status values (Late, Charged Off, Current).
# show() prints the table itself and returns None, so it is not wrapped in print()
# (wrapping it would emit an extra "None" line after the table).
current_df.groupby("loan_status").count().show()
size_shape(current_df)

+-----------+------+
|loan_status| count|
+-----------+------+
|Charged Off|261654|
|       Late| 25634|
|    Current|919695|
+-----------+------+



In [79]:
# Number of missing values (NaN or null) in every column of the dataset, as a one-row pandas DataFrame.
# We are planning to remove the columns in which more than 50% of the values are missing.
checkna_pd = current_df.select(
    *(
        count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in current_df.columns
    )
).toPandas()
checkna_pd

# current_df.select([c for c in checkna_pd.columns])

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,1206983,1206983,0,0,0,0,0,0,0,0,...,1198272,1198271,34,30,1173940,1173941,1173945,1173947,1173950,1173948


In [80]:
#https://stackoverflow.com/questions/51322445/how-to-drop-all-columns-with-null-values-in-a-pyspark-dataframe - include in our references

# One-row DataFrame holding, for each column, the number of NaN/null entries it contains
checkna = current_df.select(
    *[
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in current_df.columns
    ]
)
# Total row count, used as the base for the 50% threshold
No_of_rows = current_df.count()
# Despite its name, this frame marks the columns to KEEP: a cell holds the column's
# own name when fewer than 50% of its values are missing, and null otherwise.
checkna_greater_than_50 = checkna.select(
    *[
        when(fn.col(name) < (No_of_rows * 0.50), name).alias(name)
        for name in checkna.columns
    ]
)


def drop_null_columns(df):
    """
    Return ``df`` without the columns that hold at least one null or NaN value.

    The per-column counts are computed in a single aggregation and collected to
    the driver before the offending columns are dropped.
    :param df: A PySpark DataFrame
    :return: A PySpark DataFrame containing only the columns with no null/NaN values
    """
    missing_count_exprs = [
        fn.count(fn.when(fn.col(name).isNull() | isnan(fn.col(name)), name)).alias(name)
        for name in df.columns
    ]
    # The aggregation yields exactly one row: column name -> number of null/NaN entries
    counts_by_column = df.select(missing_count_exprs).collect()[0].asDict()
    columns_with_missing = [
        name for name, n_missing in counts_by_column.items() if n_missing > 0
    ]
    return df.drop(*columns_with_missing)

# Dropping the null-marked columns leaves exactly the columns with fewer than 50% missing values
final_cols = drop_null_columns(checkna_greater_than_50)
# Select our dataframe restricted to those final columns
current_df1 = current_df.select(*final_cols.columns)
# Compare the shape before and after removing the sparse columns
size_shape(current_df)
size_shape(current_df1)


rows: 1206983
columns: 145
rows: 1206983
columns: 102


In [62]:
# Final result: use current_df1 (the dataset restricted to the columns kept above) for the downstream analysis.
# NOTE(review): the recorded output of this cell (1206983) looks like a row count rather than
# the repr of a DataFrame - the cell was presumably edited after its last run; re-run to confirm.
current_df1

1206983