 # Exploratory Data Analysis (EDA)

## Imports

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, round
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Functions

In [8]:

def feature_vs_target(df, feature):
    """Show a stacked bar chart of one feature's values split by ``MIS_Status``.

    Rows of ``df`` are grouped by ``(feature, MIS_Status)``. Each group's size
    is expressed as a percentage of *all* rows in ``df`` (rounded to two
    decimals), printed with ``DataFrame.show()``, and then drawn with Plotly.
    Only groups whose target value equals 0 or 1 are plotted.

    Args:
        df: Spark DataFrame containing ``feature`` and ``MIS_Status`` columns.
        feature: Name of the column to compare against the target.

    Returns:
        None. The table is printed and the figure is displayed.
    """
    target = "MIS_Status"

    # Share of the whole dataset that falls into each (feature, target) pair.
    grouped = df.groupBy(feature, target)
    share_of_total = (count("*") / df.count()).alias("Percentage")
    percentage_df = grouped.agg(share_of_total).withColumn(
        "Percentage", round(col("Percentage") * 100, 2)
    )

    percentage_df.show()

    # Bring the (small) aggregated result to the driver for plotting.
    collected = percentage_df.collect()

    # One bar trace per target class: 0 in red, 1 in blue.
    bars = []
    for label, colour in ((0, 'red'), (1, 'blue')):
        matching = [entry for entry in collected if entry[target] == label]
        bars.append(
            go.Bar(
                x=[entry[feature] for entry in matching],
                y=[entry['Percentage'] for entry in matching],
                name=str(label),
                marker_color=colour,
            )
        )

    chart_title = 'Percentage Distribution of ' + feature + ' vs ' + target
    fig = go.Figure(data=bars, layout=go.Layout(barmode='stack', title=chart_title))
    fig.show()

def features_vs_target(df, features):
    """Show a grid of bar charts, one per feature, split by ``MIS_Status``.

    For every feature, rows of ``df`` are grouped by ``(feature, MIS_Status)``
    and each group's size is expressed as a percentage of all rows in ``df``
    (rounded to two decimals). Each feature gets its own subplot in a
    four-column grid, filled left to right, top to bottom. Only groups whose
    target value equals 0 or 1 are plotted.

    Args:
        df: Spark DataFrame containing the feature columns and ``MIS_Status``.
        features: Iterable of column names to plot.

    Returns:
        None. The figure is displayed.
    """
    target = "MIS_Status"
    num_cols = 4

    # Materialise once so any iterable is accepted and len() is available.
    features = list(features)

    # True ceiling division: exactly as many rows as the features need, so a
    # feature count that is a multiple of num_cols no longer adds an empty row.
    # At least one row is kept, as before, when the feature list is empty.
    num_rows = max(1, (len(features) + num_cols - 1) // num_cols)

    # Create a subplot grid with four columns
    fig = make_subplots(
        rows=num_rows,
        cols=num_cols,
        subplot_titles=[f"{feat} vs {target}" for feat in features],
    )

    # df.count() is a Spark action; run it once instead of once per feature.
    total_rows = df.count()

    for position, feature in enumerate(features):
        # Zero-based grid coordinates of this feature's subplot.
        row_offset, col_offset = divmod(position, num_cols)

        # Share of the whole dataset in each (feature, target) pair, as a
        # percentage rounded to two decimal places.
        percentage_df = df.groupBy(feature, target).agg(
            (count("*") / total_rows).alias("Percentage")
        )
        percentage_df = percentage_df.withColumn(
            "Percentage", round(col("Percentage") * 100, 2)
        )

        # Bring the aggregated result to the driver for plotting.
        data = percentage_df.collect()

        # Separate rows by target class
        rows_0 = [row for row in data if row[target] == 0]
        rows_1 = [row for row in data if row[target] == 1]

        # Create traces for the current feature
        trace0 = go.Bar(
            x=[row[feature] for row in rows_0],
            y=[row['Percentage'] for row in rows_0],
            name='0',
            marker_color='red',
        )
        trace1 = go.Bar(
            x=[row[feature] for row in rows_1],
            y=[row['Percentage'] for row in rows_1],
            name='1',
            marker_color='blue',
        )

        # Plotly subplot coordinates are 1-based.
        fig.add_trace(trace0, row=row_offset + 1, col=col_offset + 1)
        fig.add_trace(trace1, row=row_offset + 1, col=col_offset + 1)

    # Update layout
    fig.update_layout(
        height=600 * num_rows,
        title_text=f"Percentage Distribution of Features vs {target}",
        showlegend=False,
    )

    # Plot
    fig.show()

In [9]:

# Build (or reuse) the Spark session for this notebook. No master is set on
# the builder here; to force a local run, chain .master("local[*]") before
# .appName(...).
spark = (
    SparkSession.builder
    .appName("LoanApproval")
    .getOrCreate()
)


In [10]:

# Handle to the SparkContext behind the session.
sc = spark.sparkContext


 ## Read Data - preprocessed.csv (preprocessed SBAnational data)

In [11]:

# Location of the CSV loaded below, relative to the notebook's directory.
data_path = "../data/preprocessed.csv"

In [12]:

# Load the CSV with a header row and inferred column types. Fields may be
# wrapped in double quotes (with '"' also acting as the escape character) and
# may span multiple lines.
loan_df = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
    multiLine=True,
)


In [13]:

# Plot every column of the loaded frame against the target in a single grid.
features = loan_df.columns
features_vs_target(df=loan_df, features=features)
