In [None]:
# Build the Optimus entry point; later cells reach Spark, the CSV loader,
# the profiler and the ML helpers through `op`.
from optimus import Optimus
op = Optimus()

In [None]:
# Display the `spark` attribute of the Optimus instance.
# NOTE(review): presumably the underlying SparkSession — confirm.
op.spark

In [None]:
# Display the `sc` attribute of the Optimus instance.
# NOTE(review): presumably the SparkContext — confirm.
op.sc

## Exploratory Data Analysis

In [None]:
# Read the training data from a relative CSV path into `df`.
df = op.load.csv("data/train.csv")

In [None]:
# Render the freshly loaded frame as a table for a first look at the data.
df.table()

In [None]:
# Ask the profiler for missing-value information on `df`.
# NOTE(review): "*" is presumably the "all columns" selector — confirm.
op.profiler.missing_values(df, "*")

In [None]:
# Run the profiler over `df` with the "*" column selector and infer=True.
op.profiler.run(
    df,
    "*",
    infer=True,
)

In [None]:
# Box plot of the ApplicantIncome column.
df.plot.box("ApplicantIncome")

In [None]:
# Histogram of the LoanAmount column using 100 buckets.
df.plot.hist("LoanAmount", buckets=100)

## Data cleansing

In [None]:
from pyspark.sql.functions import isnull, when, count, col

# One output column per input column, each holding the number of nulls found
# in that column: `when` yields a non-null value only for null cells and
# `count` tallies those.
df.select(
    *(count(when(isnull(name), name)).alias(name) for name in df.columns)
).table()

In [None]:
# Show the rows in which Gender is null.
df.where(isnull("Gender")).table()

In [None]:
# Collect the Loan_ID of every row whose Gender is null into a Python list.
[
    record["Loan_ID"]
    for record in df.where(col("Gender").isNull()).select("Loan_ID").collect()
]

In [None]:
# Impute Gender, treating it as a categorical column. The result is stored in
# `df_imputed`; `df` itself is not reassigned here.
df_imputed = df.cols.impute("Gender", data_type="categorical")

In [None]:
# Re-display, from the imputed frame, the rows whose Gender is null in `df`,
# to inspect what the imputation filled in. The Loan_IDs are derived from
# `df` instead of being pasted in by hand, so the check keeps matching the
# data if the input file changes.
missing_gender_ids = [
    row.Loan_ID
    for row in df.where(col("Gender").isNull()).select("Loan_ID").collect()
]
df_imputed.where(col("Loan_ID").isin(missing_gender_ids)).table()

## Data preparation

In [None]:
# Impute LoanAmount as a continuous column. Start from `df_imputed` so the
# Gender imputation done earlier is carried into the prepared data; starting
# again from the raw `df` would leave `df_imputed` unused and discard it.
df = df_imputed.cols.impute("LoanAmount", data_type="continuous")

In [None]:
# Fill nulls in Self_Employed with "No", then apply fill_na with 0 using the
# "*" column selector.
# NOTE(review): "*" presumably targets every column, including categorical
# ones — confirm that 0 is the intended filler there.
df = (
    df.cols.fill_na("Self_Employed", "No")
      .cols.fill_na("*", 0)
)

In [None]:
# Recount nulls per column after the fills above; each output column holds
# the number of null cells in the input column of the same name.
df.select(
    *(count(when(isnull(name), name)).alias(name) for name in df.columns)
).table()

In [None]:
from optimus.ml import feature as fe

In [None]:
# Show the documentation of `fe.string_to_index` before using it. Plain
# `help()` replaces the IPython-only `?` suffix so the cell is valid Python
# and still works when the notebook is exported or run outside IPython.
help(fe.string_to_index)

In [None]:
# Apply string_to_index to the listed string columns of `df`; the result is
# kept in `df_idx` for the modeling section.
df_idx = fe.string_to_index(
    df,
    [
        'Gender',
        'Married',
        'Dependents',
        'Education',
        'Self_Employed',
        'Property_Area',
    ],
)

In [None]:
# Preview the indexed frame.
# NOTE(review): the argument 10 is presumably the row limit — confirm.
df_idx.table(10)

## Modeling

In [None]:
# Columns handed to the decision tree call. The label column (Loan_Status) is
# listed alongside the predictors.
# NOTE(review): presumably decision_tree separates the label from the
# features itself — confirm.
columns = [
    'Loan_Status',
    'Credit_History',
    'Gender_index',
    'Married_index',
    'Education_index',
]

In [None]:
# Call the Optimus decision tree helper on the indexed frame with the chosen
# columns and "Loan_Status" as the third argument (the target column —
# NOTE(review): confirm against the helper's signature). It returns a pair,
# unpacked here as `df_model` and `dt_model`.
df_model, dt_model = op.ml.decision_tree(df_idx, columns, "Loan_Status")

In [None]:
# Print the label, feature vector, probability and prediction columns of
# `df_model`.
df_model.select("label","features","probability","prediction").show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that compares the "prediction" column with "label".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [None]:
# Score the predictions in `df_model` with the accuracy evaluator.
# NOTE(review): `df_model` comes from the same call that was given `df_idx`
# for training; if no data is held out there, this is training accuracy —
# confirm.
accuracy = evaluator.evaluate(df_model)

In [None]:
# Report the accuracy computed by the evaluator.
print(f'Accuracy = {accuracy}')