In [1]:
import wrangle
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
df = wrangle.wrangle_311(spark)

[wrangle.py] reading case.csv
[wrangle.py] handling data types
[wrangle.py] parsing dates
[wrangle.py] adding features
[wrangle.py] joining departments


In [3]:
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [4]:
train.show(1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
 zipcode              | 78207                
 case_age             | 219                  
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Animal Care Services 
 dept_subject_to_SLA  | true                 
only showing top 1 row



In [5]:
# train.groupBy('case_age').count().show()

In [6]:
rf = pyspark.ml.feature.RFormula(
        formula='case_late ~ department + council_district')\
        .fit(train)

In [7]:
train_input = rf.transform(train)
# train_input.show(1, vertical=True, truncate=False)

# Classification Models

In [10]:
clas_model = pyspark.ml.classification.\
    LogisticRegression()
clas_model_fit = clas_model.fit(train_input)

clas_model_fit.summary.areaUnderROC

# evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
# test_auc = evaluator.evaluate(
#                     clas_model_fit.transform(rf.transform(test)))
# test_auc

0.6372537251671418

In [12]:
clas_model = pyspark.ml.classification\
    .DecisionTreeClassifier(maxDepth=5)
clas_model_fit = clas_model.fit(train_input)

evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(
                    clas_model_fit.transform(rf.transform(test)))
test_auc

0.49075879297284075

In [13]:
clas_model = pyspark.ml.classification\
    .RandomForestClassifier(numTrees=6)
clas_model_fit = clas_model.fit(train_input)

evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(
                    clas_model_fit.transform(rf.transform(test)))
test_auc

0.6145379864040053

In [14]:
clas_model = pyspark.ml.classification.NaiveBayes()
clas_model_fit = clas_model.fit(train_input)

evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(
                    clas_model_fit.transform(rf.transform(test)))
test_auc

0.5038396786125383

# Regression Models

In [15]:
lr = pyspark.ml.regression.LinearRegression()
lr_fit = lr.fit(train_input)
test_input = rf.transform(test)

evaluator = pyspark.ml.evaluation.RegressionEvaluator()
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

0.31038396099106197

In [16]:
lr = pyspark.ml.regression.DecisionTreeRegressor()
lr_fit = lr.fit(train_input)
test_input = rf.transform(test)

evaluator = pyspark.ml.evaluation.RegressionEvaluator()
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

0.3103850125711548

In [19]:
lr = pyspark.ml.regression.RandomForestRegressor(numTrees=12)
lr_fit = lr.fit(train_input)
test_input = rf.transform(test)

evaluator = pyspark.ml.evaluation.RegressionEvaluator()
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

0.31042715276654625