# Árboles de Regresión

In [1]:
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Load bd5.csv through the spark-csv reader: the first row is treated as the
# header and column types are inferred from the data.
bd5 = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("file:/home/cloudera/Documents/Ficheros de trabajo/bd5.csv", inferSchema=True)
)

# Register the DataFrame under the table name "bd5" in the SQL context.
sqlContext.registerDataFrameAsTable(bd5, "bd5")

In [3]:
# Inspect the (column name, type) pairs of the loaded DataFrame.
bd5.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [4]:
from pyspark.ml.feature import StringIndexer

# Map each carrier code to a numeric index stored in 'IndexUniqueCarrier'.
# The index starts at 0.
indexer = StringIndexer(inputCol='UniqueCarrier', outputCol='IndexUniqueCarrier')
carrier_indexer_model = indexer.fit(bd5)
bd6 = carrier_indexer_model.transform(bd5)

# Show the number of rows for each carrier / index pair, ordered by index.
(
    bd6.groupBy('UniqueCarrier', 'IndexUniqueCarrier')
    .count()
    .sort('IndexUniqueCarrier')
    .show()
)


+-------------+------------------+-----+
|UniqueCarrier|IndexUniqueCarrier|count|
+-------------+------------------+-----+
|           AA|               0.0| 8853|
|           UA|               1.0| 6112|
|           WN|               2.0| 5395|
|           DL|               3.0| 4239|
|           VX|               4.0| 1703|
|           NK|               5.0| 1581|
|           F9|               6.0| 1295|
|           OO|               7.0| 1166|
|           B6|               8.0|  121|
|           EV|               9.0|    1|
+-------------+------------------+-----+



## Ajuste del modelo

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Pack the predictor columns into a single 'features' vector column.
a1 = VectorAssembler(
    inputCols=[
        'DepDelay',
        'Distance',
        'DayOfWeek',
        'CRSDepTime',
        'IndexUniqueCarrier',
    ],
    outputCol='features',
)

# Keep only the target (ArrDelay, renamed to 'label') and the feature vector.
assembled = a1.transform(bd6)
bd7 = assembled.select(col("ArrDelay").alias("label"), 'features')

### Partición Test - Train

In [6]:
# Random 70/30 split of bd7 with a fixed seed, then report the size of each part.
splits = bd7.randomSplit([0.7, 0.3], seed=123)
bd_train, bd_test = splits
print(bd_train.count())
print(bd_test.count())

21353
9113


In [7]:
from pyspark.ml.regression import DecisionTreeRegressor as DTR

# Regression tree limited to a maximum depth of 5.
rt = DTR(maxDepth=5)

# Fit on the training split only.
model = rt.fit(bd_train)
# NOTE(review): predictions are computed on bd7, the full data set that
# bd_train and bd_test were split from, so they include the training rows.
pred = model.transform(bd7)

In [8]:
# Preview the first rows: label, feature vector and the tree's prediction.
pred.show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|-16.0|[0.0,1747.0,5.0,8...|-5.100687349061862|
| -9.0|[0.0,1747.0,5.0,1...|-5.100687349061862|
|-18.0|[-2.0,1747.0,5.0,...|-5.100687349061862|
|119.0|[130.0,628.0,5.0,...|             174.1|
|-18.0|[-8.0,628.0,5.0,2...|-13.17810650887574|
| -5.0|[2.0,628.0,5.0,73...|-5.100687349061862|
|  0.0|[11.0,628.0,5.0,1...| 3.532066508313539|
| -7.0|[-2.0,1199.0,6.0,...|-5.100687349061862|
|-21.0|[-3.0,1199.0,6.0,...|-9.127923246052369|
|-13.0|[-3.0,1747.0,6.0,...|-9.127923246052369|
|-15.0|[2.0,1747.0,6.0,9...|-5.100687349061862|
| -9.0|[-4.0,1946.0,6.0,...|-9.127923246052369|
|-14.0|[1.0,1946.0,6.0,1...|-5.100687349061862|
|-14.0|[6.0,1587.0,6.0,1...| 3.532066508313539|
| -7.0|[-4.0,1587.0,6.0,...|-9.127923246052369|
|  6.0|[18.0,1199.0,6.0,...|             13.47|
|-15.0|[0.0,1199.0,6.0,1...|-5.100687349061862|
| -8.0|[0.0,628.0,6.0,17...|-5.100687349

In [9]:
# Count how many rows received each distinct predicted value (show up to 50).
# A tree of depth 5 has at most 2**5 = 32 leaves.
prediction_counts = pred.groupBy('prediction').count()
prediction_counts.show(50)


+-------------------+-----+
|         prediction|count|
+-------------------+-----+
| 233.57407407407408|  233|
| 48.296740994854204|  800|
|  90.70588235294117|   50|
|0.17442845046570704| 1714|
|  65.28048780487805|  358|
| -5.100687349061862| 7669|
|               80.0|   15|
|  3.532066508313539| 2414|
|  77.50993377483444|  223|
| -9.127923246052369| 7137|
| 196.77884615384616|  280|
| 56.574074074074076|  241|
| 27.728183118741057|  991|
|  66.71052631578948|   52|
|  37.80821917808219|  815|
|              631.2|    6|
|  95.70833333333333|   65|
| 257.81481481481484|   49|
| 103.47058823529412|   25|
| 357.45454545454544|   14|
|  9.079545454545455|  606|
|              13.47| 2161|
| 102.55430711610487|  350|
|              174.1|  174|
| 104.63636363636364|   31|
| 133.38709677419354|   57|
|             259.75|   28|
| -13.17810650887574| 2417|
| 193.52727272727273|  161|
|   72.7872340425532|  356|
| 109.72200772200772|  360|
| 21.206572769953052|  614|
+-------------------

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# R^2 of the predictions held in `pred`; no label/prediction column names are
# passed, so the evaluator uses its defaults.
r2_evaluator = RegressionEvaluator(metricName="r2")
print(r2_evaluator.evaluate(pred))

0.7230265506765332


## Tuneado de parámetros

In [11]:
# DecisionTreeRegressor(featuresCol="features", 
#    labelCol="label", 
#    predictionCol="prediction", 
#    maxDepth=5, 
#    maxBins=32, 
#    minInstancesPerNode=1, 
#    minInfoGain=0.0, 
#    maxMemoryInMB=256, 
#    impurity="variance")

In [12]:
# Refit with a deeper tree: maxDepth=20, minInstancesPerNode=10, maxBins=50.
rt = DTR(maxDepth=20, minInstancesPerNode=10, maxBins=50)
model = rt.fit(bd_train)

# NOTE(review): scoring is done on bd7 (the full data set, training rows
# included), so this R^2 is not an out-of-sample estimate.
pred = model.transform(bd7)
r2_evaluator = RegressionEvaluator(metricName="r2")
print(r2_evaluator.evaluate(pred))

0.7776534139022493


### Validación externa

In [13]:
# Score the held-out test split with the current model and report its R^2.
pred2 = model.transform(bd_test)
test_r2 = RegressionEvaluator(metricName="r2").evaluate(pred2)
print(test_r2)

0.7155373704083114
