In [None]:
#  Download the required Dataset for analysis
# ! wget http://stat-computing.org/dataexpo/2009/2007.csv.bz2
# ! http://stat-computing.org/dataexpo/2009/2008.csv.bz2
# ! wget https://github.com/jayyanar/MachineLearning_Workbook/blob/master/2007-ord-weather-data.csv --no-check-certificate
# ! wget https://github.com/jayyanar/MachineLearning_Workbook/blob/master/2008-ord-weather-data.csv --no-check-certificate
# ! bzip2 -d 2007.csv.bz2
# ! bzip2 -d 2008.csv.bz2

In [1]:
! ls -lrt


total 4415016
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users    569231 Mar 24  2013 joda-time-2.0.jar
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users 702878193 Aug 22  2014 2007.csv
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users 113753229 Dec  9  2014 2008.csv.bz2
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users 689413344 Dec  9  2014 2008.csv
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users    169922 Jul 31 04:40 2007-ord-weather-data.csv
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users    190795 Jul 31 04:41 2008-ord-weather-data.csv
drwx------ 2 s7ed-a18f3badb92bc2-a9f6794a31ec users      4096 Aug  2 03:08 joda
-rw------- 1 s7ed-a18f3badb92bc2-a9f6794a31ec users      1553 Aug  2 04:00 preprocess1.pig


In [2]:
# For SQL-type queries (Spark)
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *

# For regression and other possible ML tools (Spark)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics


# Important for managing features  (Spark)
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler

# For displaying and other related IPython tools...
from IPython.display import display
from IPython.html.widgets import interact

# Typycal Python tools
import sys
import numpy as np
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import os.path



In [3]:
# ### Creating a SQL Dataframe from RDD
# 
# We now create a SQL DataFrame, this entity is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in Python, but with richer optimizations under the hood. We will utilize the recently created Spark RDD and use the Spark SQL context to create the desired data frame,

# We first create function that will allow to parse a record of our RDD into the desired format. As a reference we take a look at features_names and feature_example we just created above

def parse(r):
    try:
        x=Row(Month=int(r[1]),\
          DayofMonth=int(r[2]),\
          DayOfWeek=int(r[3]),\
          DepTime=int(float(r[4])), \
          CRSDepTime=int(r[5]),\
          DepDelay=int(float(r[15])),\
          Origin=r[16],\
          Dest=r[17], \
          Distance=int(float(r[18]))) 
    except:
        x=None  
    return x

# define hour function to obtain hour of day
def hour_ex(x): 
    h = int(str(int(x)).zfill(4)[:2])
    return h
# register as a UDF 
f = udf(hour_ex, IntegerType())


In [74]:
def prepFlightDelay(infile, orgn):
    textFile = sc.textFile(infile)
    textFileRDD = textFile.map(lambda x: x.split(','))
    header = textFileRDD.first()
    textRDD = textFileRDD.filter(lambda r: r != header)
    rowRDD = textRDD.map(lambda r: parse(r)).filter(lambda r:r != None)
    airline_df = sqlContext.createDataFrame(rowRDD)
    airline_df = airline_df.withColumn('hour', f(airline_df.CRSDepTime))
    airline_df.registerTempTable("airlineDF")
    airline_df_ORD = airline_df.filter((col("Origin") == orgn))
    airline_df_ORD_15 = airline_df_ORD.withColumn('DepDelayed', airline_df_ORD['DepDelay']>15)
    return airline_df_ORD_15

In [102]:
flight_2007_ORD = prepFlightDelay('2007.csv',"ORD")
flight_2007_ORD.take(5)

[Row(CRSDepTime=1100, DayOfWeek=4, DayofMonth=25, DepDelay=-8, DepTime=1052, Dest='EWR', Distance=719, Month=1, Origin='ORD', hour=11, DepDelayed=False),
 Row(CRSDepTime=1500, DayOfWeek=7, DayofMonth=28, DepDelay=41, DepTime=1541, Dest='IAH', Distance=925, Month=1, Origin='ORD', hour=15, DepDelayed=True),
 Row(CRSDepTime=2000, DayOfWeek=1, DayofMonth=29, DepDelay=45, DepTime=2045, Dest='CLE', Distance=316, Month=1, Origin='ORD', hour=20, DepDelayed=True),
 Row(CRSDepTime=1900, DayOfWeek=3, DayofMonth=17, DepDelay=-9, DepTime=1851, Dest='EWR', Distance=719, Month=1, Origin='ORD', hour=19, DepDelayed=False),
 Row(CRSDepTime=1745, DayOfWeek=5, DayofMonth=12, DepDelay=180, DepTime=2045, Dest='CLE', Distance=316, Month=1, Origin='ORD', hour=17, DepDelayed=True)]

In [112]:

print ("Actual Schema of the df")
flight_2007_ORD.printSchema()

for a_dftype in flight_2007_ORD.dtypes:
    #col_name = a_dftype[0]
    col_type = a_dftype[:1]
#     print df.select(col_name).collect()[0][0]
    
    if col_type == 'boolean' and (flight_2007_ORD.select("DepDelayed").distinct().collect()[0][0] =='False' or flight_2007_ORD.select("DepDelayed").distinct().collect()[0][0] =='True'):
        flight_2007_ORD = flight_2007_ORD.withColumn("DepDelayed",dflight_2007_ORD["DepDelayed"].cast("string).drop(flight_2007_ORD["DepDelayed"])

print ("Modified Schema of the df")
flight_2007_ORD.printSchema()

SyntaxError: invalid syntax (<ipython-input-112-179fd1989cd2>, line 11)

In [None]:
#df = sc.parallelize([(1, 'Y','F',"Giri",'Y'), (2, 'N','V',"Databricks",'N'),(3,'Y','B',"SparkEdge",'Y'),(4,'N','X',"Spark",'N')]).toDF(["id", "flag1","flag2","name","flag3"])
print ("Show Dataframe")
#df.show()
print ("Actual Schema of the df")
df.printSchema()

for a_dftype in df.dtypes:
    col_name = a_dftype[0]
    col_type = a_dftype[1]
#     print df.select(col_name).collect()[0][0]
    
    if col_type=='string' and (df.select(col_name).distinct().collect()[0][0] =='N' or df.select(col_name).distinct().collect()[0][0] =='Y'):
        df = df.withColumn(col_name,df[col_name].cast("boolean")).drop(df[col_name])
    else:
        df = df.withColumn(col_name,df[col_name]).drop(df[col_name])
print ("df with True/False Value after Data Type changes")
df.show()
print ("Modified Schema of the df")
df.printSchema()

In [103]:
#if flight_2007_ORD.select(col("DepDelayed")) == "False" 
#textFileRDD = textFile.map(lambda x: x.split(','))
#new_column_3 = coalesce((col("fruit1") == col("fruit2")).cast("int"), lit(3))
#flight_2007_ORD.coalesce((col("DepDelayed" == "True"), lit("Y"))).show()
#repl1 = flight_2007_ORD.withColumn("DepDelayed",lit("Y"))

#targetDf = df.withColumn("timestamp1", when(df["session"] == 0, 999).otherwise(df["timestamp1"]))
targetDf = flight_2007_ORD.withColumn("DepDelayed", when(flight_2007_ORD["DepDelayed"] == "True", "Y"))
targetDf.take(5)

[Row(CRSDepTime=1100, DayOfWeek=4, DayofMonth=25, DepDelay=-8, DepTime=1052, Dest='EWR', Distance=719, Month=1, Origin='ORD', hour=11, DepDelayed=None),
 Row(CRSDepTime=1500, DayOfWeek=7, DayofMonth=28, DepDelay=41, DepTime=1541, Dest='IAH', Distance=925, Month=1, Origin='ORD', hour=15, DepDelayed=None),
 Row(CRSDepTime=2000, DayOfWeek=1, DayofMonth=29, DepDelay=45, DepTime=2045, Dest='CLE', Distance=316, Month=1, Origin='ORD', hour=20, DepDelayed=None),
 Row(CRSDepTime=1900, DayOfWeek=3, DayofMonth=17, DepDelay=-9, DepTime=1851, Dest='EWR', Distance=719, Month=1, Origin='ORD', hour=19, DepDelayed=None),
 Row(CRSDepTime=1745, DayOfWeek=5, DayofMonth=12, DepDelay=180, DepTime=2045, Dest='CLE', Distance=316, Month=1, Origin='ORD', hour=17, DepDelayed=None)]

In [81]:
flight_2007_ORD.select("flight_2007_ORD.DepDelayed".as ("F"))

SyntaxError: invalid syntax (<ipython-input-81-08fc292432bb>, line 1)

## Create an Apache® Spark machine learning model

## Prepare data

In [39]:
split_data = flight_2007_ORD.randomSplit([0.8, 0.20], 24)
train_data = split_data[0]
test_data = split_data[1]


print ("Number of training records: " + str(train_data.count()))
print ("Number of testing records : " + str(test_data.count()))

Number of training records: 287074
Number of testing records : 72095


In [40]:
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler, Binarizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [59]:
#In the following step, convert all the string fields to numeric ones by using the StringIndexer transformer.
#stringIndexer_label = Binarizer(inputCol="DepDelayed", outputCol="Delayed")
stringIndexer_dest = StringIndexer(inputCol="Dest", outputCol="Destination").fit(flight_2007_ORD)
stringIndexer_org = StringIndexer(inputCol="Origin", outputCol="Orgin_Airport")

In [60]:
#In the following step, create a feature vector by combining all features together.
#[Row(CRSDepTime=1100, DayOfWeek=4, DayofMonth=25, DepDelay=-8, DepTime=1052, Dest='EWR', Distance=719, Month=1, Origin='ORD', hour=11, DepDelayed=False),
vectorAssembler_features = VectorAssembler(inputCols=["CRSDepTime","DayOfWeek","DayofMonth","DepDelay","DepTime","Destination","Distance","Month","Orgin_Airport"], outputCol="features")

In [61]:
rf = RandomForestClassifier(labelCol="DepDelayed", featuresCol="features")

In [62]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=stringIndexer_dest.labels)
print (flight_2007_ORD)

DataFrame[CRSDepTime: bigint, DayOfWeek: bigint, DayofMonth: bigint, DepDelay: bigint, DepTime: bigint, Dest: string, Distance: bigint, Month: bigint, Origin: string, hour: int, DepDelayed: boolean]


In [65]:
transform_df_pipeline = Pipeline(stages=[stringIndexer_dest, stringIndexer_org, vectorAssembler_features])
transformed_df = transform_df_pipeline.fit(flight_2007_ORD).transform(flight_2007_ORD)
transformed_df.show()

+----------+---------+----------+--------+-------+----+--------+-----+------+----+----------+-----------+-------------+--------------------+
|CRSDepTime|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Month|Origin|hour|DepDelayed|Destination|Orgin_Airport|            features|
+----------+---------+----------+--------+-------+----+--------+-----+------+----+----------+-----------+-------------+--------------------+
|      1100|        4|        25|      -8|   1052| EWR|     719|    1|   ORD|  11|     false|        6.0|          0.0|[1100.0,4.0,25.0,...|
|      1500|        7|        28|      41|   1541| IAH|     925|    1|   ORD|  15|      true|       16.0|          0.0|[1500.0,7.0,28.0,...|
|      2000|        1|        29|      45|   2045| CLE|     316|    1|   ORD|  20|      true|       14.0|          0.0|[2000.0,1.0,29.0,...|
|      1900|        3|        17|      -9|   1851| EWR|     719|    1|   ORD|  19|     false|        6.0|          0.0|[1900.0,3.0,17.0,...|
|      1745| 

In [66]:
#pipeline_rf = Pipeline(stages=[stringIndexer_label, stringIndexer_sex, stringIndexer_famhist, stringIndexer_smoker, vectorAssembler_features, rf, labelConverter])

pipeline_rf = Pipeline(stages=[stringIndexer_dest, stringIndexer_org, vectorAssembler_features, rf, labelConverter])


In [67]:
model_rf = pipeline_rf.fit(train_data)

IllegalArgumentException: 'requirement failed: Column DepDelayed must be of type NumericType but was actually of type BooleanType.'

In [68]:
predictions = model_rf.transform(test_data)
evaluatorRF = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluatorRF.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

NameError: name 'model_rf' is not defined

In [None]:
#
# Python UDFs for our PIG script
#
from datetime import date

# this array defines the dates of holiday in 2007 and 2008
holidays = [
        date(2007, 1, 1), date(2007, 1, 15), date(2007, 2, 19), date(2007, 5, 28), date(2007, 6, 7), date(2007, 7, 4), \
        date(2007, 9, 3), date(2007, 10, 8), date(2007, 11, 11), date(2007, 11, 22), date(2007, 12, 25), \
        date(2008, 1, 1), date(2008, 1, 21), date(2008, 2, 18), date(2008, 5, 22), date(2008, 5, 26), date(2008, 7, 4), \
        date(2008, 9, 1), date(2008, 10, 13), date(2008, 11, 11), date(2008, 11, 27), date(2008, 12, 25) \
     ]
# get number of days from nearest holiday
def days_from_nearest_holiday(year, month, day):
    d = date(year, month, day)
    x = [(abs(d-h)).days for h in holidays]
    return min(x)
def to_date(year, month, day):
    td = date(year, month, day)
    return td

In [69]:
! ls -lrta

total 4415016
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users    569231 Mar 24  2013 joda-time-2.0.jar
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users 702878193 Aug 22  2014 2007.csv
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users 113753229 Dec  9  2014 2008.csv.bz2
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users 689413344 Dec  9  2014 2008.csv
drwx------ 11 s7ed-a18f3badb92bc2-a9f6794a31ec users      4096 Jul 20 06:55 ..
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users    169922 Jul 31 04:40 2007-ord-weather-data.csv
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users    190795 Jul 31 04:41 2008-ord-weather-data.csv
drwx------  2 s7ed-a18f3badb92bc2-a9f6794a31ec users      4096 Aug  2 03:08 joda
drwx------  3 s7ed-a18f3badb92bc2-a9f6794a31ec users      4096 Aug  2 04:00 .
-rw-------  1 s7ed-a18f3badb92bc2-a9f6794a31ec users      1553 Aug  2 04:00 preprocess1.pig
