In [1]:
# -*- coding: utf-8 -*-
"""
Make sure you give execute privileges
-----------------------------------------------------------------------------

           Spark with Python: Setup Spyder IDE for Spark

             Copyright : V2 Maestros @2016
                    
Execute this script once when Spyder is started on Windows
-----------------------------------------------------------------------------
"""

import os
import sys
# Work from the directory that holds the Titanic CSV files, so the relative
# file names used by later cells (e.g. "train.csv") can be found.
os.chdir("D:/SPARK/Practice_Problems/Kaggle/Titanic/Data")
# os.curdir is only the constant "."; os.getcwd() reports the directory that
# was actually selected.
print(os.getcwd())

# Configure the environment. Set this up to the directory where
# Spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:/Spark/spark-1.6.0-bin-hadoop2.6'

# Create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

# Add the following paths to the system path. Please check your installation
# to make sure that these zip files actually exist. The names might change
# as versions change.
# Each entry is inserted at the front, in this order, and only if it is not
# already present, so running this cell again does not pile up duplicates.
for path_parts in (
        ("python",),
        ("python", "lib"),
        ("python", "lib", "pyspark.zip"),
        ("python", "lib", "py4j-0.9-src.zip")):
    spark_path = os.path.join(SPARK_HOME, *path_parts)
    if spark_path not in sys.path:
        sys.path.insert(0, spark_path)

# Initiate Spark context. Once this is done all other applications can run.
# These imports must stay below the sys.path setup above.
from pyspark import SparkContext
from pyspark import SparkConf

# Optionally configure Spark Settings
conf = SparkConf()
conf.setMaster("local")
conf.set("spark.executor.memory", "12g")
conf.set("spark.cores.max", "4")

conf.setAppName("ma")

# Initialize SparkContext. getOrCreate() returns the already-running context
# when this cell is executed a second time instead of raising the
# "multiple contexts" error.
sc = SparkContext.getOrCreate(conf)


Load the data

In [77]:
# Load the CSV file into an RDD with one string element per line of the file
titanicData = sc.textFile("train.csv")
# Number of lines read; this still includes the header line
titanicData.count()


892

Import pyspark.sql and create the SQLContext

In [79]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import *
import collections
from pyspark.sql.types import *
# Wrap the running SparkContext in a SQLContext; later cells use it to build DataFrames
sqlContext = SQLContext(sc)

Extract the header row of the CSV

In [80]:
# The first line of the file holds the comma-separated column names
header=titanicData.first()
header

u'PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked'

Split the header to get schema

In [91]:
# One nullable StructField per column name in the header; every column starts
# out as a string and is given its real type in a later cell.
fields = []
for field_name in header.split(','):
    fields.append(StructField(field_name, StringType(), True))
fields

[StructField(PassengerId,StringType,true),
 StructField(Survived,StringType,true),
 StructField(Pclass,StringType,true),
 StructField(Sex,StringType,true),
 StructField(Age,StringType,true),
 StructField(SibSp,StringType,true),
 StructField(Parch,StringType,true),
 StructField(Fare,StringType,true),
 StructField(Embarked,StringType,true)]

Assign appropriate datatype

In [143]:
# Target Spark SQL type for each column, in header order:
# PassengerId, Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
column_types = [
    IntegerType(),
    IntegerType(),
    IntegerType(),
    StringType(),
    FloatType(),
    IntegerType(),
    IntegerType(),
    FloatType(),
    StringType(),
]
# Overwrite the placeholder StringType on each field, by position
for position, column_type in enumerate(column_types):
    fields[position].dataType = column_type
fields

[StructField(PassengerId,IntegerType,true),
 StructField(Survived,IntegerType,true),
 StructField(Pclass,IntegerType,true),
 StructField(Sex,StringType,true),
 StructField(Age,FloatType,true),
 StructField(SibSp,IntegerType,true),
 StructField(Parch,IntegerType,true),
 StructField(Fare,FloatType,true),
 StructField(Embarked,StringType,true)]

We can construct our schema, which we will use later below for building the data frame

In [93]:
# Combine the typed fields into the schema passed to createDataFrame later on
schema = StructType(fields)

Remove the header using filter

In [94]:
# Keep every line that is not equal to the header line
titanicFile=titanicData.filter(lambda x: x!=header)
# Peek at the first remaining line
titanicFile.take(1)

[u'1,0,3,male,22,1,0,7.25,S']

In [95]:
def _to_passenger_tuple(line):
    """Split one CSV line on commas and cast each cell to its column type.

    Returns a 9-tuple in header order: three ints, a string, a float,
    two ints, a float and a string.
    """
    cells = line.split(",")
    return (
        int(cells[0]),
        int(cells[1]),
        int(cells[2]),
        cells[3],
        float(cells[4]),
        int(cells[5]),
        int(cells[6]),
        float(cells[7]),
        cells[8],
    )

titanic_temp = titanicFile.map(_to_passenger_tuple)
# The two largest tuples (ordering is by the first element first)
titanic_temp.top(2)

[(891, 0, 3, u'male', 32.0, 0, 0, 7.75, u'Q'),
 (890, 1, 1, u'male', 26.0, 0, 0, 30.0, u'C')]

Assigning datatype through schema

In [96]:
# Build a DataFrame from the RDD of typed tuples, applying the explicit schema
titanicDF = sqlContext.createDataFrame(titanic_temp, schema)
# Peek at the first 10 rows as Row objects
titanicDF.head(10)

[Row(PassengerId=1, Survived=0, Pclass=3, Sex=u'male', Age=22.0, SibSp=1, Parch=0, Fare=7.25, Embarked=u'S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Sex=u'female', Age=38.0, SibSp=1, Parch=0, Fare=71.2833023071289, Embarked=u'C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Sex=u'female', Age=26.0, SibSp=0, Parch=0, Fare=7.925000190734863, Embarked=u'S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Sex=u'female', Age=35.0, SibSp=1, Parch=0, Fare=53.099998474121094, Embarked=u'S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Sex=u'male', Age=35.0, SibSp=0, Parch=0, Fare=8.050000190734863, Embarked=u'S'),
 Row(PassengerId=6, Survived=0, Pclass=3, Sex=u'male', Age=0.0, SibSp=0, Parch=0, Fare=8.45829963684082, Embarked=u'Q'),
 Row(PassengerId=7, Survived=0, Pclass=1, Sex=u'male', Age=54.0, SibSp=0, Parch=0, Fare=51.86249923706055, Embarked=u'S'),
 Row(PassengerId=8, Survived=0, Pclass=3, Sex=u'male', Age=2.0, SibSp=3, Parch=1, Fare=21.075000762939453, Embarked=u'S'),
 Row(PassengerId=9, Survi

Convert string columns into indexed (factor) variables

In [None]:
from pyspark.ml.feature import StringIndexer

# Add a numeric "SexInd" column derived from the string column "Sex"
indexer = StringIndexer(inputCol="Sex", outputCol="SexInd")
indexed = indexer.fit(titanicDF).transform(titanicDF)
# Same for "Embarked" -> "EmbInd", applied on top of the previous result
indexer1 = StringIndexer(inputCol="Embarked", outputCol="EmbInd")
indexed1 = indexer1.fit(indexed).transform(indexed)
indexed1.show()

In [111]:
from pyspark.mllib.regression import LabeledPoint

Drop character columns

In [122]:
# Columns that must not reach the model: the row id and the two string
# columns that now have numeric *Ind counterparts.
drop_list = ['PassengerId','Sex','Embarked']

# Select everything else, preserving the original column order
kept_columns = [name for name in indexed1.columns if name not in drop_list]
titanicDF1 = indexed1.select(kept_columns)

titanicDF1.show()

+--------+------+----+-----+-----+-------+------+------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|SexInd|EmbInd|
+--------+------+----+-----+-----+-------+------+------+
|       0|     3|22.0|    1|    0|   7.25|   0.0|   0.0|
|       1|     1|38.0|    1|    0|71.2833|   1.0|   1.0|
|       1|     3|26.0|    0|    0|  7.925|   1.0|   0.0|
|       1|     1|35.0|    1|    0|   53.1|   1.0|   0.0|
|       0|     3|35.0|    0|    0|   8.05|   0.0|   0.0|
|       0|     3| 0.0|    0|    0| 8.4583|   0.0|   2.0|
|       0|     1|54.0|    0|    0|51.8625|   0.0|   0.0|
|       0|     3| 2.0|    3|    1| 21.075|   0.0|   0.0|
|       1|     3|27.0|    0|    2|11.1333|   1.0|   0.0|
|       1|     2|14.0|    1|    0|30.0708|   1.0|   1.0|
|       1|     3| 4.0|    1|    1|   16.7|   1.0|   0.0|
|       1|     1|58.0|    0|    0|  26.55|   1.0|   0.0|
|       0|     3|20.0|    0|    0|   8.05|   0.0|   0.0|
|       0|     3|39.0|    1|    5| 31.275|   0.0|   0.0|
|       0|     3|14.0|    0|   

Create label-features

In [129]:
def transformToLabeledPoint(inStr) :
    """Turn one row into a ``(label, features)`` pair.

    inStr: an indexable row whose element 0 is the label and whose
        elements 1 to 7 are the numeric feature values.
    Returns a tuple ``(int label, dense vector of the seven features)``.
    Raises IndexError if the row has fewer than eight elements.
    """
    # Imported here because no visible cell imports Vectors; without it this
    # function raises NameError on a fresh kernel.
    from pyspark.mllib.linalg import Vectors

    features = [inStr[position] for position in range(1, 8)]
    return (int(inStr[0]), Vectors.dense(features))
# Map every Row to a (label, features) pair on the underlying RDD, then
# rebuild a two-column DataFrame from those pairs.
titanicDF2 = titanicDF1.rdd.map(transformToLabeledPoint)
titanicDF3 = sqlContext.createDataFrame(titanicDF2, ["label", "features"])
label_and_features = titanicDF3.select("label", "features")
label_and_features.show(10)


+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[3.0,22.0,1.0,0.0...|
|    1|[1.0,38.0,1.0,0.0...|
|    1|[3.0,26.0,0.0,0.0...|
|    1|[1.0,35.0,1.0,0.0...|
|    0|[3.0,35.0,0.0,0.0...|
|    0|[3.0,0.0,0.0,0.0,...|
|    0|[1.0,54.0,0.0,0.0...|
|    0|[3.0,2.0,3.0,1.0,...|
|    1|[3.0,27.0,0.0,2.0...|
|    1|[2.0,14.0,1.0,0.0...|
+-----+--------------------+
only showing top 10 rows



Split the data into training and test

In [130]:
#Split into training and testing data
(trainingData, testData) = titanicDF3.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()
testData.collect()


[Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 26.55, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 30.6958, 0.0, 1.0])),
 Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 35.0, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 42.4, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 0.0, 0.0, 0.0, 227.525, 0.0, 1.0])),
 Row(label=0, features=DenseVector([1.0, 18.0, 1.0, 0.0, 108.9, 0.0, 1.0])),
 Row(label=0, features=DenseVector([1.0, 28.0, 0.0, 0.0, 47.1, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 31.0, 0.0, 0.0, 50.4958, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 38.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 40.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 45.0, 0.0, 0.0, 35.5, 0.0, 0.0])),
 Row(label=0, features=DenseVector([1.0, 45.0, 1.0, 0.0, 83.475, 0.0, 0.0])),
 Row(lab

In [131]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


Convert the labels into indices

In [133]:
# Fit the label indexer on the training data only.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)

# Reuse the fitted model for the test data. StringIndexer assigns indices by
# label frequency, so a second indexer fitted on the test split could map the
# same label to a different index than the one the classifier was trained on.
sii_model = si_model  # name kept for any code that still refers to it
ttd = sii_model.transform(testData)


In [136]:
#Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="features")
rmModel = rmClassifer.fit(td)


In [141]:
#Predict on the test data
predictions = rmModel.transform(ttd)
predictions.select("prediction","indexed","label","features").collect()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="precision")
evaluator.evaluate(predictions) 

0.7904411764705882

In [142]:
#Draw a confusion matrix
labelList=predictions.select("indexed","label").distinct().toPandas()
predictions.groupBy("indexed","prediction").count().show()


+-------+----------+-----+
|indexed|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   61|
|    0.0|       0.0|  154|
|    0.0|       1.0|    5|
|    1.0|       0.0|   52|
+-------+----------+-----+



In [1]:
# Read the fuel CSV as an RDD of lines; the first line carries the column names
fuelData1 = sc.textFile("fuel_data_1.csv")
header1 = fuelData1.first()

# Start with one nullable string field per header column ...
fields1 = []
for field_name in header1.split(','):
    fields1.append(StructField(field_name, StringType(), True))

# ... then type column 0 as a date and columns 1 to 9 as floats
fields1[0].dataType = DateType()
for position in range(1, 10):
    fields1[position].dataType = FloatType()

schema1 = StructType(fields1)
# Drop the header line from the data
fuelFile1 = fuelData1.filter(lambda line: line != header1)
def safe_cast(val, to_type, default=None):
    """Convert ``val`` with ``to_type``, falling back to ``default``.

    val: the value to convert (for example one CSV cell).
    to_type: a callable such as ``float`` or ``int``.
    default: returned when the conversion fails.

    Both ValueError (unparseable text such as "" or "abc") and TypeError
    (an unsupported input type such as None) count as a failed conversion.
    """
    try:
        return to_type(val)
    except (ValueError, TypeError):
        return default
# lit() is not exported by `from pyspark.sql import *`, so import it explicitly.
from pyspark.sql.functions import lit

# NOTE(review): `parse` is not imported in any visible cell; it is presumably
# dateutil.parser.parse -- confirm and import it before running this cell.

def _parse_fuel_row(line):
    """Turn one CSV line into a 10-tuple: parsed first cell plus nine floats.

    Cells 1 to 9 go through safe_cast, so text that float() rejects
    becomes 0.0 instead of failing the job.
    """
    cells = line.split(",")
    numeric = [safe_cast(cells[position], float, 0.0) for position in range(1, 10)]
    return tuple([parse(cells[0])] + numeric)

fueltemp1 = fuelFile1.map(_parse_fuel_row)
fuelDF1 = sqlContext.createDataFrame(fueltemp1, schema1)
# Tag every row with a constant "File" column holding 1
fuelDF21 = fuelDF1.withColumn("File", lit(1))
fuelDF21.show(2)

NameError: name 'sc' is not defined