In [9]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Our task is to develop a regression model that will predict the number of  crew members required for future ships from the given features. 

In [1]:
import findspark 
import numpy 
import pandas
findspark.init()
import pyspark
from  pyspark.sql import SparkSession 
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Read the data Crew.csv into spark dataframe
- inferSchema=True and header=True.
- Print the schema and show the first few rows.
- Use df.describe() to see the statistical properties of the data.

In [2]:
data = spark.read.format("csv").\
option("inferschema","true").\
option("header" ,"true").\
load("Crew.csv")

In [3]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
data.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     null|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [40]:
data.sort("ship_name").show(10)

+----------+----------------+---+-------+----------+------+------+-----------------+-----+
| Ship_name|     Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density| crew|
+----------+----------------+---+-------+----------+------+------+-----------------+-----+
| Adventure| Royal_Caribbean| 12|  138.0|     31.14|  10.2| 15.57|            44.32|11.85|
|   Allegra|           Costa| 21|  28.43|      8.08|  6.16|   4.1|            35.19|  4.0|
| Amsterdam|Holland_American| 13|   61.0|      13.8|   7.8|  6.88|             44.2|  6.0|
|   Arcadia|             P&O|  9|   85.0|     19.68|  9.35|  9.84|            43.19| 8.69|
|     Aries|            Star| 22|  3.341|      0.66|   2.8|  0.33|            50.62| 0.59|
|   Armonia|             MSC| 12|   58.6|     15.66|  8.24|  7.83|            37.42|  7.0|
|   Artemis|             P&O| 29|   45.0|     11.78|  7.54|   5.3|             38.2|  5.2|
| Atlantica|           Costa| 13| 85.619|     21.14|  9.57| 10.56|             40.5|  9.2|

### StringIndexer and OneHotEncoder 
- Create StringIndexer and OneHotEncoder to process the data.
- StringIndexer is for any string data type.
- OneHotEncoder will be applied to the StringIndexer columns.
- Convert all obtained columns from OneHotEncoder and the other numeric columns into a feature column (use VectorAssembler) 

In [11]:
data_types = data.dtypes
data_types

[('Ship_name', 'string'),
 ('Cruise_line', 'string'),
 ('Age', 'int'),
 ('Tonnage', 'double'),
 ('passengers', 'double'),
 ('length', 'double'),
 ('cabins', 'double'),
 ('passenger_density', 'double'),
 ('crew', 'double')]

In [17]:
cat_col = [v for (v,t) in data_types if t == "string" ]
cat_col 

['Ship_name', 'Cruise_line']

In [18]:
cat_col_indx = [s+'_index' for s in cat_col]
cat_col_indx

['Ship_name_index', 'Cruise_line_index']

In [19]:
cat_col_OHE = [s+'_OHE' for s in cat_col]
cat_col_OHE

['Ship_name_OHE', 'Cruise_line_OHE']

In [20]:
num_col = [v for (v,t) in data_types if (t == "double") & (v != "crew") ]
num_col

['Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

In [23]:
AllDataCol = num_col + cat_col_OHE
AllDataCol

['Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Ship_name_OHE',
 'Cruise_line_OHE']

### Divide the data into Train/Test

In [24]:
trainDF,testDf = data.randomSplit([0.8,0.2],seed=42)

### Create a Linear Regression Model 

In [25]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [28]:
strIndx = StringIndexer(inputCols=cat_col,outputCols=cat_col_indx,handleInvalid='skip')

ohe = OneHotEncoder(inputCols=cat_col_indx,outputCols=cat_col_OHE)

vecAss = VectorAssembler(inputCols=AllDataCol,outputCol='features')

lr = LinearRegression(featuresCol='features',labelCol='crew',predictionCol='prediction')

### Create a Pipeline model

In [29]:
pl = Pipeline(stages=[strIndx,ohe,vecAss,lr])


### Fit the Pipeline model to the trainig data

In [34]:
plModel = pl.fit(trainDF)

### Make a prediction for the same training data and evaluate the model performance using RMSE and r2

In [43]:
pred = plModel.transform(trainDF)

In [44]:
pred.select("crew","prediction").show()

+-----+------------------+
| crew|        prediction|
+-----+------------------+
|11.85|11.841880037884803|
|  4.0| 4.001874983474391|
| 8.69| 8.697033614952687|
| 0.59|0.5874791415048413|
|  7.0| 7.006495363506672|
|  9.2| 9.199675984909963|
| 8.48| 8.473269309582985|
| 11.0|11.003333726785044|
|  6.7| 6.692131812096375|
| 8.58| 8.580507205296705|
|  2.1|2.0876456996437716|
| 19.1| 19.08980718717625|
| 9.99|  9.98992209136459|
|  9.0| 9.004201646754733|
|  4.7| 4.704188930155767|
| 11.0|10.847785565959772|
|  9.0| 9.158699357376708|
| 10.0|  9.99044597917515|
| 6.14| 6.143143696657559|
|  9.2| 9.191256741525603|
+-----+------------------+
only showing top 20 rows



In [46]:
from pyspark.ml.evaluation import RegressionEvaluator


In [51]:
regeval = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='rmse')
regeval.evaluate(pred)

0.08414002810710287

In [52]:

regeval = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='r2')
regeval.evaluate(pred)

0.9994359077926166

### Make a prediction for the test data and evaluate the model performance using RMSE and r2

In [31]:
pred = plModel.transform(testDf)


RMSE is 0.4725
r2 is 0.9870


In [53]:
remse = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='rmse')
r2 = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='r2')


In [54]:
print("rmse== ",remse.evaluate(pread))
print("r2== ",r2.evaluate(pred))

rmse==  0.08414002810710287
r2==  0.9994359077926166
