In [9]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Our task is to develop a regression model that will predict the number of  crew members required for future ships from the given features. 

In [1]:
import findspark 
import numpy 
import pandas
findspark.init()
import pyspark
from  pyspark.sql import SparkSession 
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

### Read the data Crew.csv into spark dataframe
- inferSchema=True and header=True.
- Print the schema and show the first few rows.
- Use df.describe() to see the statistical properties of the data.

In [2]:
data = spark.read.format("csv").\
option("inferschema","true").\
option("header" ,"true").\
load("Crew.csv")

In [3]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
data.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     null|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [40]:
data.sort("ship_name").show(10)

+----------+----------------+---+-------+----------+------+------+-----------------+-----+
| Ship_name|     Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density| crew|
+----------+----------------+---+-------+----------+------+------+-----------------+-----+
| Adventure| Royal_Caribbean| 12|  138.0|     31.14|  10.2| 15.57|            44.32|11.85|
|   Allegra|           Costa| 21|  28.43|      8.08|  6.16|   4.1|            35.19|  4.0|
| Amsterdam|Holland_American| 13|   61.0|      13.8|   7.8|  6.88|             44.2|  6.0|
|   Arcadia|             P&O|  9|   85.0|     19.68|  9.35|  9.84|            43.19| 8.69|
|     Aries|            Star| 22|  3.341|      0.66|   2.8|  0.33|            50.62| 0.59|
|   Armonia|             MSC| 12|   58.6|     15.66|  8.24|  7.83|            37.42|  7.0|
|   Artemis|             P&O| 29|   45.0|     11.78|  7.54|   5.3|             38.2|  5.2|
| Atlantica|           Costa| 13| 85.619|     21.14|  9.57| 10.56|             40.5|  9.2|

### StringIndexer and OneHotEncoder 
- Create StringIndexer and OneHotEncoder to process the data.
- StringIndexer is for any string data type.
- OneHotEncoder will be applied to the StringIndexer columns.
- Convert all obtained columns from OneHotEncoder and the other numeric columns into a feature column (use VectorAssembler) 

In [11]:
data_types = data.dtypes
data_types

[('Ship_name', 'string'),
 ('Cruise_line', 'string'),
 ('Age', 'int'),
 ('Tonnage', 'double'),
 ('passengers', 'double'),
 ('length', 'double'),
 ('cabins', 'double'),
 ('passenger_density', 'double'),
 ('crew', 'double')]

In [17]:
cat_col = [v for (v,t) in data_types if t == "string" ]
cat_col 

['Ship_name', 'Cruise_line']

In [18]:
cat_col_indx = [s+'_index' for s in cat_col]
cat_col_indx

['Ship_name_index', 'Cruise_line_index']

In [19]:
cat_col_OHE = [s+'_OHE' for s in cat_col]
cat_col_OHE

['Ship_name_OHE', 'Cruise_line_OHE']

In [20]:
num_col = [v for (v,t) in data_types if (t == "double") & (v != "crew") ]
num_col

['Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

In [23]:
AllDataCol = num_col + cat_col_OHE
AllDataCol

['Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Ship_name_OHE',
 'Cruise_line_OHE']

### Divide the data into Train/Test

In [24]:
trainDF,testDf = data.randomSplit([0.8,0.2],seed=42)

### Create a Linear Regression Model 

In [25]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [28]:
strIndx = StringIndexer(inputCols=cat_col,outputCols=cat_col_indx,handleInvalid='skip')

ohe = OneHotEncoder(inputCols=cat_col_indx,outputCols=cat_col_OHE)

vecAss = VectorAssembler(inputCols=AllDataCol,outputCol='features')

lr = LinearRegression(featuresCol='features',labelCol='crew',predictionCol='prediction')

### Create a Pipeline model

In [29]:
pl = Pipeline(stages=[strIndx,ohe,vecAss,lr])


### Fit the Pipeline model to the trainig data

In [34]:
plModel = pl.fit(trainDF)

In [35]:
plModel.write().overwrite().save('plModelReg')

Py4JJavaError: An error occurred while calling o724.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: Could not locate Hadoop executable: C:\spark\spark-3.4.0-bin-hadoop3\bin\winutils.exe -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.hadoop.mapred.FileOutputCommitter.setupJob(FileOutputCommitter.java:131)
	at org.apache.hadoop.mapred.OutputCommitter.setupJob(OutputCommitter.java:265)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:79)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1091)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1089)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1062)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1027)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1009)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1008)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:965)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1593)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1593)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1579)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1579)
	at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$saveImpl$1(Pipeline.scala:250)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$saveImpl$1$adapted(Pipeline.scala:247)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:247)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.saveImpl(Pipeline.scala:346)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(events.scala:174)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent$(events.scala:169)
	at org.apache.spark.ml.util.Instrumentation.withSaveInstanceEvent(Instrumentation.scala:42)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$3(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$3$adapted(Pipeline.scala:344)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.save(Pipeline.scala:344)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.FileNotFoundException: Could not locate Hadoop executable: C:\spark\spark-3.4.0-bin-hadoop3\bin\winutils.exe -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getQualifiedBinInner(Shell.java:618)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:341)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:331)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:370)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:955)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:192)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:215)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1111)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1120)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)


### Make a prediction for the same training data and evaluate the model performance using RMSE and r2

In [43]:
pred = plModel.transform(trainDF)

In [44]:
pred.select("crew","prediction").show()

+-----+------------------+
| crew|        prediction|
+-----+------------------+
|11.85|11.841880037884803|
|  4.0| 4.001874983474391|
| 8.69| 8.697033614952687|
| 0.59|0.5874791415048413|
|  7.0| 7.006495363506672|
|  9.2| 9.199675984909963|
| 8.48| 8.473269309582985|
| 11.0|11.003333726785044|
|  6.7| 6.692131812096375|
| 8.58| 8.580507205296705|
|  2.1|2.0876456996437716|
| 19.1| 19.08980718717625|
| 9.99|  9.98992209136459|
|  9.0| 9.004201646754733|
|  4.7| 4.704188930155767|
| 11.0|10.847785565959772|
|  9.0| 9.158699357376708|
| 10.0|  9.99044597917515|
| 6.14| 6.143143696657559|
|  9.2| 9.191256741525603|
+-----+------------------+
only showing top 20 rows



In [46]:
from pyspark.ml.evaluation import RegressionEvaluator


In [51]:
regeval = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='rmse')
regeval.evaluate(pred)

0.08414002810710287

In [52]:

regeval = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='r2')
regeval.evaluate(pred)

0.9994359077926166

### Make a prediction for the test data and evaluate the model performance using RMSE and r2

In [31]:
pred = plModel.transform(testDf)


RMSE is 0.4725
r2 is 0.9870


In [53]:
remse = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='rmse')
r2 = RegressionEvaluator(predictionCol='prediction',labelCol='crew',metricName='r2')


In [54]:
print("rmse== ",remse.evaluate(pred))
print("r2== ",r2.evaluate(pred))

rmse==  0.08414002810710287
r2==  0.9994359077926166
