In [0]:
# Input: format and location of the uploaded cruise-ship dataset
file_type = "csv"
file_location = "/FileStore/tables/cruise_ship_info-2.csv"

# CSV reader settings, kept as the string flags handed to option()
delimiter = ","
first_row_is_header = "true"
infer_schema = "true"

# Read the file into a Spark DataFrame. The three options are CSV-specific;
# for other file types they will be ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

In [0]:
# Summary statistics (count / mean / stddev / min / max) for every column of the raw frame.
# NOTE(review): `.display()` looks like the Databricks DataFrame helper — presumably
# unavailable in plain PySpark, where `.show()` would be needed; confirm the target runtime.
df.describe().display()

summary,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,Infinity,,15.689873417721518,71.28467088607599,18.45740506329114,8.130632911392404,8.830000000000005,39.90094936708861,7.794177215189873
stddev,,,7.615691058751413,37.229540025907866,9.677094775143416,1.793473548054825,4.4714172221480615,8.63921711391542,3.503486564627034
min,Adventure,Azamara,4.0,2.329,0.66,2.79,0.33,17.7,0.59
max,Zuiderdam,Windstar,48.0,220.0,54.0,11.82,27.0,71.43,21.0


In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [0]:
# List the column names of the raw frame to pick candidate features and the label.
df.columns

Out[10]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [0]:
# Check the inferred column types: the two name columns are strings, the rest numeric.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Encode the string column Cruise_line as a numeric index column, cruise_cat.
si = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat')

# Learn the label -> index mapping from the full frame, then apply it to that same frame.
si_model = si.fit(df)
si_1 = si_model.transform(df)

In [0]:
# Peek at the first two rows to confirm the new cruise_cat column was appended.
si_1.head(2)

Out[18]: [Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0)]

In [0]:
# Column names after indexing; copied into the VectorAssembler input list in the next step.
si_1.columns

Out[20]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat']

In [0]:
# Predictor columns packed into the model's feature vector. 'crew' is left out
# because it is the regression label; the two raw string columns are left out too.
# NOTE(review): cruise_cat is a StringIndexer index, so using it as a plain numeric
# feature imposes an ordering on cruise lines — consider one-hot encoding; confirm intent.
feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_cat',
]

# Assembler that combines the columns above into a single 'features' vector column.
vec = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [0]:
# Append the assembled 'features' vector column to the indexed frame.
# NOTE(review): the name `re` would shadow Python's stdlib regex module if that is
# ever imported in this notebook; it is kept here because later cells reference it.
re = vec.transform(si_1)

In [0]:
# Confirm the frame now carries a 'features' column of vector type next to the originals.
re.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- cruise_cat: double (nullable = false)
 |-- features: vector (nullable = true)



In [0]:
# Keep only what the regressor needs: the feature vector and the 'crew' label.
re_1 = re.select('features', 'crew')

In [0]:
# Inspect the first two (features, crew) rows of the modelling frame.
re_1.head(2)

Out[29]: [Row(features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]), crew=3.55),
 Row(features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]), crew=3.55)]

In [0]:
# Linear regression estimator predicting 'crew' from the assembled 'features' vector.
# featuresCol is written out for readability; 'features' is the estimator's default.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

In [0]:
# Split into roughly 70% training / 30% test rows. The split is random, so the
# resulting sizes are approximate rather than exact.
# A fixed seed makes the partition — and therefore every metric computed from it —
# repeatable across Restart & Run All; without it each run reports different scores.
# Re-run the downstream cells after changing the seed, since their outputs depend on it.
tr_dt, test_dt = re_1.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit the linear regression on the training split; `lo` is the fitted model.
lo=lr.fit(tr_dt)

In [0]:
# Score the fitted model on the held-out test split; the result exposes the metrics read below.
test_res = lo.evaluate(test_dt)

In [0]:
# R-squared of the model on the test split.
test_res.r2

Out[44]: 0.9544127303943774

In [0]:
# Root mean squared error on the test split, in the same units as the 'crew' column.
test_res.rootMeanSquaredError

Out[47]: 0.6406152128499126

In [0]:
# Summary of the test-split label ('crew'): its mean and stddev give the scale
# against which the RMSE above should be judged.
test_dt.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                53|
|   mean| 7.681509433962264|
| stddev|3.0290868109981814|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+

