In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise create one named 'cruise'.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV: the first row is the header, and column types are
# inferred from the values instead of being read as strings.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/cruise_ship_info.csv')
)

In [4]:
# Print a preview of the loaded rows as a sanity check on the CSV parse.
data.show()

In [5]:
# Print the column names and the types that inferSchema assigned to them.
data.printSchema()

In [6]:
# Transform the categorical column into a numerical column
from pyspark.ml.feature import StringIndexer

In [7]:
# Encode the string-valued 'Cruise_line' column as a numeric index column
# so it can be fed to the vector assembler later on.
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_index',
)
# Learn the label -> index mapping from the data, then apply it to the same data.
indexer_model = indexer.fit(data)
indexerd = indexer_model.transform(data)
indexerd.show()

In [8]:
# List the available columns; the model's input features are picked from these.
indexerd.columns

In [9]:
# Input columns for the regression. 'crew' is deliberately NOT listed here:
# it is the label the model is trained to predict (labelCol='crew'), and
# including it as an input leaks the target into the features, which makes
# the reported RMSE / R^2 meaningless.
# NOTE(review): 'Cruise_line_index' is an integer code produced by
# StringIndexer and is used here as a plain numeric value; consider one-hot
# encoding it if an ordinal interpretation is not intended.
features = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_line_index',
]

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# Combine the selected input columns into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=features,
    outputCol='features',
)

In [12]:
# Apply the assembler to the indexed DataFrame, adding the 'features' vector column.
results = assembler.transform(indexerd)

In [13]:
# Inspect the rows, now including the assembled 'features' column.
results.show()

In [14]:
# Keep only what the regression needs: the feature vector and the 'crew' label.
final_data = results.select(['features', 'crew'])

In [15]:
# 70/30 train/test split. The fixed seed makes the split, and therefore the
# metrics computed from it, repeatable across Restart & Run All instead of
# changing on every execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Summary statistics for the training split.
train_data.describe().show()

In [17]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
# Linear regression predicting 'crew'. featuresCol is spelled out for clarity;
# 'features' is also the estimator's default.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [20]:
# Fit the linear regression on the training split.
trained_ship_model = lr.fit(train_data)

In [21]:
# Evaluate the fitted model on the held-out test split.
ship_results = trained_ship_model.evaluate(test_data)

In [22]:
# Root mean squared error of the predictions on the test split.
ship_results.rootMeanSquaredError

In [23]:
# R^2 (coefficient of determination) on the test split.
ship_results.r2

In [24]:
from pyspark.sql.functions import corr

In [25]:
# Check the Pearson correlation between 'crew' and 'cabins' on the full
# (unsplit) dataset.
data.select(corr('crew', 'cabins')).show()