# Linear Regression Project

Project from the Udemy course "Spark and Python for Big Data with PySpark"

Aim: build a linear regression model that accurately estimates how many crew members a ship will require, based on the ship's measured characteristics.

Data available:

    Description: Measurements of ship size, capacity, crew, and age for 158 cruise
    ships.

    Variables/Columns
    Ship Name     1-20
    Cruise Line   21-40
    Age (as of 2013)   46-48
    Tonnage (1000s of tons)   50-56
    passengers (100s)   58-64
    Length (100s of feet)  66-72
    Cabins  (100s)   74-80
    Passenger Density   82-88
    Crew  (100s)   90-96

In [3]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook, registered under the app name 'cruise'.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [4]:
# Load the cruise-ship CSV: the first row supplies the column names and
# Spark infers each column's type from the values.
data = spark.read.csv(
    "/FileStore/tables/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the loaded DataFrame (show() prints the first 20 rows by default).
data.show()

In [6]:
# Bare expression so the notebook displays the list of column names.
data.columns

In [7]:
# Print summary statistics for the columns of the raw data.
summary_stats = data.describe()
summary_stats.show()

In [8]:
# Count how many ships belong to each cruise line.
ships_per_line = data.groupBy('Cruise_line').count()
ships_per_line.show()

In [9]:
from pyspark.ml.feature import StringIndexer

In [10]:
# Encode the categorical Cruise_line column as a numeric index column
# named Cruise_lineIndex, then display the result.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_lineIndex")
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)
indexed.show()

In [11]:
from pyspark.ml.regression import LinearRegression

In [12]:
#Setting up the dataframe for machine learning
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the single 'features' vector column.
# NOTE(review): 'Cruise_lineIndex' is a StringIndexer category index and is
# passed along here as an ordinary number — confirm that is intended.
feature_columns = [
    'Cruise_lineIndex',
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
# Add the assembled 'features' vector column to the indexed DataFrame.
output = assembler.transform(indexed)

In [14]:
# Bare expression so the notebook displays the first row, including the
# assembled 'features' vector.
output.head(1)

In [15]:
# Keep only the two columns used for modelling: the feature vector and
# the 'crew' column.
final_data = output.select("features", 'crew')
final_data.show()

In [16]:
# Hold out roughly 30% of the rows for testing. A fixed seed makes the random
# split repeatable, so the metrics computed from it can be compared across
# runs instead of changing every time the notebook is executed.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Print summary statistics for the training split.
train_summary = train_data.describe()
train_summary.show()

In [18]:
# Print summary statistics for the test split.
test_summary = test_data.describe()
test_summary.show()

In [19]:
# Linear regression estimator: reads the 'features' vector column as input
# (spelled out here; it is also the default) and 'crew' as the label.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [20]:
# Fit the linear regression on the training split only; the fitted model is
# kept in lrModel.
lrModel = lr.fit(train_data)

In [21]:
# Evaluate the fitted model on the held-out test split.
test_results = lrModel.evaluate(test_data)

In [22]:
# Display the residuals from the test-set evaluation.
residuals = test_results.residuals
residuals.show()

In [23]:
# Bare expression so the notebook displays the root mean squared error from
# the test-set evaluation.
test_results.rootMeanSquaredError

In [24]:
# Bare expression so the notebook displays the R-squared value from the
# test-set evaluation.
test_results.r2

In [25]:
from pyspark.sql.functions import corr

In [26]:
# Correlation between crew size and passenger count over the full dataset.
crew_passengers_corr = corr('crew', 'passengers')
data.select(crew_passengers_corr).show()

In [27]:
# Correlation between crew size and cabin count over the full dataset.
crew_cabins_corr = corr('crew', 'cabins')
data.select(crew_cabins_corr).show()