In [1]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=1f48b0e377b60f42b4412777af4fba656dd71446f6b6d225731e829424ce08cd
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


# Logistic Regression Code Along
This is a code along of the famous titanic dataset, its always nice to start off with this dataset because it is an example you will find across pretty much every data analysis language.

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('myproj').getOrCreate()

In [4]:
df=spark.read.csv('/content/sample_data/titanic.csv',inferSchema=True,header=True)

In [6]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
mycols=df.select(['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'])

In [9]:
my_final_data=mycols.na.drop()

## Working with Categorical Columns
Let's break this down into multiple steps to make it all clear.

In [10]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [14]:
gender_indexer=StringIndexer(inputCol='Sex',outputCol='SexIndex')
#A B C
#0 1 2
#ONE HOT ENCODER
#KEY A  B C
#Example A
#[1 , 0, 0]
gender_encoder=OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [15]:
embark_indexer=StringIndexer(inputCol='Embarked',outputCol='EmbarkedIndex')
embark_encoder=OneHotEncoder(inputCol='EmbarkedIndex',outputCol='EmbarkedVec')

In [24]:
assembler=VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'EmbarkedVec',
 'Age',
 'SibSp',
 'Parch',
 'Fare'
 ],outputCol='features')

In [25]:
from pyspark.ml.classification import LogisticRegression

## Pipelines
Let's see an example of how to use pipelines (we'll get a lot more practice with these later!)

In [26]:
from pyspark.ml import Pipeline

In [27]:
logreg_titanic=LogisticRegression(featuresCol='features',labelCol='Survived')

In [28]:
pipeline=Pipeline(stages=[gender_indexer,embark_indexer,
                          gender_encoder,embark_encoder,
                          assembler,logreg_titanic])

In [29]:
train_data,test_data=my_final_data.randomSplit([0.7,0.3])

In [30]:
fit_model=pipeline.fit(train_data)

In [31]:
results=fit_model.transform(test_data)

In [32]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [33]:
my_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [34]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [36]:
AUC=my_eval.evaluate(results)

In [37]:
AUC

0.7720588235294118