# Importation des librairies et du dataset

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

# Démarage de PySpark

In [9]:
spark = SparkSession.builder.master("local[2]").appName("mushroom_classification").getOrCreate()

In [10]:
spark

In [11]:
sc = spark.sparkContext
sc

In [12]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext



<pyspark.sql.context.SQLContext at 0x2a28bb41dc0>

# Chargement de la data dans PySpark

In [14]:
MUSHROOM_DATA = "./mushrooms.csv"

# Attribute Information:
# 	- classes: edible=e,  poisonous=p
# 	- cap-shape: bell=b, conical=c, convex=x, flat=f,  knobbed=k, sunken=s
# 	- cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
# 	- cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
# 	- bruises: bruises=t, no=f
# 	- odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
# 	- gill-attachment: attached=a, descending=d, free=f, notched=n
# 	- gill-spacing: close=c, crowded=w, distant=d
# 	- gill-size: broad=b, narrow=n
# 	- gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g,  green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
# 	- stalk-shape: enlarging=e, tapering=t
# 	- stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
# 	- stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
# 	- stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
# 	- stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
# 	- stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
# 	- veil-type: partial=p, universal=u
# 	- veil-color: brown=n, orange=o, white=w, yellow=y
# 	- ring-number: none=n, one=o, two=t
# 	- ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
# 	- spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
# 	- population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
# 	- habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

In [22]:
# Définition du schéma de la table
schema = schema = StructType([
    StructField("class", StringType(), nullable=True),
    StructField("cap-shape", StringType(), nullable=True),
    StructField("cap-surface", StringType(), nullable=True),
    StructField("cap-color", StringType(), nullable=True),
    StructField("odor", StringType(), nullable=True),
    StructField("gill-attachment", StringType(), nullable=True),
    StructField("gill-spacing", StringType(), nullable=True),
    StructField("gill-size", StringType(), nullable=True),
    StructField("gill-color", StringType(), nullable=True),
    StructField("stalk-shape", StringType(), nullable=True),
    StructField("stalk-root", StringType(), nullable=True),
    StructField("stalk-surface-above-ring", StringType(), nullable=True),
    StructField("stalk-surface-below-ring", StringType(), nullable=True),
    StructField("stalk-color-above-ring", StringType(), nullable=True),
    StructField("stalk-color-below-ring", StringType(), nullable=True),
    StructField("veil-type", StringType(), nullable=True),
    StructField("veil-color", StringType(), nullable=True),
    StructField("ring-number", StringType(), nullable=True),
    StructField("ring-type", StringType(), nullable=True),
    StructField("spore-print-color", StringType(), nullable=True),
    StructField("population", StringType(), nullable=True),
    StructField("habitat", StringType(), nullable=True)
])

In [23]:
mushroom_df = spark.read.csv(path=MUSHROOM_DATA, schema=schema).cache()

In [24]:
mushroom_df.take(5)

[Row(class='class', cap-shape='cap-shape', cap-surface='cap-surface', cap-color='cap-color', odor='bruises', gill-attachment='odor', gill-spacing='gill-attachment', gill-size='gill-spacing', gill-color='gill-size', stalk-shape='gill-color', stalk-root='stalk-shape', stalk-surface-above-ring='stalk-root', stalk-surface-below-ring='stalk-surface-above-ring', stalk-color-above-ring='stalk-surface-below-ring', stalk-color-below-ring='stalk-color-above-ring', veil-type='stalk-color-below-ring', veil-color='veil-type', ring-number='veil-color', ring-type='ring-number', spore-print-color='ring-type', population='spore-print-color', habitat='population'),
 Row(class='p', cap-shape='x', cap-surface='s', cap-color='n', odor='t', gill-attachment='p', gill-spacing='f', gill-size='c', gill-color='n', stalk-shape='k', stalk-root='e', stalk-surface-above-ring='e', stalk-surface-below-ring='s', stalk-color-above-ring='s', stalk-color-below-ring='w', veil-type='w', veil-color='p', ring-number='w', ring

In [25]:
mushroom_df.show(5)

+-----+---------+-----------+---------+-------+---------------+---------------+------------+----------+-----------+-----------+------------------------+------------------------+----------------------+----------------------+--------------------+----------+-----------+-----------+-----------------+-----------------+----------+
|class|cap-shape|cap-surface|cap-color|   odor|gill-attachment|   gill-spacing|   gill-size|gill-color|stalk-shape| stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|           veil-type|veil-color|ring-number|  ring-type|spore-print-color|       population|   habitat|
+-----+---------+-----------+---------+-------+---------------+---------------+------------+----------+-----------+-----------+------------------------+------------------------+----------------------+----------------------+--------------------+----------+-----------+-----------+-----------------+-----------------+----------+
|class|cap-shape|ca