# Importation des librairies et du dataset

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Read the mushroom CSV file into a pandas DataFrame
mushroom_df = pd.read_csv(filepath_or_buffer="./mushrooms.csv")

In [8]:
# Preview the first five rows of the pandas DataFrame
mushroom_df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# Démarrage de PySpark

In [9]:
# Build (or reuse, if one already exists) a Spark session that runs locally
# with two worker threads, registered under the given application name.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("mushroom_classification")
    .getOrCreate()
)

In [10]:
# Show the Spark session object as this cell's output
spark

In [11]:
# Keep a handle on the SparkContext that backs the session, then display it
sc = spark.sparkContext
sc

In [12]:
# Wrap the session's SparkContext in a SQLContext, then display it.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession (the `spark` object) — confirm whether it is still needed.
sqlContext = SQLContext(spark.sparkContext)
sqlContext



<pyspark.sql.context.SQLContext at 0x2a28bb41dc0>

# Chargement des données dans PySpark

In [13]:
MUSHROOM_DATA = "./mushrooms.csv"

# Path to the mushroom CSV file (relative to the notebook's working directory).
#
# Attribute information — every column holds a one-letter code:
#   - class: edible=e, poisonous=p
#   - cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
#   - cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
#   - cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
#   - bruises: bruises=t, no=f
#   - odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
#   - gill-attachment: attached=a, descending=d, free=f, notched=n
#   - gill-spacing: close=c, crowded=w, distant=d
#   - gill-size: broad=b, narrow=n
#   - gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
#   - stalk-shape: enlarging=e, tapering=t
#   - stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
#   - stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
#   - stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
#   - stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#   - stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#   - veil-type: partial=p, universal=u
#   - veil-color: brown=n, orange=o, white=w, yellow=y
#   - ring-number: none=n, one=o, two=t
#   - ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
#   - spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
#   - population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
#   - habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

In [None]:
# Table schema for the mushroom dataset.
#
# Column names are listed in the same order as the CSV header, because a
# user-supplied schema is matched to CSV columns by position. "bruises" must
# therefore sit between "cap-color" and "odor"; leaving it out would shift
# every later field by one column.
mushroom_columns = [
    "class",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat",
]

# Every value is a one-letter code stored as a nullable string. StringType is
# used rather than CharType because CharType requires an explicit length
# argument and is only available in recent PySpark releases.
schema = StructType(
    [StructField(name, StringType(), nullable=True) for name in mushroom_columns]
)