In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df = spark.sql(''' select 'spark' as hello ''')

df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc =SparkContext.getOrCreate()

import random


NUM_SAMPLE = 100000000

def inside(p):

    x, y = random.random(), random.random()

    return x*x + y*y < 1

count = sc.parallelize(range(0, NUM_SAMPLE)).filter(inside).count()

pi = 4*count / NUM_SAMPLE

print("Pi is roughly:", pi)


Pi is roughly: 3.14150708


In [3]:
from pyspark.sql.types import *


In [4]:
spark = SparkSession.builder.appName('iris_clf').getOrCreate()

In [5]:
df = spark.read.csv('Iris.csv',header=True, inferSchema=True)
df.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [7]:
schema = StructType( [
                        StructField('sepal_length', DoubleType()),
                        StructField('sepal_width', DoubleType()),
                        StructField('petal_length', DoubleType()),
                        StructField('petal_width', DoubleType()),
                        StructField('type', StringType()),
])

In [8]:
df2 = spark.read.csv('Iris.csv', header=True, schema= schema)
df2.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- type: string (nullable = true)



In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
input_col = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
vectorizer = VectorAssembler(inputCols= input_col, outputCol='features')

df = vectorizer.transform(df)

df.show(5)

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+---+-------------+------------+-------------+------------+-----------+-----------------+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import StringIndexer

In [13]:
indexer = StringIndexer(inputCol='Species', outputCol='indexed_type')

df = indexer.fit(df).transform(df)

df.show(5)


+---+-------------+------------+-------------+------------+-----------+-----------------+------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|indexed_type|
+---+-------------+------------+-------------+------------+-----------+-----------------+------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|         0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|         0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|         0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|         0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|         0.0|
+---+-------------+------------+-------------+------------+-----------+-----------------+------------+
only showing top 5 rows

