In [1]:
import pyspark

## Spark DataFrame Basics

In [2]:
from pyspark.sql import SparkSession,SQLContext

In [3]:
spark = SparkSession.builder.appName('Basics')

In [4]:
spark = spark.getOrCreate()

In [5]:
df = spark.read.json('/home/ubuntu/pyspark/data/people.json')

In [6]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [23]:
df.columns

['age', 'name']

In [8]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
df.describe()

DataFrame[summary: string, age: string, name: string]

In [8]:
appl_stock = spark.read.csv('/home/ubuntu/pyspark/data/appl_stock.csv')

In [9]:
appl_stock.show()

++
||
++
++



In [13]:
!df -h ~/

Filesystem      Size  Used Avail Use% Mounted on
/dev/xvda1      7.7G  3.3G  4.5G  43% /


In [10]:
df.describe()

DataFrame[summary: string, age: string, name: string]

In [12]:
!java -version

openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)


In [11]:
from pyspark.sql.types import *


### Learning Apache 

#### https://runawayhorse001.github.io/LearningApacheSpark

## Programming with RDDs

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark create RDD example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

### Using parallelize( )

In [2]:
df = spark.sparkContext.parallelize([('1','Joe','70000','1'),('2', 'Henry', '80000', None)]).toDF(['Id', 'Name', 'Sallary','DepartmentId'])

In [3]:
df.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|        null|
+---+-----+-------+------------+



### Using createDataFrame( )

In [7]:
df = spark.createDataFrame([('1', 'Joe', '70000', '1'),
('2', 'Henry', '80000', None)],
['Id','Name','Sallary','DepartmentId'])

In [8]:
df.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|        null|
+---+-----+-------+------------+



In [10]:
df = spark.sparkContext.parallelize([(1, 2, 3, 'a b c'),
             (4, 5, 6, 'd e f'),
             (7, 8, 9, 'g h i')]).toDF(['col1', 'col2', 'col3','col4'])

In [12]:
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [20]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark create RDD example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

myData = spark.sparkContext.parallelize([(1,2), (3,4), (5,6), (7,8), (9,10)])

In [22]:
myData.collect()

[(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]

In [23]:
Employee = spark.createDataFrame([
                        ('1', 'Joe',   '70000', '1'),
                        ('2', 'Henry', '80000', '2'),
                        ('3', 'Sam',   '60000', '2'),
                        ('4', 'Max',   '90000', '1')],
                        ['Id', 'Name', 'Sallary','DepartmentId']
                       )

In [24]:
Employee.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|           2|
|  3|  Sam|  60000|           2|
|  4|  Max|  90000|           1|
+---+-----+-------+------------+



In [25]:
df = spark.read.format('com.databricks.spark.csv').\
                               options(header='true', \
                               inferschema='true').\
                load("/home/ubuntu/pyspark/data/Advertising.csv",header=True)

In [26]:
df.show()

+---+-----+-----+---------+-----+
|_c0|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
|  6|  8.7| 48.9|     75.0|  7.2|
|  7| 57.5| 32.8|     23.5| 11.8|
|  8|120.2| 19.6|     11.6| 13.2|
|  9|  8.6|  2.1|      1.0|  4.8|
| 10|199.8|  2.6|     21.2| 10.6|
| 11| 66.1|  5.8|     24.2|  8.6|
| 12|214.7| 24.0|      4.0| 17.4|
| 13| 23.8| 35.1|     65.9|  9.2|
| 14| 97.5|  7.6|      7.2|  9.7|
| 15|204.1| 32.9|     46.0| 19.0|
| 16|195.4| 47.7|     52.9| 22.4|
| 17| 67.8| 36.6|    114.0| 12.5|
| 18|281.4| 39.6|     55.8| 24.4|
| 19| 69.2| 20.5|     18.3| 11.3|
| 20|147.3| 23.9|     19.1| 14.6|
+---+-----+-----+---------+-----+
only showing top 20 rows



In [27]:
ds = spark.read.csv(path='/home/ubuntu/pyspark/data/Advertising.csv',header=True,
               inferSchema=True)

In [29]:
ds.show(5)

+---+-----+-----+---------+-----+
|_c0|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows



In [31]:
ds.head(5)

[Row(_c0=1, TV=230.1, radio=37.8, newspaper=69.2, sales=22.1),
 Row(_c0=2, TV=44.5, radio=39.3, newspaper=45.1, sales=10.4),
 Row(_c0=3, TV=17.2, radio=45.9, newspaper=69.3, sales=9.3),
 Row(_c0=4, TV=151.5, radio=41.3, newspaper=58.5, sales=18.5),
 Row(_c0=5, TV=180.8, radio=10.8, newspaper=58.4, sales=12.9)]

In [39]:
ds.select(['_c0','TV']).collect()[0][0]

1

### Creating RDD sparkconf

In [1]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [6]:
sc = SparkContext('local','example')

In [7]:
data=sc.textFile("/home/ubuntu/pyspark/data/Advertising.csv")

In [4]:
data.first()

'Sno,TV,radio,newspaper,sales'

In [8]:
data_map=data.map(lambda s: len(s))

In [11]:
data_map.collect()[:5]

[28, 22, 21, 20, 22]

In [12]:
#data.map(lambda s: len(s)).reduce(lambda a, b: a + b)
data_map.reduce(lambda a, b: a + b)

4357

## Test Training cheatsheat sample

In [17]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Pyspark1").config("config.option","value").getOrCreate()

In [19]:
Adv_csv="/home/ubuntu/pyspark/data/Advertising.csv"
import csv
with open(Adv_csv) as f:
    adv_csv_file=f.readlines()
    csvout=csv.reader(adv_csv_file)
header=next(csvout)
rows=[row for row in csvout]

In [20]:
header

['Sno', 'TV', 'radio', 'newspaper', 'sales']

In [21]:
rows[0]

['1', '230.1', '37.8', '69.2', '22.1']

In [22]:
spark.sparkContext.parallelize(rows)

ParallelCollectionRDD[7] at parallelize at PythonRDD.scala:195

In [23]:
spark_rows = spark.sparkContext.parallelize(rows)
spark_rows.collect()[:2]

[['1', '230.1', '37.8', '69.2', '22.1'], ['2', '44.5', '39.3', '45.1', '10.4']]

### Creating RDD DataFrame

#### From List

In [16]:
### Getting list values from aAdvertising files
print("row list :",rows[0])
print("header list :",header)


row list : ['1', '230.1', '37.8', '69.2', '22.1']
header list : ['Sno', 'TV', 'radio', 'newspaper', 'sales']


In [17]:
df=spark.createDataFrame(rows,header)

In [18]:
df.show(5)

+---+-----+-----+---------+-----+
|Sno|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows



In [19]:
df.describe()

DataFrame[summary: string, Sno: string, TV: string, radio: string, newspaper: string, sales: string]

In [20]:
df.printSchema()

root
 |-- Sno: string (nullable = true)
 |-- TV: string (nullable = true)
 |-- radio: string (nullable = true)
 |-- newspaper: string (nullable = true)
 |-- sales: string (nullable = true)



#### From Dict

In [21]:
d = {'A': [0, 1, 0],
     'B': [1, 0, 1],
     'C': [1, 0, 0]}

In [22]:
import numpy as np

In [23]:
np.array(list(d.values())).T.tolist(),list(d.keys())

([[0, 1, 1], [1, 0, 0], [0, 1, 0]], ['A', 'B', 'C'])

In [24]:
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  0|  1|  1|
|  1|  0|  0|
|  0|  1|  0|
+---+---+---+



### RDD Parallelized Collections

In [13]:
data_file=sc.textFile("/home/ubuntu/pyspark/data/Advertising.csv")

In [14]:
data_file.collect()[:2]

['Sno,TV,radio,newspaper,sales', '1,230.1,37.8,69.2,22.1']

In [15]:
data_file.map(lambda s:s).collect()[:3]

['Sno,TV,radio,newspaper,sales',
 '1,230.1,37.8,69.2,22.1',
 '2,44.5,39.3,45.1,10.4']

In [41]:
data_file.map(lambda s:len(s)).reduce(lambda a, b: a + b)

4357

In [45]:
data_file.map(lambda s:len(s)).persist()

PythonRDD[42] at RDD at PythonRDD.scala:53

#### Passing Functions to Spark

In [46]:
def func(word):
    wordlist = word.split(",")
    return len(wordlist)

data_file.map(func).collect()[:2]

[5, 5]

In [47]:
data_file.map(func).reduce(lambda a, b: a + b)

1005

### Shared Variables

#### Broadcast Variables

In [8]:
broadcastVar = sc.broadcast([1, 2, 3])

In [9]:
broadcastVar.value

[1, 2, 3]

In [10]:
broadcastVar.unpersist()

#### Accumulators

In [11]:
accum = sc.accumulator(0)
accum

Accumulator<id=0, value=0>

In [12]:
sc.parallelize([1, 2, 3, 4]).foreach(lambda x: accum.add(x))

In [16]:
accum.value

10

## Creating DataFrame pyspark

In [28]:
Adv_csv

'/home/ubuntu/pyspark/data/Advertising.csv'

In [32]:
pydata = spark.read.csv(Adv_csv,header=True)

In [34]:
pydata.show(4)

+---+-----+-----+---------+-----+
|Sno|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
+---+-----+-----+---------+-----+
only showing top 4 rows



In [36]:
pydata.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               Sno|               TV|             radio|         newspaper|             sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|                 0|               0.3|               1.6|
|    max|                99|             97.5|               9.9|               9.5|               9.9|
+-------+------------------+-----------------+------------------+------------------+------------------+



In [41]:
pydata.columns

['Sno', 'TV', 'radio', 'newspaper', 'sales']

In [47]:
pydata.toDF('sno', 'tv', 'radio', 'newspaper', 'sales').show(4)

+---+-----+-----+---------+-----+
|sno|   tv|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
+---+-----+-----+---------+-----+
only showing top 4 rows



In [45]:
pydata.columns

['Sno', 'TV', 'radio', 'newspaper', 'sales']

In [46]:
pydata.show(4)

+---+-----+-----+---------+-----+
|Sno|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
+---+-----+-----+---------+-----+
only showing top 4 rows



In [51]:
mapping = {'newspaper':'C','sales':'D'}

In [53]:
new_names = [mapping.get(col,col) for col in pydata.columns]
new_names

['Sno', 'TV', 'radio', 'C', 'D']

In [54]:
pydata.columns

['Sno', 'TV', 'radio', 'newspaper', 'sales']

In [55]:
pydata.withColumnRenamed('radio','Radio').show(2)

+---+-----+-----+---------+-----+
|Sno|   TV|Radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
+---+-----+-----+---------+-----+
only showing top 2 rows



In [57]:
drop_name = ['Newspaper','Sales']
pydata.drop(*drop_name).show(4)

+---+-----+-----+
|Sno|   TV|radio|
+---+-----+-----+
|  1|230.1| 37.8|
|  2| 44.5| 39.3|
|  3| 17.2| 45.9|
|  4|151.5| 41.3|
+---+-----+-----+
only showing top 4 rows



In [60]:
pydata[pydata.TV < 5 ].collect()

[Row(Sno='131', TV='0.7', radio='39.6', newspaper='8.7', sales='1.6'),
 Row(Sno='156', TV='4.1', radio='11.6', newspaper='5.7', sales='3.2')]

In [61]:
pydata[pydata.TV < 5 ].show()

+---+---+-----+---------+-----+
|Sno| TV|radio|newspaper|sales|
+---+---+-----+---------+-----+
|131|0.7| 39.6|      8.7|  1.6|
|156|4.1| 11.6|      5.7|  3.2|
+---+---+-----+---------+-----+



In [62]:
pydata[(pydata.TV < 15 ) & (pydata.radio < 30)].show() 

+---+----+-----+---------+-----+
|Sno|  TV|radio|newspaper|sales|
+---+----+-----+---------+-----+
|  9| 8.6|  2.1|        1|  4.8|
| 23|13.2| 15.9|     49.6|  5.6|
| 57| 7.3| 28.1|     41.4|  5.5|
| 79| 5.4| 29.9|      9.4|  5.3|
|109|13.1|  0.4|     25.6|  5.3|
|133| 8.4| 27.2|      2.1|  5.7|
|156| 4.1| 11.6|      5.7|  3.2|
+---+----+-----+---------+-----+



#### With New Column

In [70]:
# pydata.withColumn('tv_norm', int()).show(4)
from pyspark.sql.functions import sum

In [73]:
pydata.groupBy().agg(sum("TV")).collect()[0][0]

29408.499999999996

In [76]:
pydata.withColumn('tv_norm', pydata.TV/pydata.groupBy().agg(sum("TV")).collect()[0][0]).show(4)

+---+-----+-----+---------+-----+--------------------+
|Sno|   TV|radio|newspaper|sales|             tv_norm|
+---+-----+-----+---------+-----+--------------------+
|  1|230.1| 37.8|     69.2| 22.1|0.007824268493802813|
|  2| 44.5| 39.3|     45.1| 10.4|0.001513167961643...|
|  3| 17.2| 45.9|     69.3|  9.3|5.848649200061207E-4|
|  4|151.5| 41.3|     58.5| 18.5|0.005151571824472517|
+---+-----+-----+---------+-----+--------------------+
only showing top 4 rows



In [93]:
pydata.select(['TV']).collect()[0][0]

'230.1'

#### Join

In [94]:
import pandas as pd

In [95]:
leftp = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                    index=[0, 1, 2, 3])

rightp = pd.DataFrame({'A': ['A0', 'A1', 'A6', 'A7'],
                       'F': ['B4', 'B5', 'B6', 'B7'],
                       'G': ['C4', 'C5', 'C6', 'C7'],
                       'H': ['D4', 'D5', 'D6', 'D7']},
                       index=[4, 5, 6, 7])

print("Left table\n", leftp)
print("Right Table\n", rightp)

Left table
     A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
Right Table
     A   F   G   H
4  A0  B4  C4  D4
5  A1  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7


In [96]:
lefts = spark.createDataFrame(leftp)
rights = spark.createDataFrame(rightp)

In [99]:
lefts.show()

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
| A0| B0| C0| D0|
| A1| B1| C1| D1|
| A2| B2| C2| D2|
| A3| B3| C3| D3|
+---+---+---+---+



In [100]:
rights.show()

+---+---+---+---+
|  A|  F|  G|  H|
+---+---+---+---+
| A0| B4| C4| D4|
| A1| B5| C5| D5|
| A6| B6| C6| D6|
| A7| B7| C7| D7|
+---+---+---+---+



In [101]:
leftp.merge(rightp,on='A',how='left') # pandas dataframe

Unnamed: 0,A,B,C,D,F,G,H
0,A0,B0,C0,D0,B4,C4,D4
1,A1,B1,C1,D1,B5,C5,D5
2,A2,B2,C2,D2,,,
3,A3,B3,C3,D3,,,


In [102]:
lefts.join(rights, on='A', how='left').show()

+---+---+---+---+----+----+----+
|  A|  B|  C|  D|   F|   G|   H|
+---+---+---+---+----+----+----+
| A2| B2| C2| D2|null|null|null|
| A0| B0| C0| D0|  B4|  C4|  D4|
| A3| B3| C3| D3|null|null|null|
| A1| B1| C1| D1|  B5|  C5|  D5|
+---+---+---+---+----+----+----+



In [103]:
lefts.join(rights, on='A', how='left').orderBy('A',ascending=True).show()

+---+---+---+---+----+----+----+
|  A|  B|  C|  D|   F|   G|   H|
+---+---+---+---+----+----+----+
| A0| B0| C0| D0|  B4|  C4|  D4|
| A1| B1| C1| D1|  B5|  C5|  D5|
| A2| B2| C2| D2|null|null|null|
| A3| B3| C3| D3|null|null|null|
+---+---+---+---+----+----+----+



In [105]:
lefts.join(rights,on='A',how='right').orderBy('A',ascending=True).show()

+---+----+----+----+---+---+---+
|  A|   B|   C|   D|  F|  G|  H|
+---+----+----+----+---+---+---+
| A0|  B0|  C0|  D0| B4| C4| D4|
| A1|  B1|  C1|  D1| B5| C5| D5|
| A6|null|null|null| B6| C6| D6|
| A7|null|null|null| B7| C7| D7|
+---+----+----+----+---+---+---+



In [106]:
lefts.join(rights,on='A',how='inner').orderBy('A',ascending=True).show()

+---+---+---+---+---+---+---+
|  A|  B|  C|  D|  F|  G|  H|
+---+---+---+---+---+---+---+
| A0| B0| C0| D0| B4| C4| D4|
| A1| B1| C1| D1| B5| C5| D5|
+---+---+---+---+---+---+---+



In [107]:
lefts.join(rights,on='A',how='full').orderBy('A',ascending=True).show()

+---+----+----+----+----+----+----+
|  A|   B|   C|   D|   F|   G|   H|
+---+----+----+----+----+----+----+
| A0|  B0|  C0|  D0|  B4|  C4|  D4|
| A1|  B1|  C1|  D1|  B5|  C5|  D5|
| A2|  B2|  C2|  D2|null|null|null|
| A3|  B3|  C3|  D3|null|null|null|
| A6|null|null|null|  B6|  C6|  D6|
| A7|null|null|null|  B7|  C7|  D7|
+---+----+----+----+----+----+----+

