In [3]:
!pip install findspark

Collecting findspark
  Using cached findspark-1.3.0-py2.py3-none-any.whl (3.0 kB)
Installing collected packages: findspark
Successfully installed findspark-1.3.0


In [4]:

!pip install pyspark

Collecting pyspark
  Downloading pyspark-2.4.4.tar.gz (215.7 MB)
[K     |████████████████████████████████| 215.7 MB 28 kB/s s eta 0:00:01  |▍                               | 2.4 MB 3.1 MB/s eta 0:01:10     |███                             | 20.4 MB 13.4 MB/s eta 0:00:15     |███▊                            | 25.2 MB 13.4 MB/s eta 0:00:15     |████▎                           | 29.0 MB 9.0 MB/s eta 0:00:21     |████▌                           | 30.7 MB 9.0 MB/s eta 0:00:21     |████▊                           | 31.5 MB 9.0 MB/s eta 0:00:21     |████▉                           | 32.3 MB 9.0 MB/s eta 0:00:21     |█████▎                          | 35.7 MB 9.0 MB/s eta 0:00:20     |████████▌                       | 57.2 MB 43.2 MB/s eta 0:00:04     |████████▋                       | 58.0 MB 43.2 MB/s eta 0:00:04     |██████████████▊                 | 99.5 MB 4.8 MB/s eta 0:00:25     |████████████████▌               | 111.0 MB 8.1 MB/s eta 0:00:13     |█████████████████               | 113.9

In [1]:
import pyspark

## Spark DataFrame Basics

In [2]:
from pyspark.sql import SparkSession,SQLContext

In [3]:
# Build (or reuse) the session in one expression so that `spark` only ever
# refers to a SparkSession, never to the intermediate Builder. This keeps the
# cell idempotent: calling getOrCreate() on an existing SparkSession instance
# would fail, because that method lives on the Builder.
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [5]:
# Load the sample people records (JSON) into a DataFrame.
people_json = '/home/ubuntu/pyspark/data/people.json'
df = spark.read.json(people_json)

In [6]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [23]:
# List the DataFrame's column names.
df.columns

['age', 'name']

In [8]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
# describe() returns a new summary DataFrame; as the cell's last expression only
# its repr (column names and types) is echoed. Chain .show() to print the
# statistics themselves.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [8]:
# Read the stock CSV with the same options used for the Advertising CSV in this
# notebook: take column names from the first row and infer column types
# instead of loading everything as untyped string columns.
# NOTE(review): assumes the file's first row is a header - confirm against the data.
appl_stock = spark.read.csv(
    '/home/ubuntu/pyspark/data/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Print the rows loaded from appl_stock.csv.
# NOTE(review): the recorded output is a table with no columns - confirm the
# CSV file on disk is not empty.
appl_stock.show()

++
||
++
++



In [13]:
# Shell escape: report size/used/available space (human-readable units) for the
# filesystem that holds the home directory.
!df -h ~/

Filesystem      Size  Used Avail Use% Mounted on
/dev/xvda1      7.7G  3.3G  4.5G  43% /


In [10]:
# Returns the summary DataFrame without printing its rows; only the repr
# (column names and types) is echoed. Chain .show() to see the statistics.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [12]:
# Shell escape: print the version of the Java runtime found on PATH.
!java -version

openjdk version "1.8.0_242"
OpenJDK Runtime Environment (build 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.242-b08, mixed mode)


In [11]:
from pyspark.sql.types import *


### Learning Apache Spark

#### https://runawayhorse001.github.io/LearningApacheSpark

## Programming with RDDs

In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by the RDD examples below.
spark = (
    SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### Using parallelize( )

In [2]:
# Distribute two employee tuples as an RDD, then name the columns via toDF().
employee_rows = [
    ('1', 'Joe', '70000', '1'),
    ('2', 'Henry', '80000', None),
]
employee_columns = ['Id', 'Name', 'Sallary', 'DepartmentId']
df = spark.sparkContext.parallelize(employee_rows).toDF(employee_columns)

In [3]:
# Print the DataFrame built from the parallelized tuples.
df.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|        null|
+---+-----+-------+------------+



### Using createDataFrame( )

In [7]:
# Build the same two-row employee table directly from local Python data.
employee_rows = [
    ('1', 'Joe', '70000', '1'),
    ('2', 'Henry', '80000', None),
]
employee_columns = ['Id', 'Name', 'Sallary', 'DepartmentId']
df = spark.createDataFrame(employee_rows, employee_columns)

In [8]:
# Print the DataFrame produced by createDataFrame().
df.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|        null|
+---+-----+-------+------------+



In [10]:
# Three rows mixing integer columns with one space-separated string column.
mixed_rows = [
    (1, 2, 3, 'a b c'),
    (4, 5, 6, 'd e f'),
    (7, 8, 9, 'g h i'),
]
mixed_columns = ['col1', 'col2', 'col3', 'col4']
df = spark.sparkContext.parallelize(mixed_rows).toDF(mixed_columns)

In [12]:
# Print the four-column DataFrame as a text table.
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [20]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this example.
spark = (
    SparkSession.builder
    .appName("Python Spark create RDD example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Distribute five integer pairs as an RDD.
number_pairs = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]
myData = spark.sparkContext.parallelize(number_pairs)

In [22]:
# Bring every element of the RDD back to the driver as a Python list.
myData.collect()

[(1, 2), (3, 4), (5, 6), (7, 8), (9, 10)]

In [23]:
# Four-row employee table; every value is kept as a string.
employee_records = [
    ('1', 'Joe', '70000', '1'),
    ('2', 'Henry', '80000', '2'),
    ('3', 'Sam', '60000', '2'),
    ('4', 'Max', '90000', '1'),
]
employee_fields = ['Id', 'Name', 'Sallary', 'DepartmentId']
Employee = spark.createDataFrame(employee_records, schema=employee_fields)

In [24]:
# Print the Employee DataFrame as a text table.
Employee.show()

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|           2|
|  3|  Sam|  60000|           2|
|  4|  Max|  90000|           1|
+---+-----+-------+------------+



In [25]:
# Load the Advertising data through the generic format()/load() reader API.
# Uses Spark's built-in "csv" source rather than the legacy external
# "com.databricks.spark.csv" name, and sets the header option once instead of
# in both options() and load().
df = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load("/home/ubuntu/pyspark/data/Advertising.csv")
)

In [26]:
# Print the Advertising DataFrame; show() with no argument stops after 20 rows.
df.show()

+---+-----+-----+---------+-----+
|_c0|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
|  6|  8.7| 48.9|     75.0|  7.2|
|  7| 57.5| 32.8|     23.5| 11.8|
|  8|120.2| 19.6|     11.6| 13.2|
|  9|  8.6|  2.1|      1.0|  4.8|
| 10|199.8|  2.6|     21.2| 10.6|
| 11| 66.1|  5.8|     24.2|  8.6|
| 12|214.7| 24.0|      4.0| 17.4|
| 13| 23.8| 35.1|     65.9|  9.2|
| 14| 97.5|  7.6|      7.2|  9.7|
| 15|204.1| 32.9|     46.0| 19.0|
| 16|195.4| 47.7|     52.9| 22.4|
| 17| 67.8| 36.6|    114.0| 12.5|
| 18|281.4| 39.6|     55.8| 24.4|
| 19| 69.2| 20.5|     18.3| 11.3|
| 20|147.3| 23.9|     19.1| 14.6|
+---+-----+-----+---------+-----+
only showing top 20 rows



In [27]:
# Same Advertising file, read through the csv() shortcut with a header row and
# inferred column types.
advertising_csv = '/home/ubuntu/pyspark/data/Advertising.csv'
ds = spark.read.csv(advertising_csv, header=True, inferSchema=True)

In [29]:
# Print only the first five rows as a text table.
ds.show(5)

+---+-----+-----+---------+-----+
|_c0|   TV|radio|newspaper|sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows



In [31]:
# Return the first five rows to the driver as a list of Row objects.
ds.head(5)

[Row(_c0=1, TV=230.1, radio=37.8, newspaper=69.2, sales=22.1),
 Row(_c0=2, TV=44.5, radio=39.3, newspaper=45.1, sales=10.4),
 Row(_c0=3, TV=17.2, radio=45.9, newspaper=69.3, sales=9.3),
 Row(_c0=4, TV=151.5, radio=41.3, newspaper=58.5, sales=18.5),
 Row(_c0=5, TV=180.8, radio=10.8, newspaper=58.4, sales=12.9)]

In [39]:
# Fetch the first value of the first row. first() retrieves a single Row,
# whereas collect() would pull every row to the driver just to index one.
ds.select('_c0', 'TV').first()[0]

1

In [4]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.18.1-cp36-cp36m-manylinux1_x86_64.whl (20.1 MB)
[K     |████████████████████████████████| 20.1 MB 2.9 MB/s eta 0:00:01    |█▍                              | 860 kB 2.9 MB/s eta 0:00:07     |████████▍                       | 5.3 MB 2.9 MB/s eta 0:00:06     |███████████                     | 7.0 MB 2.9 MB/s eta 0:00:05     |█████████████████████▋          | 13.6 MB 2.9 MB/s eta 0:00:03
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.18.1


In [5]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.0.1-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 2.8 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.2
  Downloading pytz-2019.3-py2.py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 53.9 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.0.1 pytz-2019.3
