# 4.2 用 Pyspark 建立第一個RDD

In [None]:
from __future__ import print_function, division

## import pyspark

In [None]:
import glob
import os
import sys

# Make the PySpark sources shipped inside the Spark installation importable.
# A KeyError on the next line means SPARK_HOME is not set in the environment.
spark_home = os.environ['SPARK_HOME']
spark_python = os.path.join(spark_home, 'python')
sys.path.insert(0, spark_python)

# The py4j archive name embeds its version, which differs between Spark
# releases, so look it up instead of hard-coding a single version.
for py4j_zip in sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

## 啟動 spark

In [None]:
# Get (or reuse) a local-mode session named "test" with Hive support enabled.
spark = (
    SparkSession.builder
    .master("local")
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the session's SparkContext, used below for the RDD API.
sc = spark.sparkContext

## Part1. Create a RDD from sparkContext

In [None]:
# Build an RDD from a local Python list, split across 4 partitions.
wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']
wordsRDD = sc.parallelize(wordsList, numSlices=4)
# Show which class parallelize() returned.
print(type(wordsRDD))


In [None]:
wordsRDD.take(3)

In [None]:
wordsRDD.collect()

## Part2 Create a DataFrame from HDFS

## put data into HDFS

In [None]:
!ls ../data

In [None]:
!head ../data/NASA_access_log_Jul95_100

In [None]:
!hadoop fs -ls /

In [None]:
!hadoop fs -put ../data/NASA_access_log_Jul95_100 /tmp

In [None]:
!hadoop fs -ls /tmp

### 從 HDFS 中讀取資料

In [None]:
textFromHDFS = spark.read.text("hdfs:///tmp/NASA_access_log_Jul95_100")

In [None]:
print(type(textFromHDFS))

In [None]:
textFromHDFS.head()

## Part3 Read csv format

In [None]:
!hadoop fs -put ../data/ratings.csv /tmp

In [None]:
!hadoop fs -tail /tmp/ratings.csv

In [None]:
# Reader options for the ratings CSV that was uploaded to HDFS above.
path = "hdfs:///tmp/ratings.csv"
header = True        # treat the first line as column names
schema = sep = None  # leave both unset so the reader falls back to its defaults

In [None]:
# Load the CSV into a DataFrame using the options defined in the previous cell.
csvDF = spark.read.csv(path, schema=schema, sep=sep, header=header)

In [None]:
print(type(csvDF))

In [None]:
print(csvDF)

In [None]:
csvDF.head()

### compare with read.text

In [None]:
# Read the same file as plain text lines, for comparison with the CSV reader.
textDF = spark.read.text(path)

In [None]:
textDF

In [None]:
textDF.head()

## Part4. Read JSON file

In [None]:
!hadoop fs -put ../data/json_example.json /tmp

In [None]:
jsonDF = spark.read.json('hdfs:///tmp/json_example.json')

In [None]:
jsonDF

In [None]:
jsonDF.head()

## Part5. RDD 與 DataFrame 的轉換

In [None]:
jsonRDD = jsonDF.rdd

In [None]:
print(type(jsonDF))
print(type(jsonRDD))

In [None]:
# RDDs have no head() method (that belongs to the DataFrame API);
# first() returns the first element of the RDD.
jsonRDD.first()

In [None]:
jsonRDD.take(1)

In [None]:
jsonDF2 = spark.createDataFrame(jsonRDD)

In [None]:
type(jsonDF2)

In [None]:
jsonDF2.head()

In [None]:
# Shut down the SparkContext, then the SparkSession it was obtained from.
sc.stop()
spark.stop()