## RDD 1.3 Convert between RDD and DataFrame

In [1]:
# Run findspark.init() before any pyspark import (pyspark is first imported in the next cell).
# NOTE(review): per the findspark docs this makes the local Spark installation importable — confirm for this environment.
import findspark
findspark.init()

In [2]:
# Create the entry points to Spark: a SparkContext (used for the RDD API below)
# and a SparkSession built on top of it (used for the DataFrame reader below).
#
# When this cell is re-run, stop the SparkContext left over from the previous
# run first. On a fresh kernel `sc` is not defined yet, which raises NameError;
# that is the only error we expect here, so it is the only one we ignore.
# Anything else (e.g. a failure inside stop()) is allowed to surface.
try:
    sc.stop()
except NameError:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext=sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/24 18:43:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/24 18:43:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/24 18:43:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load mtcars.csv into a DataFrame, taking column names from the header line
# and letting Spark infer each column's type from the data.
mtcars = spark.read.csv(
    path='/notebooks/Data/SparkData/mtcars.csv',
    sep=',',
    encoding='UTF-8',
    comment=None,
    header=True,
    inferSchema=True,
)

                                                                                

In [4]:
# DataFrame -> RDD: `.rdd` exposes the DataFrame's records as Row objects; peek at the first two.
mtcars.rdd.take(2)

25/01/24 18:46:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
 Schema: _c0, mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
Expected: _c0 but found: 
CSV file: file:///notebooks/Data/SparkData/mtcars.csv
                                                                                

[Row(_c0='Mazda RX4', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.62, qsec=16.46, vs=0, am=1, gear=4, carb=4),
 Row(_c0='Mazda RX4 Wag', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.875, qsec=17.02, vs=0, am=1, gear=4, carb=4)]

In [5]:
# Reduce every Row to a (model name, mpg) pair. The model name lives in '_c0'
# because the first column of the CSV header is empty.
mtcars_map = mtcars.rdd.map(lambda row: (row['_c0'], row['mpg']))
mtcars_map.take(5)

25/01/24 18:47:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
 Schema: _c0, mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
Expected: _c0 but found: 
CSV file: file:///notebooks/Data/SparkData/mtcars.csv


[('Mazda RX4', 21.0),
 ('Mazda RX4 Wag', 21.0),
 ('Datsun 710', 22.8),
 ('Hornet 4 Drive', 21.4),
 ('Hornet Sportabout', 18.7)]

In [7]:
# mapValues() rewrites only the value of each (key, value) pair and keeps the key:
# each mpg becomes the list [mpg, mpg * 10] while the model name is left as is.
mtcars_mapvalues = mtcars_map.mapValues(lambda mpg: [mpg, mpg * 10])
mtcars_mapvalues.take(5)

25/01/24 18:53:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
 Schema: _c0, mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
Expected: _c0 but found: 
CSV file: file:///notebooks/Data/SparkData/mtcars.csv


[('Mazda RX4', [21.0, 210.0]),
 ('Mazda RX4 Wag', [21.0, 210.0]),
 ('Datsun 710', [22.8, 228.0]),
 ('Hornet 4 Drive', [21.4, 214.0]),
 ('Hornet Sportabout', [18.7, 187.0])]

In [8]:
# Read the same CSV as plain text: every element of the RDD is one unparsed line
# of the file, and the header line is included as the first element.
rdd_raw = sc.textFile('/notebooks/Data/SparkData/mtcars.csv')
rdd_raw.take(5)

[',mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

In [9]:
# The column names are on the first line of the file, so fetch just that one
# record with first() rather than splitting and filtering every line of the
# dataset and collecting the matches back to the driver.
header = rdd_raw.first().split(',')
# The first column is unnamed in the CSV header; give it a meaningful name.
header[0] = 'model'
header

['model',
 'mpg',
 'cyl',
 'disp',
 'hp',
 'drat',
 'wt',
 'qsec',
 'vs',
 'am',
 'gear',
 'carb']

In [10]:
# Split every line into its comma-separated fields and drop the header line,
# recognised by 'mpg' in its second field. All values are still strings here.
rdd = (
    rdd_raw
    .map(lambda line: line.split(','))
    .filter(lambda fields: fields[1] != 'mpg')
)
rdd.take(2)

[['Mazda RX4',
  '21',
  '6',
  '160',
  '110',
  '3.9',
  '2.62',
  '16.46',
  '0',
  '1',
  '4',
  '4'],
 ['Mazda RX4 Wag',
  '21',
  '6',
  '160',
  '110',
  '3.9',
  '2.875',
  '17.02',
  '0',
  '1',
  '4',
  '4']]

In [12]:
# Convert RDD elements to RDD Row objects
#
# First we define a function that takes a list of column names and a list of
# values and creates a Row of key-value pairs. Since the keys of a Row object
# are given as variable (keyword argument) names, we can't simply pass a
# dictionary to the Row() function. Instead, we can think of the dictionary as
# an argument list and use ** to unpack it into keyword arguments.
# See the example below.
from pyspark.sql import Row
my_dict = dict(zip('abc', (1, 2, 3)))
Row(**my_dict)

Row(a=1, b=2, c=3)

In [13]:
# Helper that turns a list of names and a list of values into a Row
def list_to_row(keys, values):
    """Build a Row whose field names come from `keys` and whose values come from `values`.

    The two sequences are paired positionally with zip(), so surplus items in
    the longer sequence are ignored. The resulting pairs are passed to Row()
    as keyword arguments.
    """
    return Row(**dict(zip(keys, values)))

In [14]:
# Pair every list of field values with the column names to get an RDD of Row objects.
rdd_rows = rdd.map(lambda values: list_to_row(header, values))
rdd_rows.take(3)

[Row(model='Mazda RX4', mpg='21', cyl='6', disp='160', hp='110', drat='3.9', wt='2.62', qsec='16.46', vs='0', am='1', gear='4', carb='4'),
 Row(model='Mazda RX4 Wag', mpg='21', cyl='6', disp='160', hp='110', drat='3.9', wt='2.875', qsec='17.02', vs='0', am='1', gear='4', carb='4'),
 Row(model='Datsun 710', mpg='22.8', cyl='4', disp='108', hp='93', drat='3.85', wt='2.32', qsec='18.61', vs='1', am='1', gear='4', carb='1')]

In [None]:
# Now we can convert the RDD to a Data