In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell in this notebook relies on.
spark = (
    SparkSession.builder
    .appName("RDD_TO_DF_AND_DF_TO_RDD")
    .getOrCreate()
)

In [2]:
# Grab the SparkContext behind the session; it is what the notebook uses to build RDDs.
sc = spark.sparkContext

## Creating an RDD and converting it to a DataFrame

In [3]:
# Load marks.csv as an RDD of raw text lines (one element per line, header line included).
rdd_in = sc.textFile(name="marks.csv")

In [4]:
# Preview the raw lines. take(n) caps how many records are shipped to the driver,
# whereas collect() would materialise the entire RDD in driver memory; 20 is just
# a preview size that comfortably covers this small sample file.
rdd_in.take(20)

['ROLL_NO|GENDER|SUBJECT|MARKS_OBTAINED|TOTAL_MARKS',
 '1001|MALE|ENGLISH|84|100',
 '1001|MALE|Physics|65|100',
 '1001|MALE|Maths|45|100',
 '1001|MALE|Science|25|100',
 '1001|MALE|History|32|100',
 '1002|FEMALE|ENGLISH|84|100',
 '1002|FEMALE|Physics|64|100',
 '1002|FEMALE|Maths|45|100',
 '1002|FEMALE|Science|25|100',
 '1002|FEMALE|History|32|100']

In [5]:
# Break every pipe-delimited line into its list of field values.
rdd_in_split_on_delimiter = rdd_in.map(
    lambda record: record.split("|")
)

In [6]:
# Preview the split rows. take(n) bounds how many records reach the driver,
# unlike collect(), which pulls the whole RDD into driver memory; 20 is just
# a preview size that comfortably covers this small sample file.
rdd_in_split_on_delimiter.take(20)

[['ROLL_NO', 'GENDER', 'SUBJECT', 'MARKS_OBTAINED', 'TOTAL_MARKS'],
 ['1001', 'MALE', 'ENGLISH', '84', '100'],
 ['1001', 'MALE', 'Physics', '65', '100'],
 ['1001', 'MALE', 'Maths', '45', '100'],
 ['1001', 'MALE', 'Science', '25', '100'],
 ['1001', 'MALE', 'History', '32', '100'],
 ['1002', 'FEMALE', 'ENGLISH', '84', '100'],
 ['1002', 'FEMALE', 'Physics', '64', '100'],
 ['1002', 'FEMALE', 'Maths', '45', '100'],
 ['1002', 'FEMALE', 'Science', '25', '100'],
 ['1002', 'FEMALE', 'History', '32', '100']]

In [7]:
# The first element of the split RDD is the header row (the column names).
# Keep it so it can be filtered out of the data and reused as the DataFrame schema.
header = rdd_in_split_on_delimiter.first()

In [8]:
# Display the captured header row to confirm the column names.
header

['ROLL_NO', 'GENDER', 'SUBJECT', 'MARKS_OBTAINED', 'TOTAL_MARKS']

In [11]:
# Keep only the data rows: any row whose fields equal the header list is dropped.
rdd_in_split_on_delimiter_header_removed = rdd_in_split_on_delimiter.filter(
    lambda fields: fields != header
)

In [12]:
# Preview the header-free rows. take(n) bounds how many records reach the driver,
# unlike collect(), which pulls the whole RDD into driver memory; 20 is just
# a preview size that comfortably covers this small sample file.
rdd_in_split_on_delimiter_header_removed.take(20)

[['1001', 'MALE', 'ENGLISH', '84', '100'],
 ['1001', 'MALE', 'Physics', '65', '100'],
 ['1001', 'MALE', 'Maths', '45', '100'],
 ['1001', 'MALE', 'Science', '25', '100'],
 ['1001', 'MALE', 'History', '32', '100'],
 ['1002', 'FEMALE', 'ENGLISH', '84', '100'],
 ['1002', 'FEMALE', 'Physics', '64', '100'],
 ['1002', 'FEMALE', 'Maths', '45', '100'],
 ['1002', 'FEMALE', 'Science', '25', '100'],
 ['1002', 'FEMALE', 'History', '32', '100']]

In [13]:
# Turn the header-free rows into a DataFrame, naming the columns after the header fields.
df_from_rdd = spark.createDataFrame(
    data=rdd_in_split_on_delimiter_header_removed,
    schema=header,
)

In [14]:
# Print the schema to check the column names and types.
# NOTE: every column comes out as string because the rows are lists of split text
# fields; cast MARKS_OBTAINED / TOTAL_MARKS before doing any numeric work on them.
df_from_rdd.printSchema()

root
 |-- ROLL_NO: string (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- SUBJECT: string (nullable = true)
 |-- MARKS_OBTAINED: string (nullable = true)
 |-- TOTAL_MARKS: string (nullable = true)



In [15]:
# Render the DataFrame as a table to eyeball the converted rows.
df_from_rdd.show()

+-------+------+-------+--------------+-----------+
|ROLL_NO|GENDER|SUBJECT|MARKS_OBTAINED|TOTAL_MARKS|
+-------+------+-------+--------------+-----------+
|   1001|  MALE|ENGLISH|            84|        100|
|   1001|  MALE|Physics|            65|        100|
|   1001|  MALE|  Maths|            45|        100|
|   1001|  MALE|Science|            25|        100|
|   1001|  MALE|History|            32|        100|
|   1002|FEMALE|ENGLISH|            84|        100|
|   1002|FEMALE|Physics|            64|        100|
|   1002|FEMALE|  Maths|            45|        100|
|   1002|FEMALE|Science|            25|        100|
|   1002|FEMALE|History|            32|        100|
+-------+------+-------+--------------+-----------+



## Converting the DataFrame back to an RDD

In [16]:
# The DataFrame's .rdd attribute exposes its contents as an RDD of Row objects,
# with each field addressable by its column name.
rdd_from_df = df_from_rdd.rdd

In [17]:
# Preview the Row objects. take(n) bounds how many records reach the driver,
# unlike collect(), which pulls the whole RDD into driver memory; 20 is just
# a preview size that comfortably covers this small sample file.
rdd_from_df.take(20)

[Row(ROLL_NO='1001', GENDER='MALE', SUBJECT='ENGLISH', MARKS_OBTAINED='84', TOTAL_MARKS='100'),
 Row(ROLL_NO='1001', GENDER='MALE', SUBJECT='Physics', MARKS_OBTAINED='65', TOTAL_MARKS='100'),
 Row(ROLL_NO='1001', GENDER='MALE', SUBJECT='Maths', MARKS_OBTAINED='45', TOTAL_MARKS='100'),
 Row(ROLL_NO='1001', GENDER='MALE', SUBJECT='Science', MARKS_OBTAINED='25', TOTAL_MARKS='100'),
 Row(ROLL_NO='1001', GENDER='MALE', SUBJECT='History', MARKS_OBTAINED='32', TOTAL_MARKS='100'),
 Row(ROLL_NO='1002', GENDER='FEMALE', SUBJECT='ENGLISH', MARKS_OBTAINED='84', TOTAL_MARKS='100'),
 Row(ROLL_NO='1002', GENDER='FEMALE', SUBJECT='Physics', MARKS_OBTAINED='64', TOTAL_MARKS='100'),
 Row(ROLL_NO='1002', GENDER='FEMALE', SUBJECT='Maths', MARKS_OBTAINED='45', TOTAL_MARKS='100'),
 Row(ROLL_NO='1002', GENDER='FEMALE', SUBJECT='Science', MARKS_OBTAINED='25', TOTAL_MARKS='100'),
 Row(ROLL_NO='1002', GENDER='FEMALE', SUBJECT='History', MARKS_OBTAINED='32', TOTAL_MARKS='100')]