In [1]:
# List the CSV files present in the current working directory.
!ls *.csv

marks.csv  marks_without_header.csv  weatherHistory.csv


In [2]:
cat marks_without_header.csv |head -5

1001|MALE|ENGLISH|84|100
1001|MALE|Physics|65|100
1001|MALE|Maths|45|100
1001|MALE|Science|25|100
1001|MALE|History|32|100


## So there is no schema/header defined for the CSV file

## Let's try adding a schema to the DataFrame

#### Method 1: Using an RDD

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain a SparkSession named "define_schema" (getOrCreate reuses an
# existing session when one is already active).
spark = (
    SparkSession.builder
    .appName("define_schema")
    .getOrCreate()
)

# SparkContext handle, needed for the RDD-based approach below.
sc = spark.sparkContext

In [5]:
# Load the header-less file as an RDD; each element is one raw text line.
rdd_in = sc.textFile("marks_without_header.csv")

In [6]:
# Preview the raw lines. take(10) bounds how much data is pulled to the
# driver, whereas collect() would materialise the entire RDD locally.
rdd_in.take(10)

['1001|MALE|ENGLISH|84|100',
 '1001|MALE|Physics|65|100',
 '1001|MALE|Maths|45|100',
 '1001|MALE|Science|25|100',
 '1001|MALE|History|32|100',
 '1002|FEMALE|ENGLISH|84|100',
 '1002|FEMALE|Physics|64|100',
 '1002|FEMALE|Maths|45|100',
 '1002|FEMALE|Science|25|100',
 '1002|FEMALE|History|32|100']

In [8]:
# Break every raw line into its pipe-separated fields (list of strings).
rdd_in_split_on_delimiter = rdd_in.map(lambda line: line.split("|"))

In [9]:
# Preview the split records. take(10) bounds how much data is pulled to the
# driver, whereas collect() would materialise the entire RDD locally.
rdd_in_split_on_delimiter.take(10)

[['1001', 'MALE', 'ENGLISH', '84', '100'],
 ['1001', 'MALE', 'Physics', '65', '100'],
 ['1001', 'MALE', 'Maths', '45', '100'],
 ['1001', 'MALE', 'Science', '25', '100'],
 ['1001', 'MALE', 'History', '32', '100'],
 ['1002', 'FEMALE', 'ENGLISH', '84', '100'],
 ['1002', 'FEMALE', 'Physics', '64', '100'],
 ['1002', 'FEMALE', 'Maths', '45', '100'],
 ['1002', 'FEMALE', 'Science', '25', '100'],
 ['1002', 'FEMALE', 'History', '32', '100']]

In [10]:
# Column names to attach to the DataFrame, in the same order as the
# fields appear in each line of the file.
header_col = ["ROLL_NO", "GENDER", "SUBJECT", "MARKS_OBTAINED", "TOTAL_MARKS"]
print(header_col)

['ROLL_NO', 'GENDER', 'SUBJECT', 'MARKS_OBTAINED', 'TOTAL_MARKS']


In [11]:
# Build a DataFrame from the split RDD, supplying only column names.
# Every field produced by str.split is a string, so no numeric typing
# is applied here.
df_with_header = spark.createDataFrame(
    rdd_in_split_on_delimiter,
    schema=header_col,
)

In [12]:
# Display the DataFrame to confirm the column names were applied.
df_with_header.show()

+-------+------+-------+--------------+-----------+
|ROLL_NO|GENDER|SUBJECT|MARKS_OBTAINED|TOTAL_MARKS|
+-------+------+-------+--------------+-----------+
|   1001|  MALE|ENGLISH|            84|        100|
|   1001|  MALE|Physics|            65|        100|
|   1001|  MALE|  Maths|            45|        100|
|   1001|  MALE|Science|            25|        100|
|   1001|  MALE|History|            32|        100|
|   1002|FEMALE|ENGLISH|            84|        100|
|   1002|FEMALE|Physics|            64|        100|
|   1002|FEMALE|  Maths|            45|        100|
|   1002|FEMALE|Science|            25|        100|
|   1002|FEMALE|History|            32|        100|
+-------+------+-------+--------------+-----------+



## Method 2: By defining StructType

In [13]:
from pyspark.sql.types import *

In [15]:
# Explicit schema for the file: one (name, Spark type) pair per column,
# listed in file order. All columns are declared nullable.
sch = StructType(
    [
        StructField(col_name, col_type, nullable=True)
        for col_name, col_type in (
            ("ROLL_NO", IntegerType()),
            ("GENDER", StringType()),
            ("SUBJECT", StringType()),
            ("MARKS_OBTAINED", IntegerType()),
            ("TOTAL_MARKS", IntegerType()),
        )
    ]
)

In [16]:
# Read the pipe-delimited file directly into a DataFrame, applying the
# explicit schema instead of relying on default string columns.
df_with_schema = spark.read.csv(
    "marks_without_header.csv",
    schema=sch,
    sep="|",
)

In [18]:
# Confirm the column names and types that were applied from `sch`.
df_with_schema.printSchema()

root
 |-- ROLL_NO: integer (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- SUBJECT: string (nullable = true)
 |-- MARKS_OBTAINED: integer (nullable = true)
 |-- TOTAL_MARKS: integer (nullable = true)



In [19]:
# Display the rows read with the explicit schema.
df_with_schema.show()

+-------+------+-------+--------------+-----------+
|ROLL_NO|GENDER|SUBJECT|MARKS_OBTAINED|TOTAL_MARKS|
+-------+------+-------+--------------+-----------+
|   1001|  MALE|ENGLISH|            84|        100|
|   1001|  MALE|Physics|            65|        100|
|   1001|  MALE|  Maths|            45|        100|
|   1001|  MALE|Science|            25|        100|
|   1001|  MALE|History|            32|        100|
|   1002|FEMALE|ENGLISH|            84|        100|
|   1002|FEMALE|Physics|            64|        100|
|   1002|FEMALE|  Maths|            45|        100|
|   1002|FEMALE|Science|            25|        100|
|   1002|FEMALE|History|            32|        100|
+-------+------+-------+--------------+-----------+



In [20]:
# For comparison: the schema of the RDD-based DataFrame from Method 1,
# which was built from column names only (no explicit types).
df_with_header.printSchema()

root
 |-- ROLL_NO: string (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- SUBJECT: string (nullable = true)
 |-- MARKS_OBTAINED: string (nullable = true)
 |-- TOTAL_MARKS: string (nullable = true)

