In [1]:
from pyspark.sql.session import SparkSession

# Build (or reuse, if one already exists) a local-mode SparkSession for the
# CSV-reading examples in this notebook.
# NOTE: despite the name `sc`, this object is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .master('local')
    .appName("csvReader")
    .getOrCreate()
)

In [14]:
# Load auto-data.csv, taking the column names from the file's first row.
# No schema is supplied here, so every column is read as a string.
df = sc.read.csv("auto-data.csv", header=True)
df.printSchema()

root
 |-- MAKE: string (nullable = true)
 |-- FUELTYPE: string (nullable = true)
 |-- ASPIRE: string (nullable = true)
 |-- DOORS: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- DRIVE: string (nullable = true)
 |-- CYLINDERS: string (nullable = true)
 |-- HP: string (nullable = true)
 |-- RPM: string (nullable = true)
 |-- MPG-CITY: string (nullable = true)
 |-- MPG-HWY: string (nullable = true)
 |-- PRICE: string (nullable = true)



In [17]:
# Load the header-less CSV. With no header row and no schema, Spark names
# the columns _c0, _c1, ... and reads every field as a string.
df1 = sc.read.csv("auto-data-without-column_Name.csv", header=False)
df1.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [18]:
# Show the Python class of the object that sc.read.csv(...) returned.
print(type(df1))

<class 'pyspark.sql.dataframe.DataFrame'>


In [41]:
# A DataFrame, like an RDD, is an immutable distributed collection of data;
# unlike an RDD, its data is organized into named columns, much like a table
# in a relational database.
#
# toDF returns a new DataFrame with the columns renamed positionally to
# "1" .. "12". It only renames: each column keeps whatever type it has in df1.
# NOTE(review): df1 is assigned in more than one cell of this notebook, so the
# types seen here depend on which assignment ran last.
df2 = df1.toDF(*(str(position) for position in range(1, 13)))

In [42]:
# Print df2's schema to check the column names assigned by toDF.
df2.printSchema()

root
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)
 |-- 3: string (nullable = true)
 |-- 4: string (nullable = true)
 |-- 5: string (nullable = true)
 |-- 6: string (nullable = true)
 |-- 7: string (nullable = true)
 |-- 8: string (nullable = true)
 |-- 9: string (nullable = true)
 |-- 10: integer (nullable = true)
 |-- 11: integer (nullable = true)
 |-- 12: integer (nullable = true)



In [43]:

from pyspark.sql.types import StringType, IntegerType, StructType, StructField

In [44]:


# User-defined schema for the header-less CSV: the first nine columns are
# nullable strings and the last three are nullable integers.
CustomSchema = StructType(
    [
        StructField(column, StringType(), True)
        for column in (
            'MAKE', 'FUELTYPE', 'ASPIRE', 'DOORS', 'BODY',
            'DRIVE', 'CYLINDERS', 'HP', 'RPM',
        )
    ]
    + [
        StructField(column, IntegerType(), True)
        for column in ('MPG-CITY', 'MPG-HWY', 'PRICE')
    ]
)

# Re-read the header-less CSV, this time applying the explicit schema so the
# columns get real names and types instead of _c0.._c11 strings.
# NOTE(review): this rebinds df1, which an earlier cell assigned without a
# schema; cells that use df1 therefore behave differently depending on run order.
df1 = sc.read.schema(CustomSchema).csv("auto-data-without-column_Name.csv")
df1.printSchema()

root
 |-- MAKE: string (nullable = true)
 |-- FUELTYPE: string (nullable = true)
 |-- ASPIRE: string (nullable = true)
 |-- DOORS: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- DRIVE: string (nullable = true)
 |-- CYLINDERS: string (nullable = true)
 |-- HP: string (nullable = true)
 |-- RPM: string (nullable = true)
 |-- MPG-CITY: integer (nullable = true)
 |-- MPG-HWY: integer (nullable = true)
 |-- PRICE: integer (nullable = true)

