### Data Reading





In [0]:
# Load the BigMart sales CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('/Volumes/workspace/first_schema/big_data/BigMart Sales.csv')
)


In [0]:
# Render the freshly loaded DataFrame in the notebook output.
df.display()

### Data Reading JSON

In [0]:
# Read the drivers file as line-delimited JSON (multiLine=False: each line
# holds one record). `header` and `inferSchema` are CSV reader options that
# the JSON reader does not use, so they are not passed here.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/Volumes/workspace/first_schema/big_data/drivers.json')
)


In [0]:
# Render the JSON-backed DataFrame in the notebook output.
df_json.display()

### Schema Definition


In [0]:
# Print the column names and types of df (at this point, the schema Spark
# inferred from the CSV).
df.printSchema()

### DDL Schema

In [0]:
# Explicit schema in DDL form: one "name TYPE" pair per CSV column.
# Item_Weight is a numeric measure that is compared (< 10), multiplied and
# sorted later in the notebook, so it is declared DOUBLE rather than STRING;
# a STRING column would rely on implicit string-to-number coercion there.
my_ddl_schema = '''
                   Item_Identifier STRING,
                   Item_Weight DOUBLE,
                   Item_Fat_Content STRING,
                   Item_Visibility DOUBLE,
                   Item_Type STRING,
                   Item_MRP DOUBLE,
                   Outlet_Identifier STRING,
                   Outlet_Establishment_Year INTEGER,
                   Outlet_Size STRING,
                   Outlet_Location_Type STRING,
                   Outlet_Type STRING,
                   Item_Outlet_Sales DOUBLE
                '''

In [0]:
# Re-read the same CSV, this time applying the hand-written DDL schema
# instead of letting Spark infer the column types.
df = (
    spark.read
    .schema(my_ddl_schema)
    .options(header=True)
    .csv('/Volumes/workspace/first_schema/big_data/BigMart Sales.csv')
)

In [0]:
# Render the DataFrame that was read with the explicit DDL schema.
df.display()

In [0]:
# Print the schema to confirm the DDL-declared column types were applied.
df.printSchema()

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Print the current schema of df again as a reference for the steps below.
df.printSchema()

### SELECT

In [0]:
# Project three columns, referenced by their string names.
df.select(
    ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content']
).display()

In [0]:
# Same projection, but built from Column objects via col().
df.select(
    *(col(name) for name in ('Item_Identifier', 'Item_Weight', 'Item_Fat_Content'))
).display()

### ALIAS

In [0]:
# Select Item_Identifier and expose it under the shorter name Item_ID.
df.select(
    [col('Item_Identifier').alias('Item_ID')]
).display()

### FILTER

### SCENARIO - Fetch records with regular fat content

In [0]:
# Keep only the rows whose fat content is exactly 'Regular'.
df.where(col('Item_Fat_Content') == 'Regular').display()

### SCENARIO - Fetch records with item type 'Soft Drinks' and item weight less than 10

In [0]:
# Soft drinks lighter than 10. Item_Weight may be held as a string column in
# this notebook, so cast it explicitly: the comparison is then numeric and
# does not depend on implicit string-to-number coercion.
df.filter(
    (col('Item_Type') == 'Soft Drinks')
    & (col('Item_Weight').cast('double') < 10)
).display()

### SCENARIO - Fetch the data with tier in (Tier 1 or Tier 2) and outlet size is null

In [0]:
# Rows located in Tier 1 or Tier 2 whose outlet size is missing.
df.where(
    col('Outlet_Location_Type').isin(['Tier 1', 'Tier 2'])
    & col('Outlet_Size').isNull()
).display()

### withColumnRenamed

In [0]:
# Show the data with Item_Weight renamed to Item_WT (df itself is unchanged).
df.withColumnRenamed(existing='Item_Weight', new='Item_WT').display()

### withColumn


### Creating new column

In [0]:
# Add a constant column 'flag' holding the literal 'new' and keep the result.
df = df.withColumn(colName='flag', col=lit('new'))

In [0]:
# Render df, which now includes the constant 'flag' column.
df.display()

In [0]:
# Show a derived column 'multiply' = Item_Weight * Item_MRP (not saved to df).
df.withColumn(
    colName='multiply',
    col=col('Item_Weight') * col('Item_MRP'),
).display()

### Modifying existing column values

In [0]:
# Overwrite Item_Fat_Content in two passes: 'Regular' -> 'Reg', then
# 'Low Fat' -> 'Lf'. The result is only displayed, not assigned back to df.
(
    df
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Regular', 'Reg'))
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Low Fat', 'Lf'))
    .display()
)

### Type Casting

In [0]:
# Replace Item_Weight with its string-typed version and keep the result in df.
df = df.withColumn('Item_Weight', col('Item_Weight').cast('string'))

In [0]:
# Print the schema to confirm Item_Weight's type after the cast.
df.printSchema()

### sort data

### Scenario 1

In [0]:
# Heaviest items first. Item_Weight is a string column at this point (it was
# cast in the Type Casting step), and strings sort lexicographically
# ('9.3' would rank above '21.35'), so sort on a numeric cast instead.
df.sort(col('Item_Weight').cast('double').desc()).display()

### Scenario 2

In [0]:
# Order rows by Item_Visibility, smallest value first.
df.orderBy('Item_Visibility', ascending=True).display()

### Scenario 3

In [0]:
# Sort by weight descending, then visibility descending. Item_Weight is a
# string column here (cast in the Type Casting step), so it is cast back to
# double to get numeric rather than lexicographic ordering.
df.sort(
    col('Item_Weight').cast('double').desc(),
    col('Item_Visibility').desc(),
).display()

### Scenario 4

In [0]:
# Sort by weight descending, then visibility ascending. Item_Weight is a
# string column here (cast in the Type Casting step), so it is cast back to
# double to get numeric rather than lexicographic ordering.
df.sort(
    col('Item_Weight').cast('double').desc(),
    col('Item_Visibility').asc(),
).display()

### Limit

In [0]:
# Show at most 10 rows of df.
df.limit(10).display()

#### Dropping columns

### Scenario 1

In [0]:
# Show df without the Item_Visibility column (df itself is not reassigned).
df.drop('Item_Visibility').display()

### Scenario 2

In [0]:
# Show df with both Item_Visibility and Item_Type removed.
df.drop(*('Item_Visibility', 'Item_Type')).display()

### Drop_Duplicates

In [0]:
# Remove rows that are exact duplicates across every column.
df.drop_duplicates().display()

### Scenario 2 - drop duplicates based on the Item_Type column

In [0]:
# Keep a single row per distinct Item_Type value.
# NOTE(review): which row survives for each type is presumably not
# guaranteed without an explicit ordering - confirm if that matters here.
df.dropDuplicates(['Item_Type']).display()

In [0]:
# Show the distinct rows of df (all columns considered).
df.distinct().display()