# DATA READING

### Data Reading JSON

In [None]:
# Read the drivers JSON file (one JSON record per line, hence multiLine=False).
# The JSON source infers the schema on its own when none is supplied and has no
# header row, so the CSV-only 'inferSchema' and 'header' options were removed.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load('/FileStore/tables/drivers.json')
)

In [None]:
df_json.display()

### Data Reading Utils

In [None]:
# List the entries under the /FileStore/tables/ directory.
dbutils.fs.ls('/FileStore/tables/')

In [None]:
# Load the BigMart sales CSV: the first row is the header and column types are
# inferred from the data.
df = (
    spark.read.format('csv')
    .options(inferSchema=True, header=True)
    .load('/FileStore/tables/BigMart_Sales.csv')
)

In [None]:
df.display()

### Schema Definition

In [None]:
df.printSchema()

### DDL SCHEMA

In [None]:
# DDL-style schema string for BigMart_Sales.csv: one "name TYPE" entry per
# column, comma-separated. It is passed to the reader's .schema() in the next
# cell. Note that Item_Weight is declared as STRING here.
my_ddl_schema = '''
                    Item_Identifier STRING,
                    Item_Weight STRING,
                    Item_Fat_Content STRING,
                    Item_Visibility DOUBLE,
                    Item_Type STRING,
                    Item_MRP DOUBLE,
                    Outlet_Identifier STRING,
                    Outlet_Establishment_Year INT,
                    Outlet_Size STRING,
                    Outlet_Location_Type STRING,
                    Outlet_Type STRING,
                    Item_Outlet_Sales DOUBLE

                '''

In [None]:
# Re-read the sales CSV, applying the DDL schema string instead of inferring
# the column types.
df = spark.read.csv(
    '/FileStore/tables/BigMart_Sales.csv',
    schema=my_ddl_schema,
    header=True,
)

In [None]:
df.display()

In [None]:
df.printSchema()

### StructType() Schema

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit StructType schema for BigMart_Sales.csv, reading every column as a
# nullable string. A user-supplied CSV schema is applied to the file's columns
# by position, so all twelve columns must be listed in file order. 'Item_Type'
# was previously missing, which shifted every later column and left the cells
# that reference col('Item_Type') without a column to resolve.
my_strct_schema = StructType([
    StructField('Item_Identifier', StringType(), True),
    StructField('Item_Weight', StringType(), True),
    StructField('Item_Fat_Content', StringType(), True),
    StructField('Item_Visibility', StringType(), True),
    StructField('Item_Type', StringType(), True),
    StructField('Item_MRP', StringType(), True),
    StructField('Outlet_Identifier', StringType(), True),
    StructField('Outlet_Establishment_Year', StringType(), True),
    StructField('Outlet_Size', StringType(), True),
    StructField('Outlet_Location_Type', StringType(), True),
    StructField('Outlet_Type', StringType(), True),
    StructField('Item_Outlet_Sales', StringType(), True),
])

# Re-read the CSV with the StructType schema; the header row is skipped as data.
df = spark.read.format('csv')\
            .schema(my_strct_schema)\
            .option('header',True)\
            .load('/FileStore/tables/BigMart_Sales.csv')

In [None]:
df.printSchema()

# TRANSFORMATIONS

### SELECT

In [None]:
df.display()

In [None]:
# Project three columns by name.
df.select('Item_Identifier', 'Item_Weight', 'Item_Fat_Content').display()

### ALIAS

In [None]:
# Select Item_Identifier and expose it under the name Item_ID.
df.select(df['Item_Identifier'].alias('Item_ID')).display()

In [None]:
df.display()

### FILTER

#### Scenario - 1

In [None]:
# Keep only the rows whose fat content is exactly 'Regular'.
df.where(col('Item_Fat_Content') == 'Regular').display()

#### Scenario - 2

In [None]:
# Soft drinks whose weight is below 10: both conditions must hold.
(
    df.filter(col('Item_Type') == 'Soft Drinks')
    .filter(col('Item_Weight') < 10)
    .display()
)

#### Scenario - 3

In [None]:
# Rows with a missing outlet size that are located in Tier 1 or Tier 2.
df.where(
    col('Outlet_Size').isNull()
    & col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
).display()

### withColumnRenamed

In [None]:
# Show the data with Item_Weight renamed to Item_Wt (df itself is not reassigned).
df.withColumnRenamed(existing='Item_Weight', new='Item_Wt').display()

### withColumn

#### Scenario - 1

In [None]:
# Add a constant column 'flag' holding the literal "new" on every row.
df = df.withColumn(colName='flag', col=lit("new"))

In [2]:
df.display()

NameError: name 'df' is not defined

In [None]:
# Derive 'multiply' as the product of Item_Weight and Item_MRP (display only).
df.withColumn('multiply', df['Item_Weight'] * df['Item_MRP']).display()

#### Scenario - 2

In [3]:
# Abbreviate the fat-content labels in place: first "Regular" -> "Reg",
# then "Low Fat" -> "Lf" on the result of the first replacement.
df = df.withColumn(
    'Item_Fat_Content',
    regexp_replace(
        regexp_replace(col('Item_Fat_Content'), "Regular", "Reg"),
        "Low Fat",
        "Lf",
    ),
)

df.display()

NameError: name 'df' is not defined

### Type Casting

In [None]:
# Item_Weight is read as a string by the schemas defined earlier, so casting it
# to StringType changed nothing. Cast it to a double instead, so that the type
# actually changes and the sort cells order weights numerically rather than
# lexicographically.
df = df.withColumn('Item_Weight', col('Item_Weight').cast(DoubleType()))

In [None]:
df.printSchema()

### sort

#### Scenario - 1

In [None]:
# Order rows by Item_Weight, largest first.
df.orderBy(desc('Item_Weight')).display()

#### Scenario - 2

In [None]:
# Order rows by Item_Visibility, smallest first.
df.orderBy(asc('Item_Visibility')).display()

#### Scenario - 3

In [None]:
# Sort on two keys, both descending: Item_Weight first, then Item_Visibility.
df.sort(
    'Item_Weight',
    'Item_Visibility',
    ascending=[False, False],
).display()

#### Scenario - 4

In [None]:
# Sort by Item_Weight descending, then Item_Visibility ascending. The column is
# spelled with the schema's casing ('Item_Weight', previously 'Item_weight') so
# the lookup also resolves when spark.sql.caseSensitive is enabled.
df.sort(['Item_Weight', 'Item_Visibility'], ascending=[False, True]).display()

### Limit

In [None]:
# Show at most 10 rows of df.
df.limit(10).display()

### DROP

#### Scenario - 1

In [4]:
# Show df without the Item_Visibility column (df itself is not reassigned).
df.drop('Item_Visibility').display()

NameError: name 'df' is not defined

#### Scenario - 2

In [None]:
# Drop several columns in one call by passing each name as a separate argument.
df.drop('Item_Visibility','Item_Type').display()

### Drop_Duplicates

In [None]:
# Remove rows that are duplicated across all columns.
df.drop_duplicates().display()

#### Scenario - 2

In [None]:
# De-duplicate on Item_Type only, keeping one row per distinct item type.
df.dropDuplicates(['Item_Type']).display()

In [None]:
# Show only the distinct rows of df.
df.distinct().display()

### UNION and UNION BY NAME

#### Preparing DataFrames

In [None]:
# Build two small DataFrames that share the same (id, name) column layout.
schema1 = 'id STRING, name STRING'
data1 = [
    ('1', 'kad'),
    ('2', 'sid'),
]
df1 = spark.createDataFrame(data1, schema1)

schema2 = 'id STRING, name STRING'
data2 = [
    ('3', 'rahul'),
    ('4', 'jas'),
]
df2 = spark.createDataFrame(data2, schema2)



In [None]:
df1.display()

In [None]:
df2.display()

### Union

In [None]:
# Stack the rows of df2 beneath those of df1.
df1.union(df2).display()

In [None]:
# Rebuild df1 with its columns in the opposite order (name, id).
schema1 = 'name STRING, id STRING'
data1 = [('kad', '1'), ('sid', '2')]
df1 = spark.createDataFrame(data=data1, schema=schema1)

df1.display()

In [None]:
# union() pairs columns by position, not by name. df1 is now ordered (name, id)
# while df2 is ordered (id, name), so the values end up under the wrong columns.
df1.union(df2).display()

### Union by Name

In [None]:
# unionByName() lines the two DataFrames up by column name, so the differing
# column order of df1 and df2 does not matter.
df1.unionByName(df2).display()

### String Functions

#### Upper()

In [None]:
# Upper-case Item_Type and expose the result as upper_Item_Type.
df.select(upper(col('Item_Type')).alias('upper_Item_Type')).display()

### Date Functions

#### Current_Date

In [None]:
# Add a 'curr_date' column holding the current date on every row.
df = df.withColumn(colName='curr_date', col=current_date())

df.display()

#### Date_Add()

In [None]:
# 'week_after' is curr_date shifted forward by 7 days.
df = df.withColumn('week_after', date_add(col('curr_date'), 7))

df.display()

#### Date_Sub()

In [None]:
# Preview 'week_before' as curr_date minus 7 days (df is not reassigned here).
df.withColumn('week_before', date_sub(col('curr_date'), 7)).display()

In [None]:
# Same result via date_add with a negative offset; this time the column is kept.
df = df.withColumn('week_before', date_add(col('curr_date'), -7))

df.display()

### DateDIFF

In [None]:
# 'datediff' is the number of days from curr_date to week_after.
df = df.withColumn('datediff', datediff(col('week_after'), col('curr_date')))

df.display()

### Date_Format()

In [None]:
# Overwrite 'week_before' with its value rendered using the dd-MM-yyyy pattern.
df = df.withColumn('week_before', date_format(col('week_before'), 'dd-MM-yyyy'))

df.display()