In [1]:
import findspark

findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkSession object methods").getOrCreate()
sc = spark.sparkContext

In [3]:
spark

In [4]:
spark.conf.get("spark.app.id")

'local-1696091739737'

In [6]:
spark.conf.get("spark.submit.deployMode")

'client'

In [7]:
spark.conf.get("spark.app.name")

'SparkSession object methods'

In [47]:
spark.conf.isModifiable("spark.scheduler.mode")

False

In [48]:
spark.conf.isModifiable("spark.sql.shuffle.partitions")

True

# 1. createDataFrame()
<hr>

In [5]:
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Union[float, NoneType] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of either :class:`Row`,
    :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the 

## Creating DataFrame from List of dictionaries:-
--------------------------------------------
Dictionaries contain keys and values, and each key becomes the column name, value becomes the value of the column, and datatypes are also automatically inferred from python datatypes. <b> So, passing <i>'schema'</i> explicitly is not necessary.</b>

In [15]:
details = [
    {
    "fname": "Debanjan",
    "lname": "Sarkar",
    "age": 23,
    "degree": "btech",
    "stream": "ECE",
    "dgpa": 8.84
    },
    {
    "fname": "Subhash Chandra",
    "lname": "Bose",
    "age": 120,
    "degree": "llb",
    "stream": "Law",
    "dgpa": 10.00
    }
]

type(details)

list

In [16]:
df1 = spark.createDataFrame( data = details )

In [17]:
df1.show()

+---+------+----+---------------+------+------+
|age|degree|dgpa|          fname| lname|stream|
+---+------+----+---------------+------+------+
| 23| btech|8.84|       Debanjan|Sarkar|   ECE|
|120|   llb|10.0|Subhash Chandra|  Bose|   Law|
+---+------+----+---------------+------+------+



In [48]:
df1.printSchema()

root
 |-- age: long (nullable = true)
 |-- degree: string (nullable = true)
 |-- dgpa: double (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- stream: string (nullable = true)



In [49]:
df1.dtypes

[('age', 'bigint'),
 ('degree', 'string'),
 ('dgpa', 'double'),
 ('fname', 'string'),
 ('lname', 'string'),
 ('stream', 'string')]

In [50]:
df1.schema

StructType([StructField('age', LongType(), True), StructField('degree', StringType(), True), StructField('dgpa', DoubleType(), True), StructField('fname', StringType(), True), StructField('lname', StringType(), True), StructField('stream', StringType(), True)])

## Creating DataFrame from List of tuples:-
--------------------------------------------------
Datatypes are automatically inferred, and we just need to pass the column names as a list of strings in the <b><i>'schema'</i></b> parameter.

In [31]:
data_list = [
    ('Debanjan', 'Sarkar', 23, 'Male'),
    ('Kadombini', 'Devi', 165, 'Female')
]

In [32]:
df_list = spark.createDataFrame( data=data_list )

In [33]:
df_list.show()

+---------+------+---+------+
|       _1|    _2| _3|    _4|
+---------+------+---+------+
| Debanjan|Sarkar| 23|  Male|
|Kadombini|  Devi|165|Female|
+---------+------+---+------+



In [34]:
list_schema = ["fname", "lname", "age", "gender"]

df_list2 = spark.createDataFrame( data = data_list, schema = list_schema )

In [35]:
df_list2.show()

+---------+------+---+------+
|    fname| lname|age|gender|
+---------+------+---+------+
| Debanjan|Sarkar| 23|  Male|
|Kadombini|  Devi|165|Female|
+---------+------+---+------+



## Creating DataFrame from RDD:-
--------------------------------
RDD does not recognises datatypes or schema on its own, and also do not have column names contained.
Thus, in the <b><i>'schema'</i></b> parameter, the schema must be of type <b><i>'StructType()'</i></b>, or must be a <b>string, containing column names and datatypes, separated by comma.</b>

In [6]:
rdd = sc.textFile('data/products.csv').map( lambda x: x.split(",") ).map( lambda x: (x[0], x[1],  x[2], x[4], x[5]) )

rdd.take(10)

[('1',
  '2',
  'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
  '59.98',
  'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy'),
 ('2',
  '2',
  "Under Armour Men's Highlight MC Football Clea",
  '129.99',
  'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat'),
 ('3',
  '2',
  "Under Armour Men's Renegade D Mid Football Cl",
  '89.99',
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 ('4',
  '2',
  "Under Armour Men's Renegade D Mid Football Cl",
  '89.99',
  'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat'),
 ('5',
  '2',
  'Riddell Youth Revolution Speed Custom Footbal',
  '199.99',
  'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet'),
 ('6',
  '2',
  "Jordan Men's VI Retro TD Football Cleat",
  '134.99',
  'http://images.acmesports.sports/Jordan+Men%27s+VI+Retro+TD+Football+Cleat'),
 ('7',
  '2',
  'S

In [7]:
rdd.count()

1345

In [12]:
rdd_small = rdd.sample(False, 0.007)

In [13]:
rdd_small.count()

6

In [14]:
rdd_small.collect()

[('305',
  '38',
  'Mio ALPHA Heart Rate Monitor/Sport Watch',
  '199.0',
  'http://images.acmesports.sports/Mio+ALPHA+Heart+Rate+Monitor%2FSport+Watch'),
 ('346',
  '16',
  'Fitness Gear Pro Half Rack',
  '349.99',
  'http://images.acmesports.sports/Fitness+Gear+Pro+Half+Rack'),
 ('382',
  '18',
  "PUMA Men's evoPOWER 1 Tricks FG Soccer Cleat",
  '189.99',
  'http://images.acmesports.sports/PUMA+Men%27s+evoPOWER+1+Tricks+FG+Soccer+Cleat'),
 ('463',
  '21',
  "Nike Kids' Grade School KD VI Basketball Shoe",
  '99.99',
  'http://images.acmesports.sports/Nike+Kids%27+Grade+School+KD+VI+Basketball+Shoe'),
 ('625',
  '29',
  "Nike Men's Kobe IX Elite Low Basketball Shoe",
  '199.99',
  'http://images.acmesports.sports/Nike+Men%27s+Kobe+IX+Elite+Low+Basketball+Shoe'),
 ('1231',
  '55',
  '"adidas Original Men\'s 2014 MLS All-Star Game "',
  '28.0',
  'http://images.acmesports.sports/adidas+Original+Men%27s+2014+MLS+All-Star+Game+Homestyle...')]

In [10]:
rdd_df1 = spark.createDataFrame( data = rdd_small )

In [11]:
rdd_df1.show(truncate = 50)

+----+---+-----------------------------------------------+------+--------------------------------------------------+
|  _1| _2|                                             _3|    _4|                                                _5|
+----+---+-----------------------------------------------+------+--------------------------------------------------+
| 224| 11|"PowerBlock Classic 50 lb Adjustable Dumbbell "|299.99|http://images.acmesports.sports/PowerBlock+Clas...|
| 261| 12|      ASICS Women's GEL-Cumulus 15 Running Shoe| 89.99|http://images.acmesports.sports/ASICS+Women%27s...|
| 292| 38|         Garmin Women's Forerunner 10 GPS Watch|129.99|http://images.acmesports.sports/Garmin+Women%27...|
| 329| 15|  Under Armour Women's Essential Banded Tank To| 49.99|http://images.acmesports.sports/Under+Armour+Wo...|
| 355| 16|      Nike Men's Free Trainer 5.0 Training Shoe| 99.99|http://images.acmesports.sports/Nike+Men%27s+Fr...|
| 580| 27|"adidas Original Men's 2014 MLS All-Star Game "|  28.0

In [15]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Define the schema using StructType and StructField
# rdd_schema = StructType([
#     StructField("productId", IntegerType(), True),
#     StructField("categoryId", IntegerType(), True),
#     StructField("description", StringType(), True),
#     StructField("price", FloatType(), True),
#     StructField("imageLink", StringType(), True)
# ])

rdd_schema = ["product_id", "category_id", "description", "price", "image_link"]
#rdd_schema = ('productId int, categoryId int, description string, price float, imageLink string')

rdd_df2 = spark.createDataFrame( data = rdd_small, schema = rdd_schema )

In [16]:
rdd_df2.show()

+----------+-----------+--------------------+------+--------------------+
|product_id|category_id|         description| price|          image_link|
+----------+-----------+--------------------+------+--------------------+
|       305|         38|Mio ALPHA Heart R...| 199.0|http://images.acm...|
|       346|         16|Fitness Gear Pro ...|349.99|http://images.acm...|
|       382|         18|PUMA Men's evoPOW...|189.99|http://images.acm...|
|       463|         21|Nike Kids' Grade ...| 99.99|http://images.acm...|
|       625|         29|Nike Men's Kobe I...|199.99|http://images.acm...|
|      1231|         55|"adidas Original ...|  28.0|http://images.acm...|
+----------+-----------+--------------------+------+--------------------+



In [52]:
rdd_df2.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- price: string (nullable = true)
 |-- image_link: string (nullable = true)



## Creating DataFrame from List of <i>'pyspark.sql.Row'</i> objects :-
----------------------------------------------------------------------

In [53]:
from pyspark.sql import Row

In [54]:
row_data = [
    Row(fname='Debanjan', lname='Sarkar', age=23),
    Row(fname='JC', lname='Bose', age=180)
]

In [55]:
type(row_data[0])

pyspark.sql.types.Row

In [56]:
row_df = spark.createDataFrame( data = row_data )

In [57]:
row_df.show()

+--------+------+---+
|   fname| lname|age|
+--------+------+---+
|Debanjan|Sarkar| 23|
|      JC|  Bose|180|
+--------+------+---+



In [58]:
row_df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: long (nullable = true)



In [59]:
row_df.schema

StructType([StructField('fname', StringType(), True), StructField('lname', StringType(), True), StructField('age', LongType(), True)])

<hr><hr>

# <i>'pyspark.sql.Row' object:-
<hr>
<ul>
    <li> <i>'Row'</i> object takes any number of keyword arguments, and the data passed as keyword arguments form key-value paired data. </li>
    <li> The values inside a <i>'Row'</i> object can be accessed using dot(.) notation. </li>
    <li> <i>'Row'</i> object is an iterable, and can be operated by a <b><i>for</i></b> loop to get the values in the object. </li>
</ul>

In [17]:
from pyspark.sql import Row

In [18]:
row_obj = Row( fname='Debanjan', lname = 'Sarkar', age=23, degree='btech', stream='ECE', dgpa=8.84 )

In [19]:
print( row_obj.fname )
print( row_obj.lname )

Debanjan
Sarkar


In [21]:
'age' in row_obj

True

In [22]:
'fname' in row_obj

True

In [23]:
'fullName' in row_obj

False

In [28]:
for i in row_obj:
    print(i)

Debanjan
Sarkar
23
btech
ECE
8.84


# 2. sql()
<hr>
Return the result set of an sql query, as a DataFrame.

In [29]:
help(spark.sql)

Help on method sql in module pyspark.sql.session:

sql(sqlQuery: str, **kwargs: Any) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Returns a :class:`DataFrame` representing the result of the given query.
    When ``kwargs`` is specified, this method formats the given string by using the Python
    standard formatter.
    
    .. versionadded:: 2.0.0
    
    Parameters
    ----------
    sqlQuery : str
        SQL query string.
    kwargs : dict
        Other variables that the user wants to set that can be referenced in the query
    
        .. versionchanged:: 3.3.0
           Added optional argument ``kwargs`` to specify the mapping of variables in the query.
           This feature is experimental and unstable.
    
    Returns
    -------
    :class:`DataFrame`
    
    Examples
    --------
    Executing a SQL query.
    
    >>> spark.sql("SELECT * FROM range(10) where id > 7").show()
    +---+
    | id|
    +---+
    |  8|
    |  9|

In [36]:
data_list = [
    ('Debanjan', 'Sarkar', 23, 'Male'),
    ('Kadombini', 'Devi', 165, 'Female')
]

In [37]:
df_list = spark.createDataFrame( data = data_list, schema = ['fname', 'lname', 'age', 'gender'] )

In [38]:
df_list.show()

+---------+------+---+------+
|    fname| lname|age|gender|
+---------+------+---+------+
| Debanjan|Sarkar| 23|  Male|
|Kadombini|  Devi|165|Female|
+---------+------+---+------+



In [39]:
df_list.createOrReplaceTempView('basicTable')

In [40]:
sql_df = spark.sql("SELECT fname, age FROM basicTable;")

In [41]:
sql_df.show()

+---------+---+
|    fname|age|
+---------+---+
| Debanjan| 23|
|Kadombini|165|
+---------+---+



In [44]:
spark.sql(" SELECT current_timestamp AS Current_date; ").show( truncate = False )

+--------------------------+
|Current_date              |
+--------------------------+
|2023-09-29 19:13:27.858621|
+--------------------------+



# 3. table()
<hr>
Reads a table and returns it as a dataframe. Can be used to read temporary views too.<br>
We just need to pass the table name as a string in the parameter, and no sql query.

In [45]:
spark.table('basicTable').show()

+---------+------+---+------+
|    fname| lname|age|gender|
+---------+------+---+------+
| Debanjan|Sarkar| 23|  Male|
|Kadombini|  Devi|165|Female|
+---------+------+---+------+



# 4. Spark UDFs:-
-----------------------

In [20]:
# Using the df created above using rdd
rdd_df2.show(truncate=50)

+----------+-----------+-----------------------------------------------+------+--------------------------------------------------+
|product_id|category_id|                                    description| price|                                        image_link|
+----------+-----------+-----------------------------------------------+------+--------------------------------------------------+
|       305|         38|       Mio ALPHA Heart Rate Monitor/Sport Watch| 199.0|http://images.acmesports.sports/Mio+ALPHA+Heart...|
|       346|         16|                     Fitness Gear Pro Half Rack|349.99|http://images.acmesports.sports/Fitness+Gear+Pr...|
|       382|         18|   PUMA Men's evoPOWER 1 Tricks FG Soccer Cleat|189.99|http://images.acmesports.sports/PUMA+Men%27s+ev...|
|       463|         21|  Nike Kids' Grade School KD VI Basketball Shoe| 99.99|http://images.acmesports.sports/Nike+Kids%27+Gr...|
|       625|         29|   Nike Men's Kobe IX Elite Low Basketball Shoe|199.99|http

In [24]:
# Creating  UDF that takes a string and returns s String that is the unification of first alphabets of consecutive words in the main sentence
# Ex:- "I am very happy" ---> compress_str() ---> "Iavh"
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf( returnType = StringType() )
def compress_str(main_str: str):
    if(main_str):
        # This will ensure that this logic is not applied if Null is passed.
        comp_str = ""
        for word in main_str.split():
            comp_str += word[0]
        return comp_str
    else:
        return ""

In [29]:
# Using the created UDF on DataFrame column
# ------------------------------------------

from pyspark.sql import functions as F

rdd_df2.select( [ F.col("description"), compress_str(F.col("description")) ] ).show(truncate=70)

+-----------------------------------------------+-------------------------+
|                                    description|compress_str(description)|
+-----------------------------------------------+-------------------------+
|       Mio ALPHA Heart Rate Monitor/Sport Watch|                   MAHRMW|
|                     Fitness Gear Pro Half Rack|                    FGPHR|
|   PUMA Men's evoPOWER 1 Tricks FG Soccer Cleat|                 PMe1TFSC|
|  Nike Kids' Grade School KD VI Basketball Shoe|                 NKGSKVBS|
|   Nike Men's Kobe IX Elite Low Basketball Shoe|                 NMKIELBS|
|"adidas Original Men's 2014 MLS All-Star Game "|                 "OM2MAG"|
+-----------------------------------------------+-------------------------+



## Registering the UDF for Spark SQL:-
--------------------------------------

In [30]:
spark.sql(""" SELECT compressString("I am currently living in Jaipur, Rajasthan."); """)

AnalysisException: Undefined function: compressString. This function is neither a built-in/temporary function, nor a persistent function that is qualified as spark_catalog.default.compressstring.; line 1 pos 8

In [32]:
spark.udf.register("compressString", compress_str )

<function __main__.compress_str(main_str: str)>

In [34]:
spark.sql(""" SELECT compressString("I am currently living in Jaipur, Rajasthan."); """).show()

+-----------------------------------------------------------+
|compressString(I am currently living in Jaipur, Rajasthan.)|
+-----------------------------------------------------------+
|                                                    IacliJR|
+-----------------------------------------------------------+



# 5. <i>'catalog'</i> Module :
<hr>

In [5]:
help( spark.catalog )

Help on Catalog in module pyspark.sql.catalog object:

class Catalog(builtins.object)
 |  Catalog(sparkSession: pyspark.sql.session.SparkSession) -> None
 |  
 |  User-facing catalog API, accessible through `SparkSession.catalog`.
 |  
 |  This is a thin wrapper around its Scala implementation org.apache.spark.sql.catalog.Catalog.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sparkSession: pyspark.sql.session.SparkSession) -> None
 |      Create a new Catalog that wraps the underlying JVM object.
 |  
 |  cacheTable(self, tableName: str) -> None
 |      Caches the specified table in-memory.
 |      
 |      .. versionadded:: 2.0
 |  
 |  clearCache(self) -> None
 |      Removes all cached tables from the in-memory cache.
 |      
 |      .. versionadded:: 2.0
 |  
 |  createExternalTable(self, tableName: str, path: Union[str, NoneType] = None, source: Union[str, NoneType] = None, schema: Union[pyspark.sql.types.StructType, NoneType] = None, **options: str) -> pyspark.sql.dataf

In [10]:
spark.conf.get("spark.sql.sources.default")

'parquet'

### a. Database Functions:-
----------------------------

In [17]:
spark.catalog.currentDatabase()

'default'

In [7]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/C:/Users/Debanjan%20Sarkar/PySpark/prac1/2-complete-PySpark-developer-course/spark-warehouse')]

In [8]:
spark.sql("CREATE DATABASE test_db;")

DataFrame[]

In [9]:
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/C:/Users/Debanjan%20Sarkar/PySpark/prac1/2-complete-PySpark-developer-course/spark-warehouse'),
 Database(name='test_db', description='', locationUri='file:/C:/Users/Debanjan%20Sarkar/PySpark/prac1/2-complete-PySpark-developer-course/spark-warehouse/test_db.db')]

In [16]:
spark.catalog.setCurrentDatabase('test_db')
# spark.catalog.setCurrentDatabase('default')

In [13]:
spark.catalog.currentDatabase()

'test_db'

In [14]:
# Names of all the databases present currently in the session:

databases_present = [ db.name for db in spark.catalog.listDatabases() ]
databases_present

['default', 'test_db']

### b. Table Functions:-
----------------------------

In [18]:
spark.catalog.listTables()

[]

In [20]:
spark.sql(" CREATE TABLE testTbl (id INT, name STRING); ")

AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);
'CreateTable `default`.`testTbl`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists
