In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.types import *

In [0]:
# No schema is supplied here, so Spark infers the column types and falls
# back to the positional column names _1 and _2.
data = [
    (1, 'Amal'),
    (2, 'Shritej'),
]
df = spark.createDataFrame(data)
df.show()

+---+-------+
| _1|     _2|
+---+-------+
|  1|   Amal|
|  2|Shritej|
+---+-------+



In [0]:
# Reuse the active Spark session if one exists, otherwise build one named "Example".
spark = SparkSession.builder.appName("Example").getOrCreate()

# Explicit schema: an integer ID and a string Name, both nullable.
schema = StructType([
    StructField(name="ID", dataType=IntegerType(), nullable=True),
    StructField(name="Name", dataType=StringType(), nullable=True),
])

data = [(1, 'Amal'), (2, 'Shritej')]

df = spark.createDataFrame(data, schema=schema)
df.show()

+---+-------+
| ID|   Name|
+---+-------+
|  1|   Amal|
|  2|Shritej|
+---+-------+



In [0]:
# Print the DataFrame schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)



In [0]:
# display() renders the Spark DataFrame directly, so no pandas conversion is
# required. A bare df.toPandas() ahead of this call would collect every row
# to the driver and then discard the result, because a notebook cell only
# shows its last expression.
display(df)

ID,Name
1,Amal
2,Shritej


In [0]:
# Print the built-in help text for the pyspark.sql.types module (lists the available DataType classes).
help(pyspark.sql.types)

Help on module pyspark.sql.types in pyspark.sql:

NAME
    pyspark.sql.types

DESCRIPTION
    # Licensed to the Apache Software Foundation (ASF) under one or more
    # contributor license agreements.  See the NOTICE file distributed with
    # this work for additional information regarding copyright ownership.
    # The ASF licenses this file to You under the Apache License, Version 2.0
    # (the "License"); you may not use this file except in compliance with
    # the License.  You may obtain a copy of the License at
    #
    #    http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

CLASSES
    builtins.object
        DataType
            ArrayType


In [0]:
# Print the class documentation for StructType, including its usage examples.
help(StructType)

Help on class StructType in module pyspark.sql.types:

class StructType(DataType)
 |  StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |  
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql.types import *
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField('f1', StringType(), True)
 |  >>> struct1[0]
 |  StructField('f1', StringType(), True)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct2 = StructType([S

In [0]:
data = [
    (1, 'Nilesh'),
    (2, 'Nita'),
]

# The nullable flag is omitted here, so each field uses StructField's default.
schema1 = StructType(
    [
        StructField(name="num", dataType=IntegerType()),
        StructField(name="First_Name", dataType=StringType()),
    ]
)

df = spark.createDataFrame(data, schema=schema1)
df.show()

+---+----------+
|num|First_Name|
+---+----------+
|  1|    Nilesh|
|  2|      Nita|
+---+----------+



In [0]:
# Same idea as the two-column example, extended with a float and a boolean column.
data = [
    (1, 'Nilesh', 12.5, True),
    (2, 'Nita', 15.0, True),
]

schema1 = StructType(
    [
        StructField(name="num", dataType=IntegerType()),
        StructField(name="First_Name", dataType=StringType()),
        StructField(name="Marks", dataType=FloatType()),
        StructField(name="pass", dataType=BooleanType()),
    ]
)

df = spark.createDataFrame(data, schema=schema1)
df.show()
#display(data)

+---+----------+-----+----+
|num|First_Name|Marks|pass|
+---+----------+-----+----+
|  1|    Nilesh| 12.5|true|
|  2|      Nita| 15.0|true|
+---+----------+-----+----+



In [0]:
# Five records that share no keys with one another.
data = [
    {'a': 1, 'b': 2},
    {'c': 3, 'd': 4},
    {'e': 5, 'f': 6},
    {'g': 7, 'h': 8},
    {'i': 9, 'j': 10},
]

# Every key that appears in any record, in alphabetical order.
columns = sorted({key for record in data for key in record})
# Pad each record so that it carries every column; keys it lacks map to None.
standardized_data = [{**dict.fromkeys(columns), **record} for record in data]

# Each dict key becomes a column; the None padding added during
# standardization is rendered as null in the output.
df = spark.createDataFrame(standardized_data)
df.show()

+----+----+----+----+----+----+----+----+----+----+
|   a|   b|   c|   d|   e|   f|   g|   h|   i|   j|
+----+----+----+----+----+----+----+----+----+----+
|   1|   2|null|null|null|null|null|null|null|null|
|null|null|   3|   4|null|null|null|null|null|null|
|null|null|null|null|   5|   6|null|null|null|null|
|null|null|null|null|null|null|   7|   8|null|null|
|null|null|null|null|null|null|null|null|   9|  10|
+----+----+----+----+----+----+----+----+----+----+



In [0]:
# Print the documentation of the DataFrameReader object returned by spark.read.
help(spark.read)

Help on DataFrameReader in module pyspark.sql.readwriter object:

class DataFrameReader(OptionUtils)
 |  DataFrameReader(spark: 'SparkSession')
 |  
 |  Interface used to load a :class:`DataFrame` from external storage systems
 |  (e.g. file systems, key-value stores, etc). Use :attr:`SparkSession.read`
 |  to access this.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Support Spark Connect.
 |  
 |  Method resolution order:
 |      DataFrameReader
 |      OptionUtils
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, spark: 'SparkSession')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  csv(self, path: Union[str, List[str]], schema: Union[pyspark.sql.types.StructType, str, NoneType] = None, sep: Optional[str] = None, encoding: Optional[str] = None, quote: Optional[str] = None, escape: Optional[str] = None, comment: Optional[str] = None, header: Union[bool, str, NoneType] = None, inferSchema: Union

In [0]:

# DBFS location of the Pokemon CSV file.
data1 = 'dbfs:/FileStore/Pokemon.csv'

# Treat the first line as the header and let Spark infer the column types.
df = spark.read.csv(path=data1, header=True, inferSchema=True)

# tail(5) returns the last five rows; df.show(5) would print the first five instead.
display(df.tail(5))

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True
721,Volcanion,Fire,Water,600,80,110,120,130,90,70,6,True


In [0]:
# Generic reader API: pick the format, set the reader options, then load the path.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("dbfs:/FileStore/Pokemon.csv")
)

df.show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur| Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur| Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur| Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...| Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [0]:
# Three-column schema shared by the small hand-written Pokemon samples.
schema = StructType([
    StructField(name="ID", dataType=IntegerType(), nullable=True),
    StructField(name="Name", dataType=StringType(), nullable=True),
    StructField(name="Type", dataType=StringType(), nullable=True),
])

data = [
    (1, "Pikachu", "Electric"),
    (2, "Charizard", "Fire"),
    (3, "Bulbasaur", "Grass"),
    (4, "Squirtle", "Water"),
]

df = spark.createDataFrame(data, schema=schema)
df.show()

# Write the rows out as CSV with a header line, replacing any earlier output at this path.
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("dbfs:/FileStore/Pokemon1.csv")
)

+---+---------+--------+
| ID|     Name|    Type|
+---+---------+--------+
|  1|  Pikachu|Electric|
|  2|Charizard|    Fire|
|  3|Bulbasaur|   Grass|
|  4| Squirtle|   Water|
+---+---------+--------+



In [0]:
# List the entries under dbfs:/FileStore/ (path, name, size, modification time).
display(dbutils.fs.ls("dbfs:/FileStore/"))


path,name,size,modificationTime
dbfs:/FileStore/Advertising.csv,Advertising.csv,5469,1739858922000
dbfs:/FileStore/FR_category_id.json,FR_category_id.json,7911,1739786498000
dbfs:/FileStore/Pokemon.csv,Pokemon.csv,44028,1739780751000
dbfs:/FileStore/Pokemon1.csv/,Pokemon1.csv/,0,0
dbfs:/FileStore/Pokemon2.csv/,Pokemon2.csv/,0,0
dbfs:/FileStore/iris.parquet,iris.parquet,2448,1739855970000
dbfs:/FileStore/my_data.parquet/,my_data.parquet/,0,0
dbfs:/FileStore/pokedex.json,pokedex.json,1054097,1739786884000
dbfs:/FileStore/table.parquet,table.parquet,4312,1739855997000
dbfs:/FileStore/tables/,tables/,0,0


In [0]:
# Read back the CSV output written to Pokemon1.csv, using its header line
# for column names and inferring the column types.
df_loaded = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/FileStore/Pokemon1.csv")
)
df_loaded.show()


+---+---------+--------+
| ID|     Name|    Type|
+---+---------+--------+
|  1|  Pikachu|Electric|
|  3|Bulbasaur|   Grass|
|  2|Charizard|    Fire|
|  4| Squirtle|   Water|
+---+---------+--------+



In [0]:
# Same three-column layout as the first sample batch.
schema = StructType([
    StructField(name="ID", dataType=IntegerType(), nullable=True),
    StructField(name="Name", dataType=StringType(), nullable=True),
    StructField(name="Type", dataType=StringType(), nullable=True),
])

data = [
    (5, "Raichu", "Electric"),
    (6, "Infernape", "Fire"),
    (7, "Treecko", "Grass"),
    (8, "Totodile", "Water"),
]

df = spark.createDataFrame(data, schema=schema)
df.show()

# "overwrite" replaces whatever already exists at the target path, so this
# cell can be re-run safely. Another available save mode is "ignore".
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("dbfs:/FileStore/Pokemon2.csv")
)
# Shorter form without an explicit save mode:
#df.write.csv("dbfs:/FileStore/Pokemon2.csv",header=True)

+---+---------+--------+
| ID|     Name|    Type|
+---+---------+--------+
|  5|   Raichu|Electric|
|  6|Infernape|    Fire|
|  7|  Treecko|   Grass|
|  8| Totodile|   Water|
+---+---------+--------+



In [0]:
# Read back the CSV output written to Pokemon2.csv, using its header line
# for column names and inferring the column types.
df_loaded = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/FileStore/Pokemon2.csv")
)
df_loaded.show()


+---+---------+--------+
| ID|     Name|    Type|
+---+---------+--------+
|  5|   Raichu|Electric|
|  6|Infernape|    Fire|
|  8| Totodile|   Water|
|  7|  Treecko|   Grass|
+---+---------+--------+



In [0]:
# load() accepts a list of paths, so both CSV outputs are read into a single
# DataFrame; the result is then sorted by ID.
df_loaded = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(["dbfs:/FileStore/Pokemon1.csv", "dbfs:/FileStore/Pokemon2.csv"])
    .orderBy('ID')
)
df_loaded.show()


+---+---------+--------+
| ID|     Name|    Type|
+---+---------+--------+
|  1|  Pikachu|Electric|
|  2|Charizard|    Fire|
|  3|Bulbasaur|   Grass|
|  4| Squirtle|   Water|
|  5|   Raichu|Electric|
|  6|Infernape|    Fire|
|  7|  Treecko|   Grass|
|  8| Totodile|   Water|
+---+---------+--------+



In [0]:
# Load the full Pokemon dataset into df1, which the query cells below use.
data1 = 'dbfs:/FileStore/Pokemon.csv'
df1 = spark.read.csv(path=data1, header=True, inferSchema=True)

# Number of rows in the dataset.
df1.count()

Out[236]: 800

In [0]:
# List the column names; several of them contain spaces or dots.
print(df1.columns)

['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary']


In [0]:
# Keep only the rows whose "Type 1" column equals "Fire".
df1.filter(df1["Type 1"]== "Fire").show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Charizard|  Fire|Flying|  534| 78|    84|     78|    109|     85|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Dragon|  634| 78|   130|    111|    130|     85|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Flying|  634| 78|   104|     78|    159|    115|  100|         1|    false|
| 37|              Vulpix|  Fire|  null|  299| 38|    41|     40|     50|     65|   65|         1|    false|
| 38|           Nin

In [0]:
# Rows matching either condition. Each comparison is parenthesized because
# Python's `|` operator binds more tightly than `==`.
df1.filter((df1["Type 1"]== "Fire") | (df1["Type 2"]== "Water") ).show()

+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  4|          Charmander|  Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon|  Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Charizard|  Fire|Flying|  534| 78|    84|     78|    109|     85|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Dragon|  634| 78|   130|    111|    130|     85|  100|         1|    false|
|  6|CharizardMega Cha...|  Fire|Flying|  634| 78|   104|     78|    159|    115|  100|         1|    false|
| 37|              Vulpix|  Fire|  null|  299| 38|    41|     40|     50|     65|   65|         1|    false|
| 38|           Nin

In [0]:
# Rows matching both conditions. Each comparison is parenthesized because
# Python's `&` operator binds more tightly than `==`.
df1.filter((df1["Type 1"]== "Fire") & (df1["Type 2"]== "Water") ).show()

+---+---------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|     Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+---------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|721|Volcanion|  Fire| Water|  600| 80|   110|    120|    130|     90|   70|         6|     true|
+---+---------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+



In [0]:
# The filter condition can also be given as a SQL expression string.
df1.where("Name = 'Diancie'").show()


+---+-------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|   Name|Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+-------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|719|Diancie|  Rock| Fairy|  600| 50|   100|    150|    100|    150|   50|         6|     true|
+---+-------+------+------+-----+---+------+-------+-------+-------+-----+----------+---------+



In [0]:
# Sort by Name (ascending) and show the first five rows.
df1.orderBy("Name").show(5)


+---+--------------------+-------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name| Type 1|Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|460|           Abomasnow|  Grass|   Ice|  494| 90|    92|     75|     92|     85|   60|         4|    false|
|460|AbomasnowMega Abo...|  Grass|   Ice|  594| 90|   132|    105|    132|    105|   30|         4|    false|
| 63|                Abra|Psychic|  null|  310| 25|    20|     15|    105|     55|   90|         1|    false|
|359|               Absol|   Dark|  null|  465| 65|   130|     60|     75|     60|   75|         3|    false|
|359|     AbsolMega Absol|   Dark|  null|  565| 65|   150|     60|    115|     60|  115|         3|    false|
+---+--------------------+-------+------+-----+---+------+-------+-------+-------+-----+----------+---------+
only showi

In [0]:
# Sort by Total from highest to lowest and show the top five rows.
df1.orderBy(df1.Total.desc()).show(5)


+---+--------------------+-------+--------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name| Type 1|  Type 2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-------+--------+-----+---+------+-------+-------+-------+-----+----------+---------+
|150| MewtwoMega Mewtwo Y|Psychic|    null|  780|106|   150|     70|    194|    120|  140|         1|     true|
|150| MewtwoMega Mewtwo X|Psychic|Fighting|  780|106|   190|    100|    154|    100|  130|         1|     true|
|384|RayquazaMega Rayq...| Dragon|  Flying|  780|105|   180|    100|    180|    100|  115|         3|     true|
|383|GroudonPrimal Gro...| Ground|    Fire|  770|100|   180|    160|    150|     90|   90|         3|     true|
|382| KyogrePrimal Kyogre|  Water|    null|  770|100|   150|     90|    180|    160|   90|         3|     true|
+---+--------------------+-------+--------+-----+---+------+-------+-------+-------+-----+----------+---

In [0]:
# Project only the Name and "Type 1" columns and show the first five rows.
df1.select("Name", "Type 1").show(5)


+--------------------+------+
|                Name|Type 1|
+--------------------+------+
|           Bulbasaur| Grass|
|             Ivysaur| Grass|
|            Venusaur| Grass|
|VenusaurMega Venu...| Grass|
|          Charmander|  Fire|
+--------------------+------+
only showing top 5 rows



In [0]:
# Remove the space from the two type column names so they can be referenced
# without quoting. Each call returns a new DataFrame; df1 is left unchanged.
df2 = df1.withColumnRenamed(existing="Type 1", new="Type1")
df3 = df2.withColumnRenamed(existing="Type 2", new="Type2")
df3.show(n=5)

+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type1| Type2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur|Grass|Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur|Grass|Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur|Grass|Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...|Grass|Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander| Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
only showing top 5 rows



In [0]:
# Show the first ten rows with nulls in Type1/Type2 replaced by "Unknown".
# The result is not assigned, so df3 itself still contains the nulls.
df3.na.fill("Unknown", subset=["Type1","Type2"]).show(10)


+---+--------------------+-----+-------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type1|  Type2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-----+-------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  1|           Bulbasaur|Grass| Poison|  318| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|             Ivysaur|Grass| Poison|  405| 60|    62|     63|     80|     80|   60|         1|    false|
|  3|            Venusaur|Grass| Poison|  525| 80|    82|     83|    100|    100|   80|         1|    false|
|  3|VenusaurMega Venu...|Grass| Poison|  625| 80|   100|    123|    122|    120|   80|         1|    false|
|  4|          Charmander| Fire|Unknown|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon| Fire|Unknown|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Cha

In [0]:
# Register df3 as the temporary view "pokemons" so it can be queried through spark.sql().
df3.createOrReplaceTempView("pokemons")


In [0]:
# Query the "pokemons" temporary view with SQL and show the first five Fire-type rows.
spark.sql("SELECT * FROM pokemons WHERE Type1 = 'Fire'").show(5)

+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  #|                Name|Type1| Type2|Total| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
|  4|          Charmander| Fire|  null|  309| 39|    52|     43|     60|     50|   65|         1|    false|
|  5|          Charmeleon| Fire|  null|  405| 58|    64|     58|     80|     65|   80|         1|    false|
|  6|           Charizard| Fire|Flying|  534| 78|    84|     78|    109|     85|  100|         1|    false|
|  6|CharizardMega Cha...| Fire|Dragon|  634| 78|   130|    111|    130|     85|  100|         1|    false|
|  6|CharizardMega Cha...| Fire|Flying|  634| 78|   104|     78|    159|    115|  100|         1|    false|
+---+--------------------+-----+------+-----+---+------+-------+-------+-------+-----+----------+---------+
only showing top 5 rows



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Build the schema incrementally: each add() call appends one field.
schema1 = (
    StructType()
    .add(field='ID', data_type=IntegerType())
    .add(field='Name', data_type=StringType())
    .add(field='Type', data_type=StringType())
)

# Read both CSV outputs with the explicit schema instead of inferring one,
# then sort the combined rows by ID.
csv_reader = spark.read
df = csv_reader.csv(
    path=["dbfs:/FileStore/Pokemon1.csv", "dbfs:/FileStore/Pokemon2.csv"],
    schema=schema1,
    header=True,
).orderBy('ID')
display(df)

ID,Name,Type
1,Pikachu,Electric
2,Charizard,Fire
3,Bulbasaur,Grass
4,Squirtle,Water
5,Raichu,Electric
6,Infernape,Fire
7,Treecko,Grass
8,Totodile,Water


In [0]:
# Peek at the first 1000 bytes of the JSON file to inspect its structure before loading it.
dbutils.fs.head("dbfs:/FileStore/pokedex.json", 1000)


[Truncated to first 1000 bytes]
Out[250]: '[\n  {\n    "id": 1,\n    "name": {\n      "english": "Bulbasaur",\n      "japanese": "フシギダネ",\n      "chinese": "妙蛙种子",\n      "french": "Bulbizarre"\n    },\n    "type": ["Grass", "Poison"],\n    "base": {\n      "HP": 45,\n      "Attack": 49,\n      "Defense": 49,\n      "Sp. Attack": 65,\n      "Sp. Defense": 65,\n      "Speed": 45\n    },\n    "species": "Seed Pokémon",\n    "description": "Bulbasaur can be seen napping in bright sunlight. There is a seed on its back. By soaking up the sun’s rays, the seed grows progressively larger.",\n    "evolution": { "next": [["2", "Level 16"]] },\n    "profile": {\n      "height": "0.7 m",\n      "weight": "6.9 kg",\n      "egg": ["Monster", "Grass"],\n      "ability": [\n        ["Overgrow", "false"],\n        ["Chlorophyll", "true"]\n      ],\n      "gender": "87.5:12.5"\n    },\n    "image": {\n      "sprite": "https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sp

In [0]:
# The file is a pretty-printed JSON array whose records span many lines,
# so the multiline option is enabled for the reader.
df4 = (
    spark.read
    .format('json')
    .option("multiline", "true")
    .load("dbfs:/FileStore/pokedex.json")
)
# Mark the DataFrame for caching.
df4.cache()
display(df4)


base,description,evolution,id,image,name,profile,species,type
"List(49, 49, 45, 65, 65, 45)","Bulbasaur can be seen napping in bright sunlight. There is a seed on its back. By soaking up the sun’s rays, the seed grows progressively larger.","List(List(List(2, Level 16)), null)",1,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/001.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/001.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/001.png)","List(妙蛙种子, Bulbasaur, Bulbizarre, フシギダネ)","List(List(List(Overgrow, false), List(Chlorophyll, true)), List(Monster, Grass), 87.5:12.5, 0.7 m, 6.9 kg)",Seed Pokémon,"List(Grass, Poison)"
"List(62, 63, 60, 80, 80, 60)","There is a bud on this Pokémon’s back. To support its weight, Ivysaur’s legs and trunk grow thick and strong. If it starts spending more time lying in the sunlight, it’s a sign that the bud will bloom into a large flower soon.","List(List(List(3, Level 32)), List(1, Level 16))",2,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/002.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/002.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/002.png)","List(妙蛙草, Ivysaur, Herbizarre, フシギソウ)","List(List(List(Overgrow, false), List(Chlorophyll, true)), List(Monster, Grass), 87.5:12.5, 1 m, 13 kg)",Seed Pokémon,"List(Grass, Poison)"
"List(82, 83, 80, 100, 100, 80)",There is a large flower on Venusaur’s back. The flower is said to take on vivid colors if it gets plenty of nutrition and sunlight. The flower’s aroma soothes the emotions of people.,"List(null, List(2, Level 32))",3,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/003.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/003.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/003.png)","List(妙蛙花, Venusaur, Florizarre, フシギバナ)","List(List(List(Overgrow, false), List(Chlorophyll, true)), List(Monster, Grass), 87.5:12.5, 2 m, 100 kg)",Seed Pokémon,"List(Grass, Poison)"
"List(52, 43, 39, 60, 50, 65)","The flame that burns at the tip of its tail is an indication of its emotions. The flame wavers when Charmander is enjoying itself. If the Pokémon becomes enraged, the flame burns fiercely.","List(List(List(5, Level 16)), null)",4,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/004.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/004.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/004.png)","List(小火龙, Charmander, Salamèche, ヒトカゲ)","List(List(List(Blaze, false), List(Solar Power, true)), List(Monster, Dragon), 87.5:12.5, 0.6 m, 8.5 kg)",Lizard Pokémon,List(Fire)
"List(64, 58, 58, 80, 65, 80)","Charmeleon mercilessly destroys its foes using its sharp claws. If it encounters a strong foe, it turns aggressive. In this excited state, the flame at the tip of its tail flares with a bluish white color.","List(List(List(6, Level 36)), List(4, Level 16))",5,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/005.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/005.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/005.png)","List(火恐龙, Charmeleon, Reptincel, リザード)","List(List(List(Blaze, false), List(Solar Power, true)), List(Monster, Dragon), 87.5:12.5, 1.1 m, 19 kg)",Flame Pokémon,List(Fire)
"List(84, 78, 78, 109, 85, 100)","Charizard flies around the sky in search of powerful opponents. It breathes fire of such great heat that it melts anything. However, it never turns its fiery breath on any opponent weaker than itself.","List(null, List(5, Level 36))",6,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/006.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/006.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/006.png)","List(喷火龙, Charizard, Dracaufeu, リザードン)","List(List(List(Blaze, false), List(Solar Power, true)), List(Monster, Dragon), 87.5:12.5, 1.7 m, 90.5 kg)",Flame Pokémon,"List(Fire, Flying)"
"List(48, 65, 44, 50, 64, 43)","Squirtle’s shell is not merely used for protection. The shell’s rounded shape and the grooves on its surface help minimize resistance in water, enabling this Pokémon to swim at high speeds.","List(List(List(8, Level 16)), null)",7,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/007.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/007.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/007.png)","List(杰尼龟, Squirtle, Carapuce, ゼニガメ)","List(List(List(Torrent, false), List(Rain Dish, true)), List(Monster, Water 1), 87.5:12.5, 0.5 m, 9 kg)",Tiny Turtle Pokémon,List(Water)
"List(63, 80, 59, 65, 80, 58)","Its tail is large and covered with a rich, thick fur. The tail becomes increasingly deeper in color as Wartortle ages. The scratches on its shell are evidence of this Pokémon’s toughness as a battler.","List(List(List(9, Level 36)), List(7, Level 16))",8,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/008.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/008.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/008.png)","List(卡咪龟, Wartortle, Carabaffe, カメール)","List(List(List(Torrent, false), List(Rain Dish, true)), List(Monster, Water 1), 87.5:12.5, 1 m, 22.5 kg)",Turtle Pokémon,List(Water)
"List(83, 100, 79, 85, 105, 78)",Blastoise has water spouts that protrude from its shell. The water spouts are very accurate. They can shoot bullets of water with enough accuracy to strike empty cans from a distance of over 160 feet.,"List(null, List(8, Level 36))",9,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/009.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/009.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/009.png)","List(水箭龟, Blastoise, Tortank, カメックス)","List(List(List(Torrent, false), List(Rain Dish, true)), List(Monster, Water 1), 87.5:12.5, 1.6 m, 85.5 kg)",Shellfish Pokémon,List(Water)
"List(30, 35, 45, 20, 20, 45)","Its body is soft and weak. In nature, its perpetual fate is to be seen by others as food.","List(List(List(11, Level 7)), null)",10,"List(https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/hires/010.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/sprites/010.png, https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/images/pokedex/thumbnails/010.png)","List(绿毛虫, Caterpie, Chenipan, キャタピー)","List(List(List(Shield Dust, false), List(Run Away, true)), List(Bug), 50:50, 0.3 m, 2.9 kg)",Worm Pokémon,List(Bug)


In [0]:
# Create a table using SQL
#spark.sql("CREATE TABLE one (id INT, Name STRING)")

In [0]:
# Insert values (This is only supported if you are using a Hive-supported warehouse)
#spark.sql("INSERT INTO one VALUES (1, 'John'), (2, 'Alice')")


In [0]:
# Display the rows of table `one`.
# NOTE(review): the CREATE TABLE / INSERT statements for `one` are commented
# out in this notebook, so this query relies on the table already existing.
spark.sql("SELECT * FROM one").show()
# SQL syntax for renaming a column:
#   ALTER TABLE table_name RENAME COLUMN old_column_name TO new_column_name;
# In Spark SQL, wrap column names that contain spaces in backticks (` `).

+------+-----+
|rollno| Name|
+------+-----+
|     1| John|
|     2|Alice|
+------+-----+



In [0]:
#ALTER TABLE one SET TBLPROPERTIES ('delta.columnMapping.mode' = 'name');


In [0]:
%sql
-- Set three Delta table properties on `one`: name-based column mapping plus
-- the minimum reader (2) and writer (5) protocol versions.
-- NOTE(review): presumably a prerequisite for the RENAME COLUMN statement
-- used elsewhere in this notebook; confirm against the Delta column-mapping docs.
ALTER TABLE one SET TBLPROPERTIES (
   'delta.columnMapping.mode' = 'name',
   'delta.minReaderVersion' = '2',
   'delta.minWriterVersion' = '5'
);


In [0]:
#spark.sql("ALTER TABLE one RENAME COLUMN id TO rollno")

In [0]:
# Re-query table `one`; the first column appears under the name rollno.
spark.sql("select * from one").show()

+------+-----+
|rollno| Name|
+------+-----+
|     1| John|
|     2|Alice|
+------+-----+



A **Parquet file** is a **columnar storage format** commonly used in **big data processing**. It is optimized for performance and efficiency, especially when working with **Apache Spark, Hive, and other big data frameworks**.  

---

### **🔹 Key Features of Parquet:**
1. **Columnar Storage** → Data is stored **by columns instead of rows**, making queries on specific columns **faster**.  
2. **Compression** → Uses **Snappy, Gzip, or LZ4 compression** to reduce file size and improve performance.  
3. **Efficient Reads** → Since only **needed columns** are read, queries run much **faster** compared to row-based formats like CSV.  
4. **Schema Evolution** → Supports **adding new columns** without breaking existing data.  
5. **Splittable & Parallel Processing** → Works well in **distributed computing** environments (like Spark, Hadoop).  

---

### **🔹 Parquet vs. Other Formats**
| Format  | Storage Type | Compression | Performance |
|---------|-------------|-------------|-------------|
| **CSV**  | Row-based  | No (unless manually applied) | Slow for large datasets |
| **JSON** | Row-based  | No (unless manually applied) | Slow due to nested structure |
| **Parquet** | Columnar  | Yes (efficiently applied) | Fast for analytical queries |
| **Avro** | Row-based  | Yes | Good for streaming |

---

### **🔹 Creating & Reading Parquet in PySpark**
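**🔹 Writing DataFrame to Parquet:**
```python
# The default save mode raises an error if the target path already exists;
# use df.write.mode("overwrite").parquet(...) to replace existing output.
df.write.parquet("dbfs:/FileStore/my_data.parquet")
```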

**🔹 Reading Parquet File in Spark:**
```python
df = spark.read.parquet("dbfs:/FileStore/my_data.parquet")
df.show()
```

---

### **🔹 When to Use Parquet?**
✅ When working with **large datasets**  
✅ When you need **fast queries** on **specific columns**  
✅ When using **Spark, Hadoop, or other big data frameworks**  
✅ When storage **efficiency & compression** matter  

---



In [0]:
# Load the iris Parquet file; no schema or header options are passed to the reader.
dff = spark.read.parquet("dbfs:/FileStore/iris.parquet")
# head(5) fetches the first five rows for display.
display(dff.head(5))

sepal.length,sepal.width,petal.length,petal.width,variety
5.1,3.5,1.4,0.2,Setosa
4.9,3.0,1.4,0.2,Setosa
4.7,3.2,1.3,0.2,Setosa
4.6,3.1,1.5,0.2,Setosa
5.0,3.6,1.4,0.2,Setosa


In [0]:
# Confirm that the Parquet reader returned a Spark DataFrame.
type(dff)

Out[261]: pyspark.sql.dataframe.DataFrame

In [0]:
#df3.write.parquet("dbfs:/FileStore/my_data.parquet")

In [0]:
# vertical=True prints each row as its own "-RECORD n-" block with one line per column;
# truncate=False prints every cell value in full.
dff.show(truncate=False, vertical=True)

-RECORD 0--------------
 sepal.length | 5.1    
 sepal.width  | 3.5    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 1--------------
 sepal.length | 4.9    
 sepal.width  | 3.0    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 2--------------
 sepal.length | 4.7    
 sepal.width  | 3.2    
 petal.length | 1.3    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 3--------------
 sepal.length | 4.6    
 sepal.width  | 3.1    
 petal.length | 1.5    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 4--------------
 sepal.length | 5.0    
 sepal.width  | 3.6    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 5--------------
 sepal.length | 5.4    
 sepal.width  | 3.9    
 petal.length | 1.7    
 petal.width  | 0.4    
 variety      | Setosa 
-RECORD 6--------------
 sepal.length | 4.6    
 sepal.width  | 3.4    
 petal.length | 1.4    
 petal.width  | 0.3    
 variety      | 

In [0]:
# truncate=True shortens long string values (longer than 20 characters, per the PySpark docs).
# The values in this dataset are short, so the output matches the truncate=False run.
dff.show(truncate=True, vertical=True)

-RECORD 0--------------
 sepal.length | 5.1    
 sepal.width  | 3.5    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 1--------------
 sepal.length | 4.9    
 sepal.width  | 3.0    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 2--------------
 sepal.length | 4.7    
 sepal.width  | 3.2    
 petal.length | 1.3    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 3--------------
 sepal.length | 4.6    
 sepal.width  | 3.1    
 petal.length | 1.5    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 4--------------
 sepal.length | 5.0    
 sepal.width  | 3.6    
 petal.length | 1.4    
 petal.width  | 0.2    
 variety      | Setosa 
-RECORD 5--------------
 sepal.length | 5.4    
 sepal.width  | 3.9    
 petal.length | 1.7    
 petal.width  | 0.4    
 variety      | Setosa 
-RECORD 6--------------
 sepal.length | 4.6    
 sepal.width  | 3.4    
 petal.length | 1.4    
 petal.width  | 0.3    
 variety      | 

In [0]:
# An integer truncate caps displayed strings at that many characters:
# with truncate=4, "Setosa" is shown as "S...".
dff.show(truncate=4, vertical=True)

-RECORD 0------------
 sepal.length | 5.1  
 sepal.width  | 3.5  
 petal.length | 1.4  
 petal.width  | 0.2  
 variety      | S... 
-RECORD 1------------
 sepal.length | 4.9  
 sepal.width  | 3.0  
 petal.length | 1.4  
 petal.width  | 0.2  
 variety      | S... 
-RECORD 2------------
 sepal.length | 4.7  
 sepal.width  | 3.2  
 petal.length | 1.3  
 petal.width  | 0.2  
 variety      | S... 
-RECORD 3------------
 sepal.length | 4.6  
 sepal.width  | 3.1  
 petal.length | 1.5  
 petal.width  | 0.2  
 variety      | S... 
-RECORD 4------------
 sepal.length | 5.0  
 sepal.width  | 3.6  
 petal.length | 1.4  
 petal.width  | 0.2  
 variety      | S... 
-RECORD 5------------
 sepal.length | 5.4  
 sepal.width  | 3.9  
 petal.length | 1.7  
 petal.width  | 0.4  
 variety      | S... 
-RECORD 6------------
 sepal.length | 4.6  
 sepal.width  | 3.4  
 petal.length | 1.4  
 petal.width  | 0.3  
 variety      | S... 
-RECORD 7------------
 sepal.length | 5.0  
 sepal.width  | 3.4  
 petal.len

In [0]:
# vertical=False gives the tabular layout (one row per line);
# truncate=False still prints every cell value in full.
dff.show(truncate=False, vertical=False)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|5.1         |3.5        |1.4         |0.2        |Setosa |
|4.9         |3.0        |1.4         |0.2        |Setosa |
|4.7         |3.2        |1.3         |0.2        |Setosa |
|4.6         |3.1        |1.5         |0.2        |Setosa |
|5.0         |3.6        |1.4         |0.2        |Setosa |
|5.4         |3.9        |1.7         |0.4        |Setosa |
|4.6         |3.4        |1.4         |0.3        |Setosa |
|5.0         |3.4        |1.5         |0.2        |Setosa |
|4.4         |2.9        |1.4         |0.2        |Setosa |
|4.9         |3.1        |1.5         |0.1        |Setosa |
|5.4         |3.7        |1.5         |0.2        |Setosa |
|4.8         |3.4        |1.6         |0.2        |Setosa |
|4.8         |3.0        |1.4         |0.1        |Setosa |
|4.3         |3.0        |1.1         |0

In [0]:
# header=True      -> take the column names from the first line of the CSV.
# inferSchema=True -> let Spark derive each column's data type from the values
#                     it detects in the file, so no schema has to be declared.
Adv_df = spark.read.csv(
    "dbfs:/FileStore/Advertising.csv",
    header=True,
    inferSchema=True,
)
display(Adv_df)

Month,TV,radio,newspaper,sales
Jan-00,230.1,37.8,69.2,22.1
Feb-00,44.5,39.3,45.1,10.4
Mar-00,17.2,45.9,69.3,9.3
Apr-00,151.5,41.3,58.5,18.5
May-00,180.8,10.8,58.4,12.9
Jun-00,8.7,48.9,75.0,7.2
Jul-00,57.5,32.8,23.5,11.8
Aug-00,120.2,19.6,11.6,13.2
Sep-00,8.6,2.1,1.0,4.8
Oct-00,199.8,2.6,21.2,10.6


In [0]:
# Inspect the types chosen by inferSchema: Month is read as a string,
# while TV, radio, newspaper and sales are read as double.
Adv_df.printSchema()

root
 |-- Month: string (nullable = true)
 |-- TV: double (nullable = true)
 |-- radio: double (nullable = true)
 |-- newspaper: double (nullable = true)
 |-- sales: double (nullable = true)



In [0]:
%sql
-- Show every row and column of table `one`.
SELECT *
FROM one

rollno,Name
1,John
2,Alice


PySpark is intended for processing large volumes of data in bulk. Row-level INSERT, UPDATE, and DELETE operations are therefore not preferred here, and all cleaning and pre-processing of the data is expected to be done beforehand.

In [0]:
%sql
-- Create the database that the later `USE ore` cells switch to.
-- IF NOT EXISTS makes this a no-op when the database is already present,
-- so the cell is safe to re-run and the notebook works on a fresh workspace.
CREATE DATABASE IF NOT EXISTS ore

In [0]:
from pyspark.sql.functions import col

# Rename the two columns whose names contain ". " to underscore-separated names.
# NOTE(review): df3 is defined outside this section and is reassigned in place here.
df3 = (
    df3
    .withColumnRenamed("Sp. Atk", "Sp_Atk")
    .withColumnRenamed("Sp. Def", "Sp_Def")
)


In [0]:

# Sanitize every column name: spaces become underscores; commas and parentheses are removed.
sanitized_names = [
    name.replace(" ", "_").replace(",", "").replace("(", "").replace(")", "")
    for name in df3.columns
]
# toDF renames columns by position. Unlike col(name).alias(...), it never parses the old
# name as a column expression, so it also works when a name contains a dot.
df3 = df3.toDF(*sanitized_names)


In [0]:
#[i.replace(' ','_').replace('.','_') for i in df.cols if ' ' or '.' in i]
#df3 = df3.toDF(*[c.replace(" ", "_").replace(",", "").replace("(", "").replace(")", "") for c in df3.columns])

In [0]:
# Register df3 as the temporary view "pokemon_temp" (replacing any existing view of that
# name) so that the %sql cells can select from it.
df3.createOrReplaceTempView("pokemon_temp")


In [0]:
%sql
-- Switch to database `ore`, then materialize the pokemon_temp view as table `pokemon`.
-- Because of IF NOT EXISTS, nothing is written when `pokemon` already exists.
USE ore;
CREATE TABLE IF NOT EXISTS pokemon AS
SELECT * FROM pokemon_temp;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Read back every row of the `pokemon` table in database `ore`.
USE ore;
SELECT *
FROM pokemon;

#,Name,Type1,Type2,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,Speed,Generation,Legendary
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
7,Squirtle,Water,,314,44,48,65,50,64,43,1,False
