In [0]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the SparkSession used by every cell below,
# with Hive support enabled on the builder.
spark = (
    SparkSession.builder
    .appName("Creating Dataframe")
    .enableHiveSupport()
    .getOrCreate()
)

In [0]:
# Field names for the sample customer tuples, listed in tuple order.
columns = [
    "customer_id",
    "name",
    "city",
    "registration_date",
    "is_active",
]

# Three hand-written customer records; registration_date is kept as a plain
# string. The same two names are assigned identical values again in the
# createDataFrame cell further down.
data = [
    (1, "Alice", "Mumbai", "2023-01-15", True),
    (2, "Bob", "Delhi", "2023-03-25", False),
    (3, "Charlie", "Chennai", "2023-05-10", True),
]



In [0]:
# Load the customers CSV, treating its first line as the header row.
# No schema is supplied and schema inference is not requested here.
df_csv = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/FileStore/tables/customers.csv")
)

In [0]:
# Preview the CSV-backed DataFrame (show()'s default row limit and truncation, spelled out).
df_csv.show(n=20, truncate=True)

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|West Bengal|  India|       2023-10-10|     True|
|          1| Customer_1|Bangalore|    Gujarat|  India|       2023-10-19|    False|
|          2| Customer_2|Bangalore|  Karnataka|  India|       2023-02-10|     True|
|          3| Customer_3|Bangalore|  Telangana|  India|       2023-03-24|     True|
|          4| Customer_4|Hyderabad|  Telangana|  India|       2023-06-04|    False|
|          5| Customer_5|Hyderabad|West Bengal|  India|       2023-07-26|     True|
|          6| Customer_6|Hyderabad|  Karnataka|  India|       2023-08-07|    False|
|          7| Customer_7|Bangalore|  Telangana|  India|       2023-08-25|     True|
|          8| Customer_8|Bangalore|Maharashtra|  India|       2023-07-13|   

In [0]:
# Spark SQL
# Run a `show tables` statement and print the resulting listing.
spark.sql("show tables").show(n=20, truncate=True)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|customers|      false|
+--------+---------+-----------+



In [0]:
# Select only the active customers from the `customers` table via Spark SQL.
# NOTE(review): no cell visible in this notebook creates the `customers` table;
# presumably it was saved by an earlier or since-removed cell. Confirm it exists
# before relying on Restart & Run All.
df_sql = spark.sql(
    "select * from customers where is_active=True"
)
df_sql.show(n=20, truncate=True)

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          3|Charlie|Chennai|       2023-05-10|     true|
|          1|  Alice| Mumbai|       2023-01-15|     true|
+-----------+-------+-------+-----------------+---------+



In [0]:
# Spark table
# Get the `customers` table as a DataFrame by name (no SQL text) and print it.
df_table = spark.table(
    "customers"
)
df_table.show(n=20, truncate=True)

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          3|Charlie|Chennai|       2023-05-10|     true|
|          1|  Alice| Mumbai|       2023-01-15|     true|
|          2|    Bob|  Delhi|       2023-03-25|    false|
+-----------+-------+-------+-----------------+---------+



In [0]:
# Spark range
# Single-column (`id`) DataFrame holding the integers 0 through 9; the end bound is exclusive.
df_range = spark.range(start=0, end=10)
df_range.show(n=20, truncate=True)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [0]:
# using create dataframe
# Build a DataFrame from an in-memory list of tuples plus a list of column names.

# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "customer_id",
    "name",
    "city",
    "registration_date",
    "is_active",
]

# Sample customer rows; registration_date stays a plain string.
data = [
    (1, "Alice", "Mumbai", "2023-01-15", True),
    (2, "Bob", "Delhi", "2023-03-25", False),
    (3, "Charlie", "Chennai", "2023-05-10", True),
]

# Only names are given as the schema, so column types come from the Python values.
df_list = spark.createDataFrame(data, schema=columns)
df_list.show(n=20, truncate=True)

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          1|  Alice| Mumbai|       2023-01-15|     true|
|          2|    Bob|  Delhi|       2023-03-25|    false|
|          3|Charlie|Chennai|       2023-05-10|     true|
+-----------+-------+-------+-----------------+---------+



In [0]:
# RDD ----> DF
# Step 1: distribute a small local list of (id, name) tuples as an RDD,
# then pull two elements back to the driver as the cell's output.
rdd = spark.sparkContext.parallelize(
    [
        (1, "Alice"),
        (2, "Bob"),
    ]
)
rdd.take(2)

Out[17]: [(1, 'Alice'), (2, 'Bob')]

In [0]:
# Step 2: convert the RDD of tuples to a DataFrame, naming the two columns.
df_rdd = rdd.toDF(
    ["customer_id", "name"]
)

In [0]:
# Print the RDD-derived DataFrame (show()'s default row limit and truncation, spelled out).
df_rdd.show(n=20, truncate=True)

+-----------+-----+
|customer_id| name|
+-----------+-----+
|          1|Alice|
|          2|  Bob|
+-----------+-----+

