<a href="https://colab.research.google.com/github/Dev-Parmar17/ADF/blob/main/PYSPARK_PRACTICE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window


In [4]:
# Create the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SparkByExamples.com")
    .getOrCreate()
)



In [8]:
# Load the transactions dataset from JSON; the JSON source infers the schema
# from the data on its own.
# No reader options are set: "inferSchema" and "header" belong to the CSV
# source and have no effect on a JSON read.
# NOTE(review): the loaded frame contains a _corrupt_record column, which
# suggests some input lines did not parse as standalone JSON objects. If the
# file is a single multi-line JSON document, .option("multiLine", "true")
# may be what is needed — confirm against the actual file.
df = (
    spark.read
    .format('json')
    .load('transactions.json')
)

In [16]:
# Print the loaded transactions as a text table for a quick visual check.
df.show()

+---------------+------+--------+-------+------+
|_corrupt_record|amount|customer| status|txn_id|
+---------------+------+--------+-------+------+
|           NULL|   500|     101|success|     1|
|           NULL|   200|     101| failed|     2|
|           NULL|   800|     102|success|     3|
+---------------+------+--------+-------+------+



In [17]:
# Preview df without the _corrupt_record column. drop() returns a new
# DataFrame; the result is only displayed here, not assigned, so df itself
# still contains the column afterwards.
df.drop('_corrupt_record')

DataFrame[amount: bigint, customer: bigint, status: string, txn_id: bigint]

In [18]:
# Print df's column names, types and nullability as a tree.
# _corrupt_record is still listed because the earlier drop() result was never
# assigned back to df.
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- amount: long (nullable = true)
 |-- customer: long (nullable = true)
 |-- status: string (nullable = true)
 |-- txn_id: long (nullable = true)



For each customer:

total amount

successful amount

failed count

Use groupBy + agg

In [28]:
# Per-customer summary:
#   - total amount:      sum of every transaction amount
#   - successful amount: sum of amounts whose status is "success" (others count as 0)
#   - failed count:      number of transactions whose status is "failed"
# `sum` here is pyspark.sql.functions.sum (star-imported at the top of the
# notebook), not the Python builtin.
grouped_df = (
    df.groupBy("customer")
    .agg(
        sum('amount').alias('total amount'),
        sum(
            when(col("status") == "success", col("amount")).otherwise(0)
        ).alias('successful amount'),
        sum(
            when(col("status") == "failed", 1).otherwise(0)
        ).alias('failed count'),
    )
)

In [29]:
# Display the per-customer aggregates computed above.
grouped_df.show()

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [35]:
# Show df with null values in `amount` replaced by 0. The filled frame is
# only displayed; df itself is not reassigned.
df.na.fill(0, subset=['amount']).show()

+------+--------+-------+------+
|amount|customer| status|txn_id|
+------+--------+-------+------+
|   500|     101|success|     1|
|   200|     101| failed|     2|
|   800|     102|success|     3|
+------+--------+-------+------+



In [33]:
# Drop the _corrupt_record column and rebind df to the result so that later
# cells see the cleaned frame.
# NOTE(review): judging by the execution counts, some cells above that show
# df without this column were run after this one; on a fresh top-to-bottom
# run their output would still include _corrupt_record.
df = df.drop('_corrupt_record')

In [36]:
# Display the per-customer aggregates again. grouped_df does not reference
# _corrupt_record, so dropping that column from df leaves this output unchanged.
grouped_df.show()

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [37]:
# Persist the per-customer aggregates as Parquet, replacing any output
# already present at this path.
(
    grouped_df.write
    .mode("overwrite")
    .parquet("grouped_df.parquet")
)


In [38]:
# Read the Parquet output back and display it to verify the round trip.
(
    spark.read
    .parquet("grouped_df.parquet")
    .show()
)

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [40]:
# Register df as a temporary view named "df_view" so it can be queried with
# Spark SQL, then select every row as a sanity check of the view.
df.createOrReplaceTempView("df_view")
spark.sql("select * from df_view").show()

+------+--------+-------+------+
|amount|customer| status|txn_id|
+------+--------+-------+------+
|   500|     101|success|     1|
|   200|     101| failed|     2|
|   800|     102|success|     3|
+------+--------+-------+------+



In [43]:
# Customers with more than 3 failed transactions: keep only rows whose status
# is 'failed', count them per customer, and retain the customers whose count
# exceeds 3.
spark.sql('''select customer
          from df_view
          where status = 'failed'
          group by customer
          having count(*) > 3''').show()

+--------+
|customer|
+--------+
+--------+



In [46]:
# Second-highest distinct amount: the largest amount that is strictly below
# the overall maximum. The result is NULL when no amount is below the maximum
# (e.g. every row has the same amount).
# NOTE(review): the alias says "salary" although the column queried is a
# transaction amount — consider renaming the alias.
spark.sql('''select max(amount) as second_high_salary
          from df_view
          where amount < (select max(amount)
          from df_view)''').show()

+------------------+
|second_high_salary|
+------------------+
|               500|
+------------------+



In [68]:
# Count the lines of log.txt that contain the substring "ERROR".
# An explicit counter is used on purpose: the builtin sum() is shadowed by
# pyspark.sql.functions.sum through the star import at the top of the notebook.
error_count = 0
with open('log.txt', 'r') as log_file:
    for entry in log_file:
        if "ERROR" not in entry:
            continue
        error_count += 1
print(f"Number of lines containing 'ERROR': {error_count}")

Number of lines containing 'ERROR': 4


In [64]:
# Create a file with explicit open() parameters, write three lines to it,
# then read the content back through the same handle.
with open(
    'my_example_file.txt',
    mode='w+',         # open for writing and reading; truncates an existing file
    encoding='utf-8',
    errors='replace',  # substitute a replacement marker instead of raising on encoding errors
    newline='',        # disable newline translation on both write and read
    buffering=1        # line-buffered (text mode only)
) as f:
    f.write('Hello, world!\n')
    f.write('This is another line.\n')
    f.write('Here are some special characters: \u03B1\u03B2\u03B3\n') # Greek letters

    # Seek to the beginning of the file to read back what was just written
    f.seek(0)
    content = f.read()
    print("File content:\n" + content)

# Example of print() with 'sep' and 'end': the arguments are joined with '-'
# and the line is terminated with '!\n' instead of the default '\n'.
print("These", "are", "separated", "by", "-", sep='-', end='!\n')
# The previous 'end' value still finishes with a newline, so this text starts
# on a new line; the message states that accurately.
print("This is on a new line because the previous 'end' parameter ended with a newline.")

File content:
Hello, world!
This is another line.
Here are some special characters: αβγ

These-are-separated-by--!
This is on the same line because of the previous 'end' parameter.
