<a href="https://colab.research.google.com/github/Dev-Parmar17/ADF/blob/main/PYSPARK_PRACTICE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window


In [4]:
spark = SparkSession.builder.appName("SparkByExamples.com").getOrCreate()



In [3]:
df = spark.read.format('json')\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load('transactions.json')

NameError: name 'spark' is not defined

In [None]:
df.show()

+---------------+------+--------+-------+------+
|_corrupt_record|amount|customer| status|txn_id|
+---------------+------+--------+-------+------+
|           NULL|   500|     101|success|     1|
|           NULL|   200|     101| failed|     2|
|           NULL|   800|     102|success|     3|
+---------------+------+--------+-------+------+



In [None]:
df.drop(col('_corrupt_record'))

DataFrame[amount: bigint, customer: bigint, status: string, txn_id: bigint]

In [None]:
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- amount: long (nullable = true)
 |-- customer: long (nullable = true)
 |-- status: string (nullable = true)
 |-- txn_id: long (nullable = true)



For each customer:

total amount

successful amount

failed count

Use groupBy + ag

In [None]:
grouped_df = df.groupBy("customer") \
              .agg(\
                  sum('amount').alias('total amount'),\
                  sum(when(col("status") == "success", col("amount")).otherwise(0)).alias('successful amount'),\
                  sum(when(col("status") == "failed", 1).otherwise(0)).alias('failed count')\
              )

In [None]:
grouped_df.show()

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [None]:
df.fillna(0, subset=['amount']).show()

+------+--------+-------+------+
|amount|customer| status|txn_id|
+------+--------+-------+------+
|   500|     101|success|     1|
|   200|     101| failed|     2|
|   800|     102|success|     3|
+------+--------+-------+------+



In [None]:
df = df.drop(col('_corrupt_record'))

In [None]:
grouped_df.show()

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [None]:
grouped_df.write.mode("overwrite").parquet("grouped_df.parquet")


In [None]:
spark.read.parquet("grouped_df.parquet").show()

+--------+------------+-----------------+------------+
|customer|total amount|successful amount|failed count|
+--------+------------+-----------------+------------+
|     101|         700|              500|           1|
|     102|         800|              800|           0|
+--------+------------+-----------------+------------+



In [None]:
df.createOrReplaceTempView("df_view")
spark.sql("select * from df_view").show()

+------+--------+-------+------+
|amount|customer| status|txn_id|
+------+--------+-------+------+
|   500|     101|success|     1|
|   200|     101| failed|     2|
|   800|     102|success|     3|
+------+--------+-------+------+



In [None]:
spark.sql('''select customer
          from df_view
          where status = 'failed'
          group by customer
          having count(*) > 3''').show()

+--------+
|customer|
+--------+
+--------+



In [None]:
spark.sql('''select max(amount) as second_high_salary
          from df_view
          where amount < (select max(amount)
          from df_view)''').show()

+------------------+
|second_high_salary|
+------------------+
|               500|
+------------------+



In [None]:
error_count = 0
with open('log.txt', 'r') as f:
    for line in f:
        if "ERROR" in line:
            error_count += 1
print(f"Number of lines containing 'ERROR': {error_count}")

Number of lines containing 'ERROR': 4


In [None]:
# Create a file with specific parameters
with open(
    'my_example_file.txt',
    mode='w+',
    encoding='utf-8',
    errors='replace',
    newline='',
    buffering=1
) as f:
    f.write('Hello, world!\n')
    f.write('This is another line.\n')
    f.write('Here are some special characters: \u03B1\u03B2\u03B3\n') # Greek letters

    # Seek to the beginning of the file to read
    f.seek(0)
    content = f.read()
    print("File content:\n" + content)

# Example of print() with 'sep' and 'end'
print("These", "are", "separated", "by", "-", sep='-', end='!\n')
print("This is on the same line because of the previous 'end' parameter.")

File content:
Hello, world!
This is another line.
Here are some special characters: αβγ

These-are-separated-by--!
This is on the same line because of the previous 'end' parameter.


In [None]:
import pandas as pd
import io

# Read the string data into a pandas DataFrame
df_from_string = pd.read_csv(io.StringIO(data_string))

# Save the DataFrame to a CSV file
df_from_string.to_csv('transactions_new.csv', index=False)

print("Data saved to 'transactions_new.csv'")

In [None]:
# Display the first few rows of the new DataFrame to confirm
display(df_from_string.head())

In [15]:
df = spark.read.format('csv')\
     .option("inferSchema", "true")\
     .option("header", "true")\
     .load('/content/sample_data/transactions_20rows.csv')

In [14]:
df.show()

+------+-----------+------+-------+------------+-------------------+
|txn_id|customer_id|amount| status|payment_mode|          timestamp|
+------+-----------+------+-------+------------+-------------------+
|     1|        101|   500|success|         UPI|2025-01-12 10:23:11|
|     2|        101|  1200| failed|        Card|2025-01-12 10:24:55|
|     3|        102|   800|success|         UPI|2025-01-12 10:28:01|
|     4|        103|  1500| failed|         UPI|2025-01-12 10:30:11|
|     5|        102|   300|success|      Wallet|2025-01-13 11:10:22|
|     6|        101|   500|success|         UPI|2025-01-13 11:12:01|
|     7|        104|  2500| failed|        Card|2025-01-13 12:44:19|
|     8|        105|   900|success|         UPI|2025-01-14 09:01:41|
|     9|        105|   900|success|         UPI|2025-01-14 09:01:41|
|    10|        103|   600|success|      Wallet|2025-01-14 09:22:01|
|    11|        104|  2500| failed|        Card|2025-01-14 09:23:55|
|    12|        106|   700|success