<a href="https://colab.research.google.com/github/AsmitaOjha/Internship_ExtensoData_Work/blob/main/Task4_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 4: Transaction Data Analysis with PySpark

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Task4')
    .getOrCreate()
)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the transaction CSV is read from there.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/My Drive/ExtensoInternship/Txn_data.csv'

# Declare the schema instead of using inferSchema=True: inference needs an extra
# full pass over the file, and the column types then depend on whatever the data
# happens to contain. The types below are the ones shown by printSchema().
txn_schema = StructType([
    StructField("_c0", IntegerType(), True),              # unnamed leading column of the CSV
    StructField("Date/Time", TimestampType(), True),
    StructField("From_Account_id", StringType(), True),   # ids are not all numeric (e.g. "615L21")
    StructField("To_Account_id", StringType(), True),
    StructField("amount", DoubleType(), True),
])

# header=True skips the header row; column names come from txn_schema.
df = spark.read.csv(file_path, header=True, schema=txn_schema)

df.show(5)  # Show first 5 rows
df.printSchema()  # Print schema to verify data types


+---+-------------------+---------------+-------------+-------+
|_c0|          Date/Time|From_Account_id|To_Account_id| amount|
+---+-------------------+---------------+-------------+-------+
|  0|2024-06-15 00:00:02|        9410161|      1191872| 1000.0|
|  1|2024-06-15 00:00:02|        9781224|      5590231|10000.0|
|  2|2024-06-15 00:00:02|        4231628|      2677870|  140.0|
|  3|2024-06-15 00:00:05|        1339396|       869200| 1000.0|
|  4|2024-06-15 00:00:07|        3531317|      9053549| 5000.0|
+---+-------------------+---------------+-------------+-------+
only showing top 5 rows

root
 |-- _c0: integer (nullable = true)
 |-- Date/Time: timestamp (nullable = true)
 |-- From_Account_id: string (nullable = true)
 |-- To_Account_id: string (nullable = true)
 |-- amount: double (nullable = true)



In [None]:
# Summary statistics (count, mean, stddev, min, max) for the frame.
# The account-id columns are strings, so their min/max compare as text, not numbers.
(
    df
    .describe()
    .show()
)

+-------+--------------------+-----------------+------------------+------------------+
|summary|                 _c0|  From_Account_id|     To_Account_id|            amount|
+-------+--------------------+-----------------+------------------+------------------+
|  count|            63690622|         63690622|          63690622|          63690622|
|   mean|3.1845250500056524E7|5499325.684672919| 5502083.598508284|1444.1673131794812|
| stddev|1.8385899022516806E7|3123135.499884011|3163919.6020803107| 4281.398229109791|
|    min|                   0|         10000002|          10000002|               1.0|
|    max|            63690561|          9999994|           9999998|       1.1394027E7|
+-------+--------------------+-----------------+------------------+------------------+



In [None]:
# Discard `_c0`; in the preview it looks like a plain row counter (0, 1, 2, ...)
# and none of the analysis below uses it.
df = df.drop("_c0")


In [None]:
# Confirm that `_c0` is gone and the remaining column types are unchanged.
df.printSchema()

root
 |-- Date/Time: timestamp (nullable = true)
 |-- From_Account_id: string (nullable = true)
 |-- To_Account_id: string (nullable = true)
 |-- amount: double (nullable = true)



In [None]:
# Earliest and latest transaction timestamps, computed in one aggregation.
# Spark's min/max are reached through the `F` namespace: importing them by bare
# name would replace Python's built-in min()/max() in every later cell.
time_range = df.agg(
    F.min("Date/Time").alias("Min_DateTime"),
    F.max("Date/Time").alias("Max_DateTime"),
).first()  # a global aggregation yields exactly one row

print(f"Transaction data ranges from {time_range['Min_DateTime']} to {time_range['Max_DateTime']}")


Transaction data ranges from 2024-06-15 00:00:02 to 2025-11-19 09:12:12


In [None]:
from pyspark.sql.functions import month, dayofmonth, hour, dayofweek

# Break the timestamp into calendar parts used by the frequency analysis.
# Per the PySpark docs, dayofweek numbers days 1 (Sunday) through 7 (Saturday).
df_time = (
    df
    .withColumn("Month", month("Date/Time"))
    .withColumn("Day", dayofmonth("Date/Time"))
    .withColumn("Hour", hour("Date/Time"))
    .withColumn("DayOfWeek", dayofweek("Date/Time"))
)


In [None]:
from pyspark.sql.functions import desc, asc

def max_min_transactions(df, col_name):
    """Return the groups of ``col_name`` with the most and the fewest rows.

    Parameters
    ----------
    df : DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Name of the column to group by.

    Returns
    -------
    tuple
        ``(max_row, min_row)``. Each is the first row of the per-group count
        table (fields ``col_name`` and ``count``) when sorted by ``count``
        descending / ascending. Groups with equal counts are ordered by the
        group key, so the answer is repeatable. Both entries are ``None``
        when ``df`` has no rows.
    """
    # Keep the per-group counts around so the group-by over the source data is
    # evaluated once rather than once for each of the two orderings below.
    agg_df = df.groupBy(col_name).count().cache()
    try:
        max_row = agg_df.orderBy(["count", col_name], ascending=[False, True]).first()
        min_row = agg_df.orderBy(["count", col_name], ascending=[True, True]).first()
    finally:
        # Release the cached counts even if one of the actions fails.
        agg_df.unpersist()
    return max_row, min_row

# Busiest and quietest calendar month
max_month, min_month = max_min_transactions(df_time, "Month")
print(
    f"Most transactions in month: {max_month['Month']} with {max_month['count']} transactions",
    f"Least transactions in month: {min_month['Month']} with {min_month['count']} transactions",
    sep="\n",
)

# Busiest and quietest day of the month
max_day, min_day = max_min_transactions(df_time, "Day")
print(
    f"Most transactions on day: {max_day['Day']} with {max_day['count']} transactions",
    f"Least transactions on day: {min_day['Day']} with {min_day['count']} transactions",
    sep="\n",
)

# Busiest and quietest hour of the day
max_hour, min_hour = max_min_transactions(df_time, "Hour")
print(
    f"Most transactions in hour: {max_hour['Hour']} with {max_hour['count']} transactions",
    f"Least transactions in hour: {min_hour['Hour']} with {min_hour['count']} transactions",
    sep="\n",
)

# Busiest and quietest day of the week
max_dow, min_dow = max_min_transactions(df_time, "DayOfWeek")
print(
    f"Most transactions on day of week: {max_dow['DayOfWeek']} with {max_dow['count']} transactions",
    f"Least transactions on day of week: {min_dow['DayOfWeek']} with {min_dow['count']} transactions",
    sep="\n",
)


Most transactions in month: 9 with 10820506 transactions
Least transactions in month: 6 with 4961951 transactions
Most transactions on day: 5 with 2261142 transactions
Least transactions on day: 31 with 942036 transactions
Most transactions in hour: 18 with 5950631 transactions
Least transactions in hour: 4 with 103695 transactions
Most transactions on day of week: 6 with 9267286 transactions
Least transactions on day of week: 7 with 8705421 transactions


In [None]:
# Distinct account ids seen on each side of a transaction.
senders = df.select("From_Account_id").distinct()
receivers = df.select("To_Account_id").distinct()

# Accounts present in both sets: inner join on equal ids.
both = senders.join(
    receivers,
    on=senders["From_Account_id"] == receivers["To_Account_id"],
    how="inner",
)

print(f"Number of accounts appearing both as sender and receiver: {both.count()}")


Number of accounts appearing both as sender and receiver: 2312086


In [None]:
# Stack the sender ids on top of the receiver ids, then drop duplicates so each
# account is counted once regardless of which side(s) it appears on.
all_accounts = (
    senders
    .union(receivers)
    .distinct()
)

# Number of distinct accounts across both sides
total_unique_accounts = all_accounts.count()

print(f"Total unique accounts in dataset: {total_unique_accounts}")

Total unique accounts in dataset: 3111701


In [None]:
# Anti join: keep the sender ids that have no matching receiver id.
only_senders = senders.join(
    receivers,
    on=senders["From_Account_id"] == receivers["To_Account_id"],
    how="left_anti",
)

print(f"Number of accounts appearing only as sender: {only_senders.count()}")


Number of accounts appearing only as sender: 314332


In [None]:
# Anti join: keep the receiver ids that have no matching sender id.
only_receivers = receivers.join(
    senders,
    on=receivers["To_Account_id"] == senders["From_Account_id"],
    how="left_anti",
)

print(f"Number of accounts appearing only as receiver: {only_receivers.count()}")


Number of accounts appearing only as receiver: 485283


In [None]:
# Summary statistics for the transaction amount only.
(
    df
    .describe("amount")
    .show()
)


+-------+------------------+
|summary|            amount|
+-------+------------------+
|  count|          63690622|
|   mean|1444.1673131794812|
| stddev| 4281.398229109791|
|    min|               1.0|
|    max|       1.1394027E7|
+-------+------------------+



Average transaction amount sent per account

In [None]:
from pyspark.sql.functions import avg

# Mean amount per sending account, highest average first.
avg_sent = (
    df
    .groupBy("From_Account_id")
    .agg(avg("amount").alias("Avg_Amount_Sent"))
    .orderBy("Avg_Amount_Sent", ascending=False)
)

avg_sent.show(5)


+---------------+------------------+
|From_Account_id|   Avg_Amount_Sent|
+---------------+------------------+
|              5|3251502.9483333332|
|         615L21|         1500000.0|
|         625L21|         1000000.0|
|         435L21|          800000.0|
|         565L21|          575000.0|
+---------------+------------------+
only showing top 5 rows



Total amount sent per account

In [None]:
from pyspark.sql.functions import sum

# Summed amount per sending account, largest total first.
# `sum` here is the Spark aggregate imported at the top of this cell, not the built-in.
total_sent = (
    df
    .groupBy("From_Account_id")
    .agg(sum("amount").alias("Total_Amount_Sent"))
    .orderBy("Total_Amount_Sent", ascending=False)
)

total_sent.show(5)


+---------------+--------------------+
|From_Account_id|   Total_Amount_Sent|
+---------------+--------------------+
|        4248684|3.5398504735999995E8|
|              5|       5.852705307E7|
|         337555|         3.5915678E7|
|        6879977|       3.034232922E7|
|         227824|         3.0078477E7|
+---------------+--------------------+
only showing top 5 rows



Total amount received per account

In [None]:
# Summed amount per receiving account, largest total first.
# F.sum is named explicitly: a bare `sum` is Spark's aggregate only if an earlier
# cell's `from pyspark.sql.functions import sum` has already run; otherwise it is
# Python's built-in sum() and this cell fails with an unrelated-looking TypeError.
total_received = (
    df
    .groupBy("To_Account_id")
    .agg(F.sum("amount").alias("Total_Amount_Received"))
    .orderBy("Total_Amount_Received", ascending=False)
)

total_received.show(5)


+-------------+---------------------+
|To_Account_id|Total_Amount_Received|
+-------------+---------------------+
|       513553|       2.1818730096E8|
|       165607|          5.4755214E7|
|      6879977|         4.27353813E7|
|     10583578|          2.8461452E7|
|      4890069|         2.72169795E7|
+-------------+---------------------+
only showing top 5 rows



Top accounts with most transactions

In [28]:
from pyspark.sql.functions import count, col

# Transactions sent per account, keyed by a shared "Account" column.
sent_counts = (
    df
    .groupBy("From_Account_id")
    .agg(count("*").alias("Sent_Count"))
    .withColumnRenamed("From_Account_id", "Account")
)

# Transactions received per account, keyed the same way.
received_counts = (
    df
    .groupBy("To_Account_id")
    .agg(count("*").alias("Received_Count"))
    .withColumnRenamed("To_Account_id", "Account")
)

# The full outer join keeps accounts that only sent or only received; their
# missing count is filled with 0 before the two counts are added together.
all_counts = (
    sent_counts
    .join(received_counts, on="Account", how="full_outer")
    .fillna(0)
    .withColumn("Total_Transactions", col("Sent_Count") + col("Received_Count"))
    .orderBy(col("Total_Transactions").desc())
)

all_counts.show(5)


+-------+----------+--------------+------------------+
|Account|Sent_Count|Received_Count|Total_Transactions|
+-------+----------+--------------+------------------+
|4248684|     15798|           395|             16193|
| 513553|       298|          8934|              9232|
|6879977|      3023|          4289|              7312|
| 227824|      3908|          2546|              6454|
| 686464|      1263|          4771|              6034|
+-------+----------+--------------+------------------+
only showing top 5 rows



Most frequently occurring transaction amount

In [29]:
from pyspark.sql.functions import mode

# Single-row frame holding the most common value of `amount`.
mode_df = df.agg(
    mode("amount").alias("Most_Frequent_Amount")
)

mode_df.show()


+--------------------+
|Most_Frequent_Amount|
+--------------------+
|               100.0|
+--------------------+

