In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Covid Data Analysis") \
    .getOrCreate()


In [2]:
# Load the CSV file into a DataFrame
df = spark.read.format("csv") \
            .option("header", True) \
            .option("multiLine", True) \
            .option("ignoreLeadingWhiteSpace",True) \
            .option("ignoreTrailingWhiteSpace",True) \
            .option("escape", "\\") \
            .option("quote", "\"") \
            .load("complete.csv")



In [11]:
from pyspark.sql import types
df = df.withColumn("total_case", df["Total Confirmed cases"].cast(types.LongType()))
df = df.withColumn("total_newly_recovered", df["New recovered"].cast(types.LongType()))
df = df.withColumn("new_cases", df["New cases"].cast(types.LongType()))
df = df.withColumn("state", df["Name of State / UT"].cast(types.StringType()))
df = df.withColumn("death_Case", df["Death"].cast(types.LongType()))
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Total Confirmed cases: string (nullable = true)
 |-- Death: string (nullable = true)
 |-- Cured/Discharged/Migrated: string (nullable = true)
 |-- New cases: string (nullable = true)
 |-- New deaths: string (nullable = true)
 |-- New recovered: string (nullable = true)
 |-- total_case: long (nullable = true)
 |-- total_newly_recovered: long (nullable = true)
 |-- state: string (nullable = true)
 |-- death_Case: long (nullable = true)
 |-- new_cases: long (nullable = true)



## 1. Convert All State Names to Lowercase

In [12]:
from pyspark.sql.functions import lower, col

output_df_1 = df.withColumn('state_lower', lower(col("state")))
output_df_1.select("state_lower").distinct().show()

+--------------------+
|         state_lower|
+--------------------+
|               delhi|
|         maharashtra|
|           meghalaya|
|              odisha|
|             haryana|
|         west bengal|
|                 goa|
|              punjab|
|   jammu and kashmir|
|dadra and nagar h...|
|           karnataka|
|      andhra pradesh|
|           telangana|
|            nagaland|
|               bihar|
|      madhya pradesh|
|           jharkhand|
|               assam|
|              kerala|
|          tamil nadu|
+--------------------+
only showing top 20 rows



## 2. Find the Day with the Greatest Number of COVID Cases

In [19]:
output_df_2 = df.groupBy("Date").sum("total_case").orderBy("sum(total_case)", ascending=False)
output_df_2.show(1)

+----------+---------------+
|      Date|sum(total_case)|
+----------+---------------+
|2020-08-06|        1964536|
+----------+---------------+
only showing top 1 row



## 3. Find the State with the Second-Largest Number of COVID Cases

In [20]:
df_grouped_by_state = df.groupBy("State").sum("total_case").orderBy("sum(total_case)", ascending=False)

second_largest_state = df_grouped_by_state.collect()[1]  # Get the second row
print(second_largest_state)


Row(State='Tamil Nadu', sum(total_case)=7847083)


## 4. Find the Union Territory with the Least Number of Deaths

In [21]:
df_territories = df.filter(df["state"].contains("Union Territory"))

df_least_deaths = df_territories.groupBy("state").sum("death_Case").orderBy("sum(death_Case)").show(1)


+--------------------+---------------+
|               state|sum(death_Case)|
+--------------------+---------------+
|Union Territory o...|              0|
+--------------------+---------------+
only showing top 1 row



## 5. Find the State with the Lowest Death to Total Confirmed Cases Ratio

In [22]:
from pyspark.sql.functions import col

df_ratio = df.withColumn("death_ratio", col("death_Case") / col("total_case"))

df_lowest_ratio = df_ratio.orderBy("death_ratio").select("state", "death_ratio").show(1)


+----------+-----------+
|     state|death_ratio|
+----------+-----------+
|Puducherry|       NULL|
+----------+-----------+
only showing top 1 row



## 6. Find the Month with the Most Newer Recovered Cases

In [26]:
from pyspark.sql.functions import month, col

# Extract month from the "Date" column
df_with_month = df.withColumn("month", month("Date"))

# Group by month and sum the newly recovered cases
df_grouped_by_month = df_with_month.groupBy("month").sum("total_newly_recovered")

# Order by the sum of newly recovered cases in descending order and select the top month
df_grouped_by_month = df_grouped_by_month.orderBy(col("sum(total_newly_recovered)").desc())
top_month = df_grouped_by_month.first()  # Get the top row

# Convert month number to month name
month_dict = {
    1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June",
    7: "July", 8: "August", 9: "September", 10: "October", 11: "November", 12: "December"
}

if top_month:
    top_month_number = top_month["month"]
    top_month_name = month_dict.get(top_month_number, "Unknown")
    print(f"Month with the highest number of newly recovered cases: {top_month_name}")
else:
    print("No data available.")



Month with the highest number of newly recovered cases: July
