In [None]:
!pip install pyspark,XlsxWriter

In [None]:
# Import necessary Spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, collect_list, max
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName('project')
    .getOrCreate()
)

# Read the CSV dataset: first row is the header, column types are inferred
df = (
    spark.read
    .option('inferschema', 'true')
    .option('header', 'true')
    .csv('/content/2020_olympics_data.csv')
)

# Function to calculate medal counts for a given group column
def result(df, group_col, partition_col=None, rank_limit=None):
    """Aggregate medal counts per group, rank the groups, and display the result.

    Args:
        df: Spark DataFrame with a ``Medal`` column (and an ``Event`` column
            when ``group_col`` is ``"Name"``). Rows whose ``Medal`` equals the
            string ``'None'`` are not counted as medals.
        group_col: Name of the column to aggregate medal counts by. Required.
        partition_col: Optional column name. When given, counts are computed
            per (partition_col, group_col) pair and ranks restart within each
            partition_col value; otherwise a single global ranking is produced.
        rank_limit: Optional maximum rank to keep. A falsy value (``None`` or
            ``0``) keeps every row.

    Returns:
        A Spark DataFrame with ``Gold``, ``Silver``, ``Bronze``, ``Total`` and
        ``Rank`` columns (plus ``Event``, the list of medal-winning events,
        when grouping by ``"Name"``), restricted to groups with at least one
        medal.

    Raises:
        ValueError: If ``group_col`` is empty or ``None``.
    """
    # Fail early with a clear message: without a group column there is
    # nothing to aggregate by (previously this surfaced as UnboundLocalError).
    if not group_col:
        raise ValueError("group_col must be a non-empty column name")

    # Condition shared by the 'Total' count and the event list.
    won_medal = df['Medal'] != 'None'

    # Define the aggregation columns for medal counts
    agg_columns = [
        count(when(df['Medal'] == 'Gold', True)).alias('Gold'),
        count(when(df['Medal'] == 'Silver', True)).alias('Silver'),
        count(when(df['Medal'] == 'Bronze', True)).alias('Bronze'),
        count(when(won_medal, True)).alias('Total'),
    ]

    # If the group column is "Name," also collect the list of events.
    # Collecting inside the same aggregation keeps each event list tied to
    # its exact (partition, name) group; when() yields null for non-medal
    # rows and collect_list skips nulls.
    if group_col == "Name":
        agg_columns.append(
            collect_list(when(won_medal, df['Event'])).alias('Event')
        )

    # Aggregate by the partition column too when one is given, then drop
    # groups without any medal.
    group_keys = [partition_col, group_col] if partition_col else [group_col]
    result_df = df.groupBy(*group_keys).agg(*agg_columns).filter(col('Total') > 0)

    # Keep the column order callers already receive for partitioned "Name"
    # results: the name column first, then the partition column.
    if group_col == "Name" and partition_col:
        leading = [group_col, partition_col]
        trailing = [c for c in result_df.columns if c not in leading]
        result_df = result_df.select(*leading, *trailing)

    # Create a window specification for ranking: most golds first, then
    # silvers, bronzes and total medals as successive tie-breakers.
    ordering = [col(name).desc() for name in ('Gold', 'Silver', 'Bronze', 'Total')]
    if partition_col:
        window_spec = Window.partitionBy(partition_col).orderBy(*ordering)
    else:
        window_spec = Window.orderBy(*ordering)

    # Calculate the rank for each group (row_number gives every row a
    # distinct rank, so fully tied groups are ordered arbitrarily).
    result_df = result_df.withColumn("Rank", F.row_number().over(window_spec))

    # If rank_limit is provided, filter rows by rank
    if rank_limit:
        result_df = result_df.filter(col('Rank') <= rank_limit)

    # Show the result
    result_df.show()
    return result_df

# Run each analysis; every call also displays its table via result()
# 1. Medal count per country
result_df1 = result(df, group_col='Country')

# 2. Countries ranked by medal count within each sport
result_df2 = result(df, group_col='Country', partition_col='Sport')

# 3. Sports ranked by medal count within each country
result_df3 = result(df, group_col='Sport', partition_col='Country')

# 4. Athletes ranked by medal count
result_df4 = result(df, group_col='Name')

# 5. Athletes ranked within each country (result() keeps only groups
#    with at least one medal, so medal-less countries do not appear)
result_df5 = result(df, group_col='Name', partition_col='Country')


In [None]:
import pandas as pd
import xlsxwriter

# Define a function to export multiple DataFrames to an Excel file with different worksheets
def export_to_excel(dataframes, filename):
    """Write each DataFrame in *dataframes* to its own worksheet of *filename*.

    Args:
        dataframes: Mapping of worksheet name to pandas DataFrame.
        filename: Path of the Excel workbook to create.
    """
    workbook = pd.ExcelWriter(filename, engine='xlsxwriter')
    # The context manager saves and closes the workbook on exit.
    with workbook as writer:
        for title, frame in dataframes.items():
            # index=False leaves the DataFrame index out of the sheet
            frame.to_excel(writer, sheet_name=title, index=False)

# Collect each Spark result into a pandas DataFrame for the Excel export
pandas_df1 = result_df1.toPandas()
print(pandas_df1)
pandas_df2 = result_df2.toPandas()
pandas_df3 = result_df3.toPandas()
pandas_df4 = result_df4.toPandas()
pandas_df5 = result_df5.toPandas()

# Worksheet name -> DataFrame written to that worksheet (insertion order
# is the order the sheets are written in)
# NOTE(review): the sheet is called 'Top10Athletes' but result_df4 is built
# without a rank_limit, so it appears to hold every athlete - confirm.
dataframes = dict(
    MedalCountsByCountry=pandas_df1,
    TopCountriesInSport=pandas_df2,
    TopSportsInEachCountry=pandas_df3,
    Top10Athletes=pandas_df4,
    TopAthletesInEachCountry=pandas_df5,
)

# Destination workbook
excel_filename = '/content/olympic_report.xlsx'

# Write every DataFrame to its own worksheet of the workbook
export_to_excel(dataframes, filename=excel_filename)
