In [None]:
from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from tabulate import tabulate
import plotly.express as px
import pandas as pd
import re
from tabulate import tabulate

"""1. Visualization (Analysis Output)"""

def read_analysis_results(filename):
    """Parse the plain-text analysis report into its five sections.

    The file is scanned line by line. A line that starts with one of the
    known section headings switches the parser into that section; the lines
    that follow are interpreted according to the active section:

    * languages / themes: lines starting with "-" are bullet items.
    * theme_languages: a "Theme: <name>" line sets the current theme, and each
      following line containing "Language: <name>, Percentage: <number>%" is
      recorded under that theme.
    * most_languages / least_languages: lines starting with "Theme:" are kept
      verbatim.

    Lines that fit none of these shapes are ignored.

    Args:
        filename: Path of the UTF-8 encoded report file.

    Returns:
        A 5-tuple ``(top_languages, top_themes, theme_language_data,
        most_languages_theme, least_languages_theme)``:

        * top_languages, top_themes: lists of one-element lists, one per
          bullet (i.e. one table row each).
        * theme_language_data: list of ``[theme, language, percentage]`` rows
          with ``percentage`` converted to ``float``. Rows that appear before
          any "Theme:" line get an empty-string theme.
        * most_languages_theme, least_languages_theme: the matching "Theme:"
          lines of the section joined with newlines ("" when there are none).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a percentage matches ``[0-9.]+`` but is not a valid
            float (for example "1.2.3").
    """
    section_headers = (
        ("Top 3 Languages Across All Themes:", "languages"),
        ("Top 3 Themes:", "themes"),
        ("Top 3 themes with the highest percentage of the Top 3 languages:", "theme_languages"),
        ("Theme(s) with the Most Number of Languages:", "most_languages"),
        ("Theme(s) with the Least Number of Languages:", "least_languages"),
    )

    top_languages = []
    top_themes = []
    theme_language_data = []
    most_language_lines = []
    least_language_lines = []

    mode = None
    # Defined up front so a "Language:" row that precedes every "Theme:" row
    # is recorded with an empty theme instead of raising UnboundLocalError.
    current_theme = ""

    with open(filename, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()

            # A section heading only switches mode; it carries no data itself.
            new_mode = next(
                (name for heading, name in section_headers if line.startswith(heading)),
                None,
            )
            if new_mode is not None:
                mode = new_mode
                continue

            if mode == "languages":
                if line.startswith("-"):
                    top_languages.append([line.lstrip("- ")])
            elif mode == "themes":
                if line.startswith("-"):
                    top_themes.append([line.lstrip("- ")])
            elif mode == "theme_languages":
                if line.startswith("Theme:"):
                    # Drop only the "Theme:" label so names that themselves
                    # contain ": " are kept whole.
                    current_theme = line[len("Theme:"):].strip()
                elif "Language:" in line:
                    # search (not match): the guard above accepts the label
                    # anywhere in the line, e.g. after a "- " bullet.
                    match = re.search(r"Language: (.*), Percentage: ([0-9.]+)%", line)
                    if match:
                        language, percentage = match.groups()
                        theme_language_data.append([current_theme, language, float(percentage)])
            elif mode == "most_languages":
                if line.startswith("Theme:"):
                    # The heading says "Theme(s)": keep every tied theme.
                    most_language_lines.append(line)
            elif mode == "least_languages":
                if line.startswith("Theme:"):
                    least_language_lines.append(line)

    return (
        top_languages,
        top_themes,
        theme_language_data,
        "\n".join(most_language_lines),
        "\n".join(least_language_lines),
    )

def merge_theme_rows(df):
    """Blank out repeated theme labels so consecutive rows read as one merged cell.

    Only a row whose "Theme" equals the "Theme" of the row directly above it
    is blanked. A theme that reappears later, after a different theme, keeps
    its label; blanking it would make that row look like it belongs to the
    theme printed above it.

    Args:
        df: pandas DataFrame with a "Theme" column.

    Returns:
        A new DataFrame with the same rows and columns in which repeated
        consecutive "Theme" values are replaced by "". The input frame is
        left unmodified.
    """
    theme = df["Theme"]
    # shift() lines each value up with its predecessor; the first row is
    # compared against a missing value and is therefore never blanked.
    repeats_previous = theme.eq(theme.shift())
    return df.assign(Theme=theme.mask(repeats_previous, ""))

def display_results(filename="analysis_results.txt"):
    """Print the parsed analysis report as grid tables.

    Reads the report through ``read_analysis_results`` and prints, in order:
    the top languages, the top themes, the per-theme language percentages
    (with repeated theme labels blanked by ``merge_theme_rows``), and the
    themes with the most and the least languages. Each block is printed under
    its own heading so the last two can be told apart.

    Args:
        filename: Path of the report file to display. Defaults to
            "analysis_results.txt" in the current working directory; pass a
            different path instead of editing this function.

    Returns:
        None. All output goes to stdout.
    """
    (
        top_languages,
        top_themes,
        theme_language_data,
        most_languages_theme,
        least_languages_theme,
    ) = read_analysis_results(filename)

    print("Top 3 Languages Across All Themes:")
    print(tabulate(top_languages, headers=["Language"], tablefmt="grid"))

    print("\nTop 3 Themes:")
    print(tabulate(top_themes, headers=["Theme"], tablefmt="grid"))

    print("\nTop 3 themes with the highest percentage of the Top 3 languages:")
    df_theme_lang = pd.DataFrame(theme_language_data, columns=["Theme", "Language", "Percentage"])
    df_theme_lang = merge_theme_rows(df_theme_lang)
    print(tabulate(df_theme_lang, headers="keys", tablefmt="grid", showindex=False))

    # The parser keeps only the "Theme: ..." lines of these two sections, so
    # the headings are re-printed here to label which is which.
    print("\nTheme(s) with the Most Number of Languages:")
    print(most_languages_theme)
    print("\nTheme(s) with the Least Number of Languages:")
    print(least_languages_theme)

# Entry point: print the parsed report when this code runs as the top-level
# module (__name__ == "__main__"); importing it from elsewhere prints nothing.
if __name__ == "__main__":
    display_results()

In [None]:
"""2. Visualization (Language distribution of each theme)"""

# Load the per-(language, theme) counts as a *Spark* DataFrame. The code below
# (groupBy/agg, select/collect, toPandas) and the theme-distribution cell
# (orderBy/toPandas on theme_groups) use the Spark API, which a pandas frame
# returned by pd.read_csv does not provide.
# header=True + inferSchema=True mirror pd.read_csv's defaults: column names
# come from the first row and numeric columns are parsed as numbers.
# NOTE(review): assumes the file's header names are language, theme, count.
parsed_df = spark.read.csv(
    "output/part-00000-afe825b1-770c-4416-a851-88671f00ecd6-c000.csv",
    header=True,
    inferSchema=True,
)

# Define the schema for the exported CSV
schema = StructType([
    StructField("language", StringType(), True),
    StructField("theme", StringType(), True),
    StructField("count", IntegerType(), True)
])

# Not used further in this cell; kept so later code can still refer to it.
exported_df = spark.read.csv("stored_output_df.csv", header=True, schema=schema)

# Group by theme and aggregate counts (stays a Spark DataFrame)
theme_groups = parsed_df.groupBy("theme").agg(_sum("count").alias("total_count"))

# Get list of themes
themes = [row.theme for row in theme_groups.select("theme").collect()]

# Bring the counts to the driver once instead of running one Spark job per
# theme; every row ends up in pandas for plotting either way.
language_counts = parsed_df.select("theme", "language", "count").toPandas()

# Number of individual languages shown per chart; the rest become "Others".
TOP_N_LANGUAGES = 5

# Create one donut chart per theme
for theme in themes:
    # Rows of this theme, largest count first
    theme_data = (
        language_counts.loc[language_counts["theme"] == theme, ["language", "count"]]
        .sort_values("count", ascending=False)
    )

    if len(theme_data) > TOP_N_LANGUAGES:  # Only group if there is a tail to fold
        top_rows = theme_data.iloc[:TOP_N_LANGUAGES]
        # .iloc makes the positional intent explicit: after sorting, the index
        # labels no longer match row positions.
        others_row = pd.DataFrame({
            'language': ['Others'],
            'count': [theme_data['count'].iloc[TOP_N_LANGUAGES:].sum()]
        })
        theme_data = pd.concat([top_rows, others_row], ignore_index=True)

    # Create the pie chart
    fig = px.pie(theme_data,
                 values='count',
                 names='language',
                 title=f'Top Languages for "{theme}" Theme',
                 hover_data=['count'],
                 hole=0.3)

    fig.update_traces(
        textposition='inside',
        textinfo='percent+label',
        insidetextfont=dict(size=12, color='white'),
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>Percent: %{percent}"
    )

    # uniformtext_mode='hide' drops labels that would need a font smaller
    # than uniformtext_minsize instead of shrinking them.
    fig.update_layout(
        uniformtext_minsize=10,
        uniformtext_mode='hide',
        height=600,
        showlegend=False
    )

    fig.show()




In [None]:
"""2. Visualization (Theme distribution)"""

# Per-theme totals, largest first, pulled into pandas for plotting.
total_counts = (
    theme_groups
    .orderBy("total_count", ascending=False)
    .toPandas()
)

# One slice per theme, sized by its total count.
fig = px.pie(
    total_counts,
    names='theme',
    values='total_count',
    hover_data=['total_count'],
    title='Overall Theme Distribution',
)

# Figure-level settings: fixed height, no legend (labels sit on the slices).
fig.update_layout(showlegend=False, height=700)

# Slice-level settings: label + percentage inside each slice, custom hover text.
fig.update_traces(
    hovertemplate="<b>%{label}</b><br>Total Count: %{value}<br>Percent: %{percent}",
    insidetextfont={'size': 12, 'color': 'white'},
    textinfo='percent+label',
    textposition='inside',
)

fig.show()

In [45]:
# Stop Spark. Run this cell last: earlier cells call spark.read.csv and other
# Spark DataFrame methods, which need the `spark` object to still be running.
spark.stop()
