In [None]:
import pandas as pd

# Collapse the cluster table to one row per cluster whose "Adverse Events"
# cell is the ";"-joined list of every event string found for that cluster.
df = pd.read_csv("cluster_adverse_event.csv")

df_grouped = (
    # Missing cells are read as float NaN, and ";".join() raises TypeError on
    # non-string items, so drop them before joining.
    df.dropna(subset=["Adverse Events"])
    .groupby("Cluster")["Adverse Events"]
    # astype(str) keeps the join safe if a cell was parsed as a number.
    .agg(lambda events: ";".join(events.astype(str)))
    .reset_index()
)

# The combined file uses a lower-case "cluster" header.
df_grouped.columns = ["cluster", "Adverse Events"]

print(df_grouped.head())

df_grouped.to_csv("cluster_ae_combined.csv", index=False)


In [None]:

# Inputs: the cluster -> events table and a per-event table that supplies the
# "Count" column summed below.
df_cluster = pd.read_csv("cluster_adverse_event.csv")
df_count = pd.read_csv("adverse_events_count_ge_5.csv")

# One row per (Cluster, single event): split the ";"-separated cell and explode.
df_exploded = (
    df_cluster.set_index("Cluster")["Adverse Events"]
    .str.split(";")
    .explode()
    .reset_index()
    .rename(columns={"Adverse Events": "Adverse Event"})
)

# Trim surrounding whitespace on both sides of the join key.
df_count["Adverse Event"] = df_count["Adverse Event"].str.strip()
df_exploded["Adverse Event"] = df_exploded["Adverse Event"].str.strip()

# Left join keeps every exploded event, including those without a Count row.
df_merged = df_exploded.merge(df_count, how="left", on="Adverse Event")

# Total Count per cluster.
df_cluster_trend = df_merged.groupby("Cluster")["Count"].sum().reset_index()

print(df_cluster_trend.head())

df_cluster_trend.to_csv("cluster_trend_summary.csv", index=False)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of the per-cluster totals stored in cluster_trend_summary.csv.
df = pd.read_csv("cluster_trend_summary.csv")

plt.figure(figsize=(12, 8))
bars = plt.barh(df["Cluster"], df["Count"], color="orange")
plt.xlabel("Total AE Count (Over 12 Years)")
plt.title("Total AE Occurrences per Cluster (12 Years)")

# Gap between bar end and label, as a fraction of the longest bar, so the
# spacing scales with the data instead of being a fixed number of count units.
label_pad = 0.01 * df["Count"].max() if len(df) else 0

for bar in bars:
    width = bar.get_width()
    # Format without decimals: the width may come back as a float (e.g. 123.0).
    plt.text(
        width + label_pad,
        bar.get_y() + bar.get_height() / 2,
        f"{width:.0f}",
        va="center",
    )

# Largest-first files read top to bottom once the y axis is flipped.
plt.gca().invert_yaxis()
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.tight_layout()

plt.show()


In [None]:
import pandas as pd


# --- Term -> cluster lookup -------------------------------------------------
df_classified = pd.read_csv('cluster_adverse_event.csv', encoding='ISO-8859-1')

# One row per individual term: split the ";"-separated cell and explode.
df_ae_map = df_classified.assign(
    AE_Name=df_classified['Adverse Events'].str.split(';')
).explode('AE_Name')

# Normalise the join key (trim + lower-case) the same way on both sides.
df_ae_map['AE_Name'] = df_ae_map['AE_Name'].str.strip().str.lower()

# Blank terms (from a trailing ";") and missing terms would otherwise match
# blank/missing terms on the report side, because merge pairs equal keys and
# also pairs NaN with NaN.
df_ae_map = df_ae_map[df_ae_map['AE_Name'].notna() & (df_ae_map['AE_Name'] != '')]
# A (Cluster, term) pair listed more than once would count each report twice.
df_ae_map = df_ae_map.drop_duplicates(subset=['Cluster', 'AE_Name'])


# --- Report-level data ------------------------------------------------------
df_raw = pd.read_csv('hiv_directly_related.csv', encoding='utf-8')

# Year is kept as the first four characters of FDA_DT (a string), so the pivot
# columns below are strings as well.
df_raw['Year'] = df_raw['FDA_DT'].astype(str).str[:4]

# Parse the year leniently: a missing or malformed FDA_DT becomes NaN and is
# excluded by between(), instead of raising as astype(int) would.
year_num = pd.to_numeric(df_raw['Year'], errors='coerce')
df_raw = df_raw[year_num.between(2013, 2024)]


# One row per (report, single term).
df_expanded = df_raw.assign(
    AE_Name=df_raw['PT'].str.split(';')
).explode('AE_Name')

df_expanded['AE_Name'] = df_expanded['AE_Name'].str.strip().str.lower()
df_expanded = df_expanded[df_expanded['AE_Name'].notna() & (df_expanded['AE_Name'] != '')]


# Attach the cluster to every term; terms absent from the lookup are dropped.
df_labeled = pd.merge(
    df_expanded[['Year', 'AE_Name']],
    df_ae_map[['Cluster', 'AE_Name']],
    on='AE_Name',
    how='inner'
)


# Count term occurrences per (Cluster, Year) and spread years across columns;
# combinations that never occur are filled with 0.
df_counts = df_labeled.groupby(['Cluster', 'Year']).size().reset_index(name='Count')

df_pivot = df_counts.pivot(index='Cluster', columns='Year', values='Count').fillna(0).astype(int)


df_pivot.to_csv('13_04_category_year_trend.csv')


print(df_pivot)


In [None]:
# Rank clusters by their total over all year columns and keep the first ten.
top_clusters = df_pivot.sum(axis=1).sort_values(ascending=False).index[:10]
df_top = df_pivot.loc[top_clusters]

# One line per cluster, x = the pivot's year columns.
plt.figure(figsize=(14, 7))
for cluster, yearly_counts in df_top.iterrows():
    plt.plot(df_top.columns, yearly_counts, label=cluster)

plt.title('Trend of Top 10 AE Categories Over Time')
plt.xlabel('Year')
plt.ylabel('Event Count')
plt.xticks(rotation=45)
# Legend is anchored outside the axes, to the right of the plot area.
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()




In [None]:
import math  # math.ceil is used below and no cell shown above imports math

# Order clusters from largest to smallest all-years total.
clusters_sorted = df_pivot.sum(axis=1).sort_values(ascending=False).index
df_sorted = df_pivot.loc[clusters_sorted]

# Draw one figure per consecutive group of `group_size` clusters.
group_size = 10
total_groups = math.ceil(len(df_sorted) / group_size)

for i in range(total_groups):
    start_idx = i * group_size
    # The last group may be shorter than group_size.
    end_idx = min((i + 1) * group_size, len(df_sorted))
    group_clusters = df_sorted.index[start_idx:end_idx]
    df_group = df_sorted.loc[group_clusters]

    plt.figure(figsize=(14, 7))
    for cluster in df_group.index:
        plt.plot(df_group.columns, df_group.loc[cluster], label=cluster)

    plt.xlabel('Year')
    plt.ylabel('Event Count')
    # Title shows the 1-based rank range covered by this figure.
    plt.title(f'Trend of AE Categories Over Time (Clusters {start_idx + 1}-{end_idx})')
    plt.xticks(rotation=45)
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


In [None]:
# Flip the pivot: one row per year, one column per cluster.
df_yearly = df_pivot.transpose()

# Rescale every row so that the clusters of a year add up to 100.
df_yearly_percent = df_yearly.div(df_yearly.sum(axis=1), axis=0).mul(100)

# 100% stacked bars, one bar per year.
df_yearly_percent.plot.bar(stacked=True, figsize=(14, 7), colormap='tab20')

plt.title('AE Category Proportional Distribution by Year')
plt.xlabel('Year')
plt.ylabel('Percentage of Total AEs')
plt.xticks(rotation=45)
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Percentage table with years as rows and clusters as columns: divide each
# year column of the pivot by that year's total, then transpose.
df_yearly_percent = df_pivot.div(df_pivot.sum(axis=0), axis=1).T * 100

# Generate abbreviation dictionary
def abbreviate(name):
    """Return the upper-cased first characters of the whitespace-separated words in ``name``.

    An empty or whitespace-only ``name`` yields an empty string.
    """
    initials = []
    for word in name.split():
        initials.append(word[0].upper())
    return ''.join(initials)

# Cluster name -> initials, used as the in-bar labels.
abbr_dict = dict(zip(df_yearly_percent.columns, map(abbreviate, df_yearly_percent.columns)))

# 100% stacked bars, one bar per year.
ax = df_yearly_percent.plot.bar(stacked=True, figsize=(14, 7), colormap='tab20')

plt.title('AE Category Proportional Distribution by Year (Top 2 Labeled)')
plt.xlabel('Year')
plt.ylabel('Percentage of Total AEs')
plt.xticks(rotation=45)
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

# Label the two largest segments of every bar with "<initials>\n<percent>".
for bar_pos, (year, shares) in enumerate(df_yearly_percent.iterrows()):
    top2 = shares.sort_values(ascending=False).head(2)

    # Segments are stacked in column order, so a running total gives each
    # segment's bottom edge.
    bottom = 0
    for cluster, height in shares.items():
        if cluster in top2.index:
            # The largest segment is set in bold, the runner-up in normal weight.
            weight = 'bold' if cluster == top2.index[0] else 'normal'
            ax.text(
                bar_pos,
                bottom + height / 2,
                f'{abbr_dict[cluster]}\n{height:.1f}%',
                ha='center',
                va='center',
                fontsize=8,
                weight=weight,
            )
        bottom += height

plt.show()




In [None]:

# Each cell as a percentage of its year column's total, rounded to two decimals.
df_percent = (df_pivot / df_pivot.sum(axis=0)).mul(100).round(2)

df_percent.to_csv('13_04_category_year_percentage.csv')

print(df_percent)


In [None]:

import math  # math.ceil is used below and no cell shown above imports math

# Order clusters by the sum of their yearly percentages, largest first.
clusters_sorted = df_percent.sum(axis=1).sort_values(ascending=False).index
df_sorted = df_percent.loc[clusters_sorted]

# Draw one figure per consecutive group of `group_size` clusters.
group_size = 10
total_groups = math.ceil(len(df_sorted) / group_size)

for i in range(total_groups):
    start_idx = i * group_size
    # The last group may be shorter than group_size.
    end_idx = min((i + 1) * group_size, len(df_sorted))
    group_clusters = df_sorted.index[start_idx:end_idx]
    df_group = df_sorted.loc[group_clusters]

    plt.figure(figsize=(14, 7))
    for cluster in df_group.index:
        plt.plot(df_group.columns, df_group.loc[cluster], marker='o', linewidth=2, label=cluster)

    plt.xlabel('Year')
    plt.ylabel('% of Total AEs')
    plt.title(f'AE Category Percentage Trends (Clusters {start_idx + 1}-{end_idx})')
    plt.xticks(rotation=45)

    # Scale the y axis to this group's own maximum plus 10% headroom.
    ymax = df_group.max().max()
    plt.ylim(0, ymax * 1.1)
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


In [None]:
import math  # math.ceil is used below and no cell shown above imports math

# Small multiples: one subplot per cluster, `categories_per_page` subplots per figure.
categories_per_page = 6
ncols = 3
# Derive the grid from categories_per_page so changing it cannot overflow the axes array.
nrows = math.ceil(categories_per_page / ncols)

all_clusters = df_percent.index.tolist()
total_clusters = len(all_clusters)
total_pages = math.ceil(total_clusters / categories_per_page)

# Every subplot shares the same y range: the table-wide maximum plus 10% headroom.
y_top = df_percent.max().max() * 1.1

for page in range(total_pages):
    start = page * categories_per_page
    end = min(start + categories_per_page, total_clusters)
    clusters_subset = all_clusters[start:end]

    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 4 * nrows), squeeze=False)
    axes = axes.flatten()

    for ax, cluster in zip(axes, clusters_subset):
        ax.plot(df_percent.columns, df_percent.loc[cluster], marker='o', linewidth=2)
        ax.set_title(cluster, fontsize=10)
        ax.set_xlabel('Year')
        ax.set_ylabel('% of Total AEs')
        ax.set_xticks(df_percent.columns)
        ax.set_xticklabels(df_percent.columns, rotation=45)
        ax.set_ylim(0, y_top)

    # Remove the unused panels on a partially filled (last) page. Slicing by
    # the subset length avoids depending on a loop variable left over from
    # the plotting loop.
    for unused_ax in axes[len(clusters_subset):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.suptitle(f'AE Category Trends (% per Year) - Page {page + 1}', fontsize=14, y=1.02)
    plt.show()


In [None]:

# --- Term -> cluster lookup -------------------------------------------------
df_classified = pd.read_csv('cluster_adverse_event.csv', encoding='ISO-8859-1')
# One row per individual term: split the ";"-separated cell and explode.
df_ae_map = df_classified.assign(
    AE_Name=df_classified['Adverse Events'].str.split(';')
).explode('AE_Name')
df_ae_map['AE_Name'] = df_ae_map['AE_Name'].str.strip().str.lower()
# Blank terms (from a trailing ";") and missing terms would otherwise match
# blank/missing terms on the report side, because merge pairs equal keys and
# also pairs NaN with NaN.
df_ae_map = df_ae_map[df_ae_map['AE_Name'].notna() & (df_ae_map['AE_Name'] != '')]
# A (Cluster, term) pair listed more than once would count each report twice.
df_ae_map = df_ae_map.drop_duplicates(subset=['Cluster', 'AE_Name'])


# --- Report-level data ------------------------------------------------------
df_raw = pd.read_csv('hiv_directly_related.csv', encoding='utf-8')
# Year is kept as the first four characters of FDA_DT (a string).
df_raw['Year'] = df_raw['FDA_DT'].astype(str).str[:4]

# Keep only rows whose SEX is exactly 'F' or 'M'.
df_raw = df_raw[df_raw['SEX'].isin(['F', 'M'])]
# Parse the year leniently: a missing or malformed FDA_DT becomes NaN and is
# excluded by between(), instead of raising as astype(int) would.
year_num = pd.to_numeric(df_raw['Year'], errors='coerce')
df_raw = df_raw[year_num.between(2013, 2024)]

def build_gender_pivot(df_gender_raw, gender_label, ae_map=None):
    """Count term occurrences per (Cluster, Year) and save the table as CSV.

    Parameters
    ----------
    df_gender_raw : DataFrame
        Rows to count. Must provide a ``Year`` column and a ``PT`` column
        holding ";"-separated terms.
    gender_label : str
        Inserted into the output file name
        ``13_04_{gender_label}_category_year_trend.csv``.
    ae_map : DataFrame, optional
        Lookup with ``Cluster`` and ``AE_Name`` columns (``AE_Name`` already
        trimmed and lower-cased). Defaults to the notebook-level ``df_ae_map``.

    Returns
    -------
    DataFrame
        Integer counts indexed by ``Cluster`` with one column per ``Year``;
        combinations that never occur are 0.
    """
    if ae_map is None:
        # Fall back to the lookup defined at notebook level.
        ae_map = df_ae_map

    # One row per (report, single term), normalised like the lookup key.
    df_expanded = df_gender_raw.assign(
        AE_Name=df_gender_raw['PT'].str.split(';')
    ).explode('AE_Name')
    df_expanded['AE_Name'] = df_expanded['AE_Name'].str.strip().str.lower()

    # merge pairs NaN keys with NaN keys and '' with '', so missing or blank
    # terms must be removed or they would be counted under whichever cluster
    # also has a missing/blank entry.
    has_term = df_expanded['AE_Name'].notna() & (df_expanded['AE_Name'] != '')
    df_expanded = df_expanded[has_term]

    # A repeated (Cluster, term) pair in the lookup would count each report twice.
    lookup = ae_map[['Cluster', 'AE_Name']].drop_duplicates()

    # Terms absent from the lookup are dropped by the inner join.
    df_labeled = pd.merge(
        df_expanded[['Year', 'AE_Name']],
        lookup,
        on='AE_Name',
        how='inner'
    )

    df_counts = df_labeled.groupby(['Cluster', 'Year']).size().reset_index(name='Count')
    df_pivot = df_counts.pivot(index='Cluster', columns='Year', values='Count').fillna(0).astype(int)

    df_pivot.to_csv(f'13_04_{gender_label}_category_year_trend.csv')
    return df_pivot

# Build (and write to CSV) one Cluster x Year count table per SEX value.
df_female = build_gender_pivot(df_raw.loc[df_raw['SEX'].eq('F')], 'female')
df_male = build_gender_pivot(df_raw.loc[df_raw['SEX'].eq('M')], 'male')

In [None]:
# Reload the per-sex Cluster x Year count tables from CSV.
df_female = pd.read_csv('13_04_female_category_year_trend.csv', index_col=0)
df_male = pd.read_csv('13_04_male_category_year_trend.csv', index_col=0)

# The two tables need not contain the same clusters or years. Align them on
# the union of both, filling gaps with 0, so that:
#   * a year present in only one table does not raise KeyError, and
#   * a cluster present for only one sex is plotted instead of turning into
#     NaN in the sum below and being silently dropped.
female_aligned, male_aligned = df_female.align(df_male, join='outer', fill_value=0)

years = female_aligned.columns

bar_width = 0.4

# One grouped bar chart per year.
for year in years:
    female_counts = female_aligned[year]
    male_counts = male_aligned[year]

    # Keep clusters with at least one event that year, largest combined count first.
    combined = female_counts + male_counts
    valid_clusters = combined[combined > 0].sort_values(ascending=False).index

    x = list(range(len(valid_clusters)))

    plt.figure(figsize=(16, 6))
    # Male bars sit half a bar-width left of the tick, female bars half a bar-width right.
    plt.bar([i - bar_width/2 for i in x], male_counts[valid_clusters], width=bar_width, label='Male', color='skyblue')
    plt.bar([i + bar_width/2 for i in x], female_counts[valid_clusters], width=bar_width, label='Female', color='salmon')

    plt.xticks(ticks=x, labels=valid_clusters, rotation=45, ha='right')
    plt.ylabel('Event Count')
    plt.title(f'AE Category Gender Comparison in {year}')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# Per sex: each cluster's share (%) of that sex's events in a given year.
df_percent_male = df_male.div(df_male.sum(axis=0), axis=1) * 100
df_percent_female = df_female.div(df_female.sum(axis=0), axis=1) * 100

# Rank clusters by the sum of both sexes' percentages over all years.
top_n = 10
combined_total = df_percent_male.add(df_percent_female, fill_value=0)
top_clusters = combined_total.sum(axis=1).sort_values(ascending=False).head(top_n).index

# combined_total covers the union of both tables, so a top cluster (or a year)
# may be absent from one of them; reindex with 0 so .loc[cluster] below cannot
# raise KeyError.
df_percent_male = df_percent_male.reindex(
    index=combined_total.index, columns=combined_total.columns, fill_value=0
)
df_percent_female = df_percent_female.reindex(
    index=combined_total.index, columns=combined_total.columns, fill_value=0
)

# plt.get_cmap is used because `cm` is not imported by any cell shown above,
# and matplotlib.cm.get_cmap is no longer available in recent matplotlib.
colors = plt.get_cmap('tab10', top_n)

plt.figure(figsize=(16, 8))

# Same colour per cluster; solid line = male, dashed line = female.
for idx, cluster in enumerate(top_clusters):
    color = colors(idx)
    plt.plot(
        df_percent_male.columns,
        df_percent_male.loc[cluster],
        label=f'{cluster} - Male',
        linestyle='-',
        marker='o',
        color=color
    )
    plt.plot(
        df_percent_female.columns,
        df_percent_female.loc[cluster],
        label=f'{cluster} - Female',
        linestyle='--',
        marker='o',
        color=color
    )

plt.title('Top AE Categories - Gender Comparison (% per Year)', fontsize=14)
plt.xlabel('Year')
plt.ylabel('Percentage of Total AEs')
plt.xticks(rotation=45)
plt.legend(title='Cluster - Gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()



In [None]:
import math  # math.ceil is used below and no cell shown above imports math

# Restrict both tables to the clusters and years they have in common.
common_clusters = df_male.index.intersection(df_female.index)
common_years = df_male.columns.intersection(df_female.columns)

df_m = df_male.loc[common_clusters, common_years]
df_f = df_female.loc[common_clusters, common_years]

# Share of each sex within a (cluster, year) cell; a cell with no events at
# all is 0/0 and therefore NaN.
df_total = df_m + df_f
df_percent_male = df_m.div(df_total) * 100
df_percent_female = df_f.div(df_total) * 100

# Clusters with the largest combined event count.
top_n = 20
combined_total = df_total.sum(axis=1)
top_clusters = combined_total.sort_values(ascending=False).head(top_n).index

cols = 3
# Size the grid from the clusters actually selected (there may be fewer than top_n).
rows = max(1, math.ceil(len(top_clusters) / cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows), sharex=True, sharey=True, squeeze=False)
axes = axes.flatten()
# plt.get_cmap is used because `cm` is not imported by any cell shown above,
# and matplotlib.cm.get_cmap is no longer available in recent matplotlib.
colors = plt.get_cmap('tab10', top_n)

# One panel per cluster; solid line = male, dashed line = female.
for idx, cluster in enumerate(top_clusters):
    ax = axes[idx]
    color = colors(idx)

    ax.plot(df_percent_male.columns, df_percent_male.loc[cluster], label='Male', linestyle='-', marker='o', color=color)
    ax.plot(df_percent_female.columns, df_percent_female.loc[cluster], label='Female', linestyle='--', marker='o', color=color)

    ax.set_title(cluster, fontsize=11)
    ax.set_xticks(df_percent_male.columns)
    ax.set_xticklabels(df_percent_male.columns, rotation=45)
    ax.set_ylabel('% within Cluster')

    ax.legend(loc='upper right', fontsize=8)

# Remove unused panels. Slicing by the number of clusters avoids relying on
# `idx`, which is undefined when no cluster was selected.
for unused_ax in axes[len(top_clusters):]:
    fig.delaxes(unused_ax)

fig.suptitle('Gender Ratio within AE Category (% per Year)', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Rows: the clusters in `top_clusters`; values: df_percent_female.
df_heatmap = df_percent_female.loc[top_clusters]

fig, ax = plt.subplots(figsize=(14, 8))
# Diverging palette centred on 50.
sns.heatmap(
    df_heatmap,
    cmap="RdBu_r",
    center=50,
    annot=True,
    fmt=".1f",
    cbar_kws={"label": "% Female"},
    ax=ax,
)

ax.set_title("Female Proportion in AE Categories Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Cluster")
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Rows: the clusters in `top_clusters`; values: df_percent_male.
df_heatmap = df_percent_male.loc[top_clusters]

plt.figure(figsize=(14, 8))
# Diverging palette centred on 50. The colorbar label names the plotted
# quantity: this heatmap shows the male table, so it reads "% Male".
sns.heatmap(df_heatmap, cmap="RdBu_r", center=50, annot=True, fmt=".1f", cbar_kws={"label": "% Male"})

plt.title("Male Proportion in AE Categories Over Time")
plt.xlabel("Year")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()


In [None]:
# Inputs: report-level rows and the combined cluster table (lower-case
# "cluster" column, ";"-separated "Adverse Events").
df_raw = pd.read_csv('hiv_directly_related.csv')
df_cluster = pd.read_csv('cluster_ae_combined.csv')

# One row per (cluster, single term), key trimmed and lower-cased.
df_cluster_expanded = df_cluster.assign(
    AE_Name=df_cluster['Adverse Events'].str.split(';')
).explode('AE_Name')
df_cluster_expanded['AE_Name'] = df_cluster_expanded['AE_Name'].str.strip().str.lower()

# One row per (report, single term), key normalised the same way.
df_raw_expanded = df_raw.assign(
    AE_Name=df_raw['PT'].str.split(';')
).explode('AE_Name')
df_raw_expanded['AE_Name'] = df_raw_expanded['AE_Name'].str.strip().str.lower()
# merge pairs NaN keys with NaN keys and '' with '', so drop missing/blank
# terms before joining.
df_raw_expanded = df_raw_expanded[
    df_raw_expanded['AE_Name'].notna() & (df_raw_expanded['AE_Name'] != '')
]

# Terms absent from the cluster table are dropped by the inner join.
df_merged = pd.merge(
    df_raw_expanded[['AE_Name', 'AGE']],
    df_cluster_expanded[['AE_Name', 'cluster']],
    on='AE_Name',
    how='inner'
)

# Parse AGE as a number; anything unparseable becomes NaN. between() then
# keeps 0..120 inclusive and rejects NaN.
df_merged = df_merged.assign(AGE=pd.to_numeric(df_merged['AGE'], errors='coerce'))
df_merged = df_merged[df_merged['AGE'].between(0, 120)].copy()

# Left-closed decade bins: [0,10) -> '0-9', [10,20) -> '10-19', ..., [80,inf) -> '80+'.
# With right=False, age 0 is included and a fractional age such as 9.5 stays
# in '0-9'. The upper limit of 120 is enforced by the filter above.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, float('inf')]
labels = ['0-9', '10-19', '20-29', '30-39', '40-49',
          '50-59', '60-69', '70-79', '80+']
df_merged['AGE_GROUP'] = pd.cut(df_merged['AGE'], bins=bins, labels=labels, right=False)

df_age_filtered = df_merged.dropna(subset=['AGE_GROUP'])

# observed=False keeps a zero-count row for every age group of each cluster,
# independent of the pandas default for categorical group keys.
count_df = (
    df_age_filtered.groupby(['cluster', 'AGE_GROUP'], observed=False)
    .size()
    .reset_index(name='count')
)

# Share (%) of each age group within its cluster, rounded to two decimals.
total_per_cluster = count_df.groupby('cluster')['count'].transform('sum')

count_df['percentage'] = (count_df['count'] / total_per_cluster) * 100

count_df['percentage'] = count_df['percentage'].round(2)

print(count_df)


In [None]:
import math  # math.ceil is used below and no cell shown above imports math

# The 20 clusters with the most counted rows.
top_clusters = count_df.groupby('cluster')['count'].sum().sort_values(ascending=False).head(20).index
df_plot = count_df[count_df['cluster'].isin(top_clusters)]

# Fixed x order: the categories of the AGE_GROUP column.
age_groups = df_plot['AGE_GROUP'].cat.categories.tolist()

cols = 4
rows = max(1, math.ceil(len(top_clusters) / cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows), sharey=True, squeeze=False)
axes = axes.flatten()
# Colours are picked per cluster (idx % 10), so use the full 10-colour tab10
# palette rather than one resampled to the number of age groups.
# plt.get_cmap is used because `cm` is not imported by any cell shown above.
colors = plt.get_cmap('tab10')

for idx, cluster in enumerate(top_clusters):
    ax = axes[idx]
    data = df_plot[df_plot['cluster'] == cluster]
    # Reindex so every age group has a bar position; groups without a row
    # are drawn with height 0.
    data = data.set_index('AGE_GROUP').reindex(age_groups)
    ax.bar(age_groups, data['percentage'].fillna(0), color=colors(idx % 10))
    ax.set_title(cluster, fontsize=10)
    # Rotate the existing tick labels instead of calling set_xticklabels
    # without first fixing the tick positions.
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylabel('% within Cluster')

# Remove unused panels. Slicing by the number of clusters avoids relying on
# `idx`, which is undefined when no cluster was selected.
for unused_ax in axes[len(top_clusters):]:
    fig.delaxes(unused_ax)

fig.suptitle('Age Group Distribution within Top AE Clusters (%)', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# Cluster x age-group matrix of percentages, limited to `top_clusters`.
in_top = count_df['cluster'].isin(top_clusters)
heatmap_df = count_df.loc[in_top].pivot(index='cluster', columns='AGE_GROUP', values='percentage')

# Annotated heatmap, one row per cluster.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    heatmap_df,
    annot=True,
    fmt=".1f",
    cmap="YlGnBu",
    cbar_kws={'label': '% within Cluster'},
    ax=ax,
)

ax.set_title('Age Distribution across Top AE Clusters (%)')
ax.set_xlabel('Age Group')
ax.set_ylabel('AE Cluster')
fig.tight_layout()
plt.show()
