In [None]:
import pandas as pd
import os
import glob

In [None]:
# Legend roster grouped by category. The order of the categories, and of the
# legends inside each category, is the order used when the data is loaded.
legend_categories = dict(
    Assault=['Bangalore', 'Fuse', 'Ash', 'Mad Maggie', 'Ballistic'],
    Skirmisher=['Pathfinder', 'Wraith', 'Octane', 'Revenant', 'Horizon', 'Valkyrie', 'Alter'],
    Recon=['Bloodhound', 'Crypto', 'Seer', 'Vantage'],
    Support=['Gibraltar', 'Lifeline', 'Mirage', 'Loba', 'Newcastle', 'Conduit'],
    Controller=['Caustic', 'Wattson', 'Rampart', 'Catalyst'],
)

In [None]:
# Invert the category -> legends mapping into a flat legend -> category lookup
legend_to_category = {
    legend_name: category_name
    for category_name, roster in legend_categories.items()
    for legend_name in roster
}


In [None]:
# Root folder that contains the four per-metric data folders.
# An empty string makes the folder names below plain relative paths.
base_path = ""

# One sub-folder per metric, each joined onto base_path
damage_path, kills_path, matches_path, wins_path = (
    os.path.join(base_path, folder_name)
    for folder_name in ('Legend Damage', 'Legend Kills', 'Legend Matches Played', 'Legend Wins')
)
# Function to list files in a directory
def list_files(directory):
    """Print the names of the entries found in ``directory``.

    Args:
        directory: Path of the folder to inspect.

    Returns:
        None. The listing is written to standard output.

    Entries are printed in sorted order so the output is identical on every
    run (``os.listdir`` returns them in arbitrary order). A folder that does
    not exist is reported with a message instead of raising, so one missing
    folder does not stop the remaining folders from being listed.
    """
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' does not exist. Skipping.")
        return
    print(f"Files in {directory}:")
    for entry in sorted(os.listdir(directory)):
        print(entry)

# Show the contents of every metric folder: damage, kills, matches, then wins
for metric_dir in (damage_path, kills_path, matches_path, wins_path):
    list_files(metric_dir)

In [None]:
# Accumulator that receives one DataFrame per successfully loaded legend
legend_dataframes = list()

# All legend names, in the insertion order of legend_to_category
legends = [*legend_to_category]

In [None]:
def convert_number(s):
    """Convert one raw CSV cell to ``int``; return ``pd.NA`` if it is not numeric.

    Commas, double quotes and surrounding whitespace are stripped first, so a
    cell such as ``"1,234"`` is read as ``1234``.

    Args:
        s: Raw cell value handed over by ``pd.read_csv``.

    Returns:
        The parsed integer, or ``pd.NA`` when ``int()`` rejects the cleaned text.
    """
    try:
        return int(str(s).replace(',', '').replace('"', '').strip())
    except ValueError:
        return pd.NA


# Start from an empty list so that re-running this cell does not append a
# second copy of every legend on top of the results of a previous run.
legend_dataframes = []

# Loop over each legend
for legend in legends:
    try:
        # Build file paths for the legend
        legend_damage_file = os.path.join(damage_path, f"{legend}_damage.csv")
        legend_kills_file = os.path.join(kills_path, f"{legend}_kills.csv")
        legend_matches_file = os.path.join(matches_path, f"{legend}_games_played.csv")  # Updated suffix
        legend_wins_file = os.path.join(wins_path, f"{legend}_wins.csv")

        # Check if all files exist
        required_files = [legend_damage_file, legend_kills_file, legend_matches_file, legend_wins_file]
        missing_files = [f for f in required_files if not os.path.exists(f)]
        if missing_files:
            print(f"Data files for legend '{legend}' are missing: {missing_files}. Skipping this legend.")
            continue  # Skip to the next legend if any file is missing

        # Read the data files; the first row of each file is skipped and the
        # single column is given an explicit name.
        df_damage = pd.read_csv(legend_damage_file, header=None, names=['Damage'], skiprows=1)
        df_matches = pd.read_csv(legend_matches_file, header=None, names=['Games Played'], skiprows=1)
        df_wins = pd.read_csv(legend_wins_file, header=None, names=['Wins'], skiprows=1)

        # Kills are parsed through convert_number (defined once, above the
        # loop), so non-numeric cells become pd.NA instead of failing the read.
        df_kills = pd.read_csv(legend_kills_file, header=None, names=['Kills'], converters={0: convert_number}, skiprows=1)

        # Combine the data into a single DataFrame, aligning on index (axis=1)
        df_legend = pd.concat([df_damage.reset_index(drop=True),
                               df_kills.reset_index(drop=True),
                               df_matches.reset_index(drop=True),
                               df_wins.reset_index(drop=True)], axis=1)

        # Add 'legend_name' column to identify the legend in the combined DataFrame
        df_legend['legend_name'] = legend

        # Append the processed DataFrame to the list
        legend_dataframes.append(df_legend)

    except Exception as e:
        # Best effort: report the failure for this legend and continue with the next one
        print(f"An error occurred while processing legend '{legend}': {e}")

In [None]:
# Concatenate all legend dataframes
if not legend_dataframes:
    # pd.concat would also raise ValueError on an empty list; this message says why.
    raise ValueError("No legend data was loaded; check base_path and the data folders.")
all_legends_df = pd.concat(legend_dataframes, ignore_index=True)
# Map legends to categories
all_legends_df['Legend_Category'] = all_legends_df['legend_name'].map(legend_to_category)
# List of legends with missing Wins data
legends_missing_wins = ['Ballistic', 'Conduit', 'Alter', 'Newcastle']
# Filter out legends with missing Wins data. The explicit .copy() makes the
# result an independent DataFrame, so assigning columns to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
all_legends_df = all_legends_df[~all_legends_df['legend_name'].isin(legends_missing_wins)].copy()
print(all_legends_df)

In [None]:
def safe_ratio(numerator, denominator):
    """Divide two aligned Series element-wise, giving 0 where the denominator is NaN or <= 0.

    Args:
        numerator: Series of dividend values.
        denominator: Series of divisor values, aligned with ``numerator``.

    Returns:
        A Series holding ``numerator / denominator`` where the denominator is
        positive and 0 elsewhere. A NaN numerator over a positive denominator
        stays NaN.
    """
    # `denominator > 0` is False for NaN, so missing denominators also map to 0.
    return (numerator / denominator).where(denominator > 0, 0)


# Ensure that columns are numeric, forcing invalid values to NaN
for column in ('Kills', 'Wins', 'Games Played', 'Damage'):
    all_legends_df[column] = pd.to_numeric(all_legends_df[column], errors='coerce')

# Avoid division by zero and handle NaN (vectorised; no row-by-row apply)
all_legends_df['Kills_per_Win'] = safe_ratio(all_legends_df['Kills'], all_legends_df['Wins'])
all_legends_df['Kills_per_Match'] = safe_ratio(all_legends_df['Kills'], all_legends_df['Games Played'])
all_legends_df['Damage_per_Match'] = safe_ratio(all_legends_df['Damage'], all_legends_df['Games Played'])

# Group by 'Legend_Category' and calculate the mean of the relevant columns
averaged_stats_df = all_legends_df.groupby('Legend_Category').agg(
    Average_Kills_per_Win=('Kills_per_Win', 'mean'),
    Average_Kills_per_Match=('Kills_per_Match', 'mean'),
    Average_Damage_per_Match=('Damage_per_Match', 'mean')
).reset_index()

# Display the aggregated statistics
print(averaged_stats_df)

# Alter, Ballistic, Conduit, and Newcastle have no wins data, so they cannot be used to calculate win-based metrics such as the win rate.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set Seaborn style
sns.set(style="whitegrid")


def plot_category_average(metric_column, metric_label, palette_name):
    """Draw one bar chart of ``metric_column`` from ``averaged_stats_df``, one bar per legend category.

    Args:
        metric_column: Column of ``averaged_stats_df`` plotted on the y axis.
        metric_label: Text used for the y-axis label and as the start of the title.
        palette_name: Name of the Seaborn palette used for the bars.
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Legend_Category', y=metric_column, data=averaged_stats_df, palette=palette_name)
    plt.title(f'{metric_label} by Legend Category')
    plt.xlabel('Legend Category')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


# Plots 1-3: average kills per win, kills per match and damage per match
for metric_column, metric_label, palette_name in (
    ('Average_Kills_per_Win', 'Average Kills per Win', 'Blues_d'),
    ('Average_Kills_per_Match', 'Average Kills per Match', 'Oranges_d'),
    ('Average_Damage_per_Match', 'Average Damage per Match', 'Greens_d'),
):
    plot_category_average(metric_column, metric_label, palette_name)


In [None]:
# Legends for which no Wins data is available
legends_missing_wins = ['Ballistic', 'Conduit', 'Alter', 'Newcastle']

# Boolean mask that is True for every row whose legend is NOT in that list
has_wins_data = ~all_legends_df['legend_name'].isin(legends_missing_wins)
filtered_legends_df = all_legends_df[has_wins_data]

# Show the legends that remain, to verify the exclusion
print(filtered_legends_df['legend_name'].unique())


# The Kruskal-Wallis test is a non-parametric test used to determine whether there are statistically significant differences between the medians of three or more independent groups. It is the non-parametric counterpart of the one-way ANOVA test.

In [None]:
from scipy import stats

# Split the filtered data by category once; both tests below reuse these frames
category_frames = [frame for _, frame in filtered_legends_df.groupby('Legend_Category')]

# Kills per Match: one array of non-missing values per category, then the test
groups_kpm_filtered = [frame['Kills_per_Match'].dropna().values for frame in category_frames]
stat_kpm_filtered, p_kpm_filtered = stats.kruskal(*groups_kpm_filtered)
print(f"Kruskal-Wallis Test for Kills per Match: Stat={stat_kpm_filtered}, p-value={p_kpm_filtered}")

# Kills per Win: same procedure on the second metric
groups_kpw_filtered = [frame['Kills_per_Win'].dropna().values for frame in category_frames]
stat_kpw_filtered, p_kpw_filtered = stats.kruskal(*groups_kpw_filtered)
print(f"Kruskal-Wallis Test for Kills per Win: Stat={stat_kpw_filtered}, p-value={p_kpw_filtered}")


# Dunn's test is a post-hoc test used to determine which specific pairs of groups differ significantly from each other. It is applied after the Kruskal-Wallis test.

In [None]:
import scikit_posthocs as sp

# Options shared by both Dunn's tests: group by category, Bonferroni-adjusted p-values
dunn_options = dict(group_col='Legend_Category', p_adjust='bonferroni')

# Pairwise comparison of categories on Kills per Match
dunn_kpm_filtered = sp.posthoc_dunn(filtered_legends_df, val_col='Kills_per_Match', **dunn_options)
print("Dunn's Test for Kills per Match:")
print(dunn_kpm_filtered)

# Pairwise comparison of categories on Kills per Win
dunn_kpw_filtered = sp.posthoc_dunn(filtered_legends_df, val_col='Kills_per_Win', **dunn_options)
print("\nDunn's Test for Kills per Win:")
print(dunn_kpw_filtered)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean and standard deviation of kills per match for each legend category
averaged_stats_df_filtered = filtered_legends_df.groupby('Legend_Category').agg(
    Average_Kills_per_Match=('Kills_per_Match', 'mean'),
    Std_Kills_per_Match=('Kills_per_Match', 'std')
).reset_index()

# The frame above holds a single pre-computed mean per category, so seaborn has
# no spread to estimate and barplot(..., ci='sd') draws no error bars (the `ci`
# argument is also deprecated in recent seaborn releases). Draw the bars with
# matplotlib instead and pass the computed standard deviations through `yerr`.
bar_colors = sns.color_palette('Oranges_d', n_colors=len(averaged_stats_df_filtered))
# The standard deviation is NaN for a category with a single row; show no bar there.
bar_errors = averaged_stats_df_filtered['Std_Kills_per_Match'].fillna(0).values

plt.figure(figsize=(10, 6))
plt.bar(averaged_stats_df_filtered['Legend_Category'].values,
        averaged_stats_df_filtered['Average_Kills_per_Match'].values,
        yerr=bar_errors, capsize=5, color=bar_colors)
plt.title('Average Kills per Match by Legend Category')
plt.xlabel('Legend Category')
plt.ylabel('Average Kills per Match')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot: spread of Kills per Match inside each legend category
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Legend_Category', y='Kills_per_Match', data=filtered_legends_df, palette='Set3', ax=ax)
ax.set_title('Distribution of Kills per Match by Legend Category')
ax.set_xlabel('Legend Category')
ax.set_ylabel('Kills per Match')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Violin plot: spread of Kills per Win, with the quartiles drawn inside each violin
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(x='Legend_Category', y='Kills_per_Win', data=filtered_legends_df, palette='Set2', inner='quartile', ax=ax)
ax.set_title('Distribution of Kills per Win by Legend Category')
ax.set_xlabel('Legend Category')
ax.set_ylabel('Kills per Win')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Heatmaps of the Dunn's test p-value tables: Kills per Match first, then Kills per Win
for dunn_table, heatmap_title in (
    (dunn_kpm_filtered, "Dunn's Test p-values for Kills per Match"),
    (dunn_kpw_filtered, "Dunn's Test p-values for Kills per Win"),
):
    plt.figure(figsize=(10, 8))
    sns.heatmap(dunn_table, annot=True, cmap='Reds', fmt=".3f", linewidths=.5)
    plt.title(heatmap_title)
    plt.show()

# Scatter Plot with Clusters
# No cell visible above creates a 'Cluster' column, so hue='Cluster' fails on a
# fresh kernel. Use the column when an earlier step has added it; otherwise
# colour the points by legend category so the figure can still be drawn.
if 'Cluster' in filtered_legends_df.columns:
    hue_column = 'Cluster'
    hue_label = 'Cluster'
    scatter_title = 'Clusters of Legends Based on Kills per Match and Kills per Win'
else:
    hue_column = 'Legend_Category'
    hue_label = 'Legend Category'
    scatter_title = 'Legends by Category: Kills per Match vs. Kills per Win'

plt.figure(figsize=(12, 8))
sns.scatterplot(x='Kills_per_Match', y='Kills_per_Win', hue=hue_column, data=filtered_legends_df, palette='Set1', s=100)
plt.title(scatter_title)
plt.xlabel('Kills per Match')
plt.ylabel('Kills per Win')
plt.legend(title=hue_label)
plt.tight_layout()
plt.show()


# Cliff's Delta for non-parametric data: a measure of effect size that quantifies the magnitude of the difference between two groups. It is computed after the Kruskal-Wallis test.

In [None]:
import pandas as pd
from itertools import combinations

def cliffs_delta(lst1, lst2):
    """Return Cliff's delta, a rank-based effect size, for two samples.

    delta = (#pairs with a > b  -  #pairs with a < b) / (len(lst1) * len(lst2)),
    taken over every pair (a, b) with a from ``lst1`` and b from ``lst2``.
    The value lies in [-1, 1]; 0 means neither sample tends to be larger.

    Args:
        lst1: First sample (any sequence or array of numbers).
        lst2: Second sample (any sequence or array of numbers).

    Returns:
        The delta as a float, or NaN when either sample has no non-missing values.
    """
    from bisect import bisect_left, bisect_right  # only needed inside this helper

    # Missing values carry no rank information, so they are dropped first.
    first = pd.Series(lst1, dtype='float64').dropna().tolist()
    second = sorted(pd.Series(lst2, dtype='float64').dropna().tolist())
    if not first or not second:
        return float('nan')

    # With `second` sorted, binary search counts the smaller/larger partners of
    # each value without comparing every pair explicitly.
    greater = sum(bisect_left(second, value) for value in first)
    smaller = sum(len(second) - bisect_right(second, value) for value in first)
    return (greater - smaller) / (len(first) * len(second))


# Categories present in the filtered data. A separate name is used so that the
# legend_categories dictionary defined at the top of the notebook is not overwritten.
category_names = filtered_legends_df['Legend_Category'].unique()

# Initialize a list to store each row as a dictionary
delta_results_kpm_list = []

for group1, group2 in combinations(category_names, 2):
    lst1 = filtered_legends_df[filtered_legends_df['Legend_Category'] == group1]['Kills_per_Match']
    lst2 = filtered_legends_df[filtered_legends_df['Legend_Category'] == group2]['Kills_per_Match']
    delta = cliffs_delta(lst1.values, lst2.values)

    # Append the result as a dictionary
    delta_results_kpm_list.append({
        'Group1': group1,
        'Group2': group2,
        'Cliffs_Delta': delta
    })

# Create the DataFrame once after the loop
delta_results_kpm = pd.DataFrame(delta_results_kpm_list)

print("Cliff's Delta for Kills per Match:")
print(delta_results_kpm)


In [None]:
# Take the category list from the data inside this cell, so the loop does not
# depend on what the name `legend_categories` happens to hold when it runs.
category_names = filtered_legends_df['Legend_Category'].unique()

delta_results_kpw_list = []
# Iterate over all unique pairs of legend categories
for group1, group2 in combinations(category_names, 2):
    # Extract the 'Kills_per_Win' data for each group
    lst1 = filtered_legends_df[filtered_legends_df['Legend_Category'] == group1]['Kills_per_Win']
    lst2 = filtered_legends_df[filtered_legends_df['Legend_Category'] == group2]['Kills_per_Win']

    # Calculate Cliff's Delta
    delta = cliffs_delta(lst1.values, lst2.values)

    # Append the result as a dictionary to the list
    delta_results_kpw_list.append({
        'Group1': group1,
        'Group2': group2,
        'Cliffs_Delta': delta
    })

# Create the DataFrame once after the loop
delta_results_kpw = pd.DataFrame(delta_results_kpw_list)

print("\nCliff's Delta for Kills per Win:")
print(delta_results_kpw)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plots of the per-category distributions: Kills per Match, then Kills per Win
for metric_column, metric_label, palette_name in (
    ('Kills_per_Match', 'Kills per Match', 'Set3'),
    ('Kills_per_Win', 'Kills per Win', 'Set2'),
):
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Legend_Category', y=metric_column, data=filtered_legends_df, palette=palette_name)
    plt.title(f'Distribution of {metric_label} by Legend Category')
    plt.xlabel('Legend Category')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Pairwise correlations between the three performance metrics, shown as a heatmap
plt.figure(figsize=(8, 6))
performance_metrics = ['Kills_per_Match', 'Damage_per_Match', 'Kills_per_Win']
metric_correlations = filtered_legends_df[performance_metrics].corr()
sns.heatmap(metric_correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Performance Metrics')
plt.show()


In [None]:
# Scatter of Kills per Match against Damage per Match, coloured by legend category
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Kills_per_Match', y='Damage_per_Match', hue='Legend_Category',
                data=filtered_legends_df, palette='Set1', ax=ax)
ax.set_title('Kills per Match vs. Damage per Match by Legend Category')
ax.set_xlabel('Kills per Match')
ax.set_ylabel('Damage per Match')
ax.legend(title='Legend Category')
plt.tight_layout()
plt.show()


# ANOVA (Analysis of Variance)

# Post-hoc Tests (Tukey’s HSD)

In [None]:
# NOTE(review): pairwise_tukeyhsd is not imported in any cell visible above;
# presumably it comes from statsmodels.stats.multicomp — confirm and add the import.

# Tukey's HSD for Kills per Match
# Incomplete rows are dropped from the metric and the group labels together.
# Calling .dropna() on each column separately can leave the two with different
# lengths, so values would no longer line up with their categories.
kpm_rows = all_legends_df[['Kills_per_Match', 'Legend_Category']].dropna()
tukey_kpm = pairwise_tukeyhsd(endog=kpm_rows['Kills_per_Match'],
                              groups=kpm_rows['Legend_Category'],
                              alpha=0.05)
print(tukey_kpm)

# Tukey's HSD for Kills per Win (same joint filtering)
kpw_rows = all_legends_df[['Kills_per_Win', 'Legend_Category']].dropna()
tukey_kpw = pairwise_tukeyhsd(endog=kpw_rows['Kills_per_Win'],
                              groups=kpw_rows['Legend_Category'],
                              alpha=0.05)
print(tukey_kpw)


# Assumption Checks:

# Normality:

# Use Shapiro-Wilk test or Q-Q plots to assess normality

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk normality test on the non-missing Kills per Match values
kpm_values = all_legends_df['Kills_per_Match'].dropna()
stat_kpm, p_kpm = shapiro(kpm_values)
print(f"Shapiro-Wilk Test for Kills per Match: Stat={stat_kpm}, p-value={p_kpm}")

# Same test on the non-missing Kills per Win values
kpw_values = all_legends_df['Kills_per_Win'].dropna()
stat_kpw, p_kpw = shapiro(kpw_values)
print(f"Shapiro-Wilk Test for Kills per Win: Stat={stat_kpw}, p-value={p_kpw}")


# Levene’s Test for Homogeneity of Variances:

In [None]:
from scipy.stats import levene

# Build the per-category samples inside this cell so the tests below do not
# depend on variables left over from other cells: one array of non-missing
# values per legend category, for each metric.
groups_kpm = [frame['Kills_per_Match'].dropna().values
              for _, frame in all_legends_df.groupby('Legend_Category')]
groups_kpw = [frame['Kills_per_Win'].dropna().values
              for _, frame in all_legends_df.groupby('Legend_Category')]

# Levene’s Test for Kills per Match
stat_levene_kpm, p_levene_kpm = levene(*groups_kpm)
print(f"Levene’s Test for Kills per Match: Stat={stat_levene_kpm}, p-value={p_levene_kpm}")

# Levene’s Test for Kills per Win
stat_levene_kpw, p_levene_kpw = levene(*groups_kpw)
print(f"Levene’s Test for Kills per Win: Stat={stat_levene_kpw}, p-value={p_levene_kpw}")


# Handling Violations

In [None]:
from scipy.stats import kruskal

# Build the per-category samples inside this cell so the tests below do not
# depend on variables left over from other cells: one array of non-missing
# values per legend category, for each metric.
groups_kpm = [frame['Kills_per_Match'].dropna().values
              for _, frame in all_legends_df.groupby('Legend_Category')]
groups_kpw = [frame['Kills_per_Win'].dropna().values
              for _, frame in all_legends_df.groupby('Legend_Category')]

# Kruskal-Wallis Test for Kills per Match
stat_kw_kpm, p_kw_kpm = kruskal(*groups_kpm)
print(f"Kruskal-Wallis Test for Kills per Match: Stat={stat_kw_kpm}, p-value={p_kw_kpm}")

# Kruskal-Wallis Test for Kills per Win
stat_kw_kpw, p_kw_kpw = kruskal(*groups_kpw)
print(f"Kruskal-Wallis Test for Kills per Win: Stat={stat_kw_kpw}, p-value={p_kw_kpw}")


# Stats per Legend