In [8]:
# Import necessary libraries
import os
import warnings

import numpy as np
import pandas as pd
# The experimental flag has to be imported BEFORE IterativeImputer is pulled
# from sklearn.impute; otherwise that import raises ImportError on a fresh kernel.
from sklearn.experimental import enable_iterative_imputer  # Enable IterativeImputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.impute import IterativeImputer  # Import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor

# Silence every UserWarning and ConvergenceWarning for the rest of the session
# (the iterative imputers below run with small iteration budgets).
for ignored_category in (UserWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Define paths to the imputed datasets.
# The shared root is spelled out once so relocating the project needs one edit.
IMPUTED_DATASETS_DIR = r'C:\Users\altaa\Documents\GitHub\Apex-Legends-Research\Data_Retrieval\CSV_files\Stats Per Legend\Data Imputation\Imputated Datasets'
imputed_dataset_paths = {
    "PC": rf'{IMPUTED_DATASETS_DIR}\PC',
    "PS4": rf'{IMPUTED_DATASETS_DIR}\PS',  # the output folder is named "PS", not "PS4"
    "Xbox": rf'{IMPUTED_DATASETS_DIR}\Xbox',
}

# Load the original CSV files into DataFrames (before imputation).
# Each platform's file is Legend_Stats_<platform>_Filtered.csv inside one
# shared directory, so the directory is named once and the dict is built from
# the platform keys.
FILTERED_DATASETS_DIR = r'C:\Users\altaa\Documents\GitHub\Apex-Legends-Research\Data_Retrieval\CSV_files\Stats Per Legend\Filtered Datasets'
original_datasets = {
    platform: pd.read_csv(rf'{FILTERED_DATASETS_DIR}\Legend_Stats_{platform}_Filtered.csv')
    for platform in ("PC", "PS4", "Xbox")
}

# Build the output path with os.path.join so the separator is always valid
# for the running platform and no backslash escapes are involved.
def save_imputed_data(df, save_path, dataset_name, method):
    """Write an imputed DataFrame to a CSV file inside ``save_path``.

    The file is named ``<dataset_name>_<stem>.csv`` where ``<stem>`` is the
    part of ``method`` before its first ``.`` (so passing
    ``'Legend_Stats_knn_imputed.csv'`` does not produce a double extension).
    The row index is not written.

    Args:
        df: DataFrame to save.
        save_path: Destination directory; created if it does not exist.
        dataset_name: Prefix for the file name (e.g. ``"PC"``).
        method: Method file name or stem identifying the imputation strategy.
    """
    # to_csv does not create missing parent directories on its own.
    os.makedirs(save_path, exist_ok=True)
    file_name = f"{dataset_name}_{method.split('.')[0]}.csv"
    df.to_csv(os.path.join(save_path, file_name), index=False)

# Function to perform imputation
def perform_imputation(df, dataset_name, save_path):
    """Impute the numeric columns of ``df`` with eight strategies and save each result.

    The strategies run in this order: mode, logistic regression, random
    forest, KNN, mean, median, XGBoost and the default ``IterativeImputer``
    ("MICE"). Every strategy is fitted on the same un-imputed numeric columns.
    Each imputed table gets ``player_name`` re-attached as its first column
    and is written through ``save_imputed_data``.

    Args:
        df: DataFrame with a ``player_name`` column plus the numeric columns
            to impute. Other non-numeric columns are not carried into the
            saved files.
        dataset_name: Label used in progress messages and passed on to
            ``save_imputed_data``.
        save_path: Output directory passed on to ``save_imputed_data``.

    Raises:
        KeyError: If ``df`` has no ``player_name`` column.

    Note:
        The imputed array is re-labelled with all of ``numeric_df.columns``.
        If an imputer drops a column (scikit-learn documents this for columns
        containing only missing values), the DataFrame construction fails with
        a shape mismatch.
    """
    print(f"Starting imputation processes for {dataset_name} dataset...")

    # Separate player names (or other non-numeric columns) and numeric data for imputation
    player_names = df['player_name']
    numeric_df = df.select_dtypes(include=[np.number])

    # (label used in the progress message, output file name, imputer), in run order.
    # NOTE(review): the "lr" and "rf" entries plug *classifiers* into
    # IterativeImputer, so every distinct observed value is treated as a class
    # label. Confirm this is intended rather than a regressor.
    strategies = [
        ("Mode", 'Legend_Stats_mode_imputed.csv',
         SimpleImputer(strategy="most_frequent")),
        ("Logistic Regression", 'Legend_Stats_lr_imputed.csv',
         IterativeImputer(estimator=LogisticRegression(solver='lbfgs', tol=1e-4, max_iter=5), random_state=42)),
        ("Random Forest", 'Legend_Stats_rf_imputed.csv',
         IterativeImputer(estimator=RandomForestClassifier(n_estimators=10), random_state=42, max_iter=10)),
        ("KNN", 'Legend_Stats_knn_imputed.csv',
         KNNImputer(n_neighbors=5)),
        ("Mean", 'Legend_Stats_mean_imputed.csv',
         SimpleImputer(strategy='mean')),
        ("Median", 'Legend_Stats_median_imputed.csv',
         SimpleImputer(strategy='median')),
        ("XGBoost", 'Legend_Stats_xgb_imputed.csv',
         IterativeImputer(estimator=XGBRegressor(), random_state=42, max_iter=10)),
        ("MICE", 'Legend_Stats_mice_imputed.csv',
         IterativeImputer(max_iter=10, random_state=42)),
    ]

    # Insert the names by position. The imputed frame is built from a bare
    # array and therefore has a fresh 0..n-1 index, so inserting the Series
    # itself would align on index labels and misplace or blank out names
    # whenever df does not carry the default RangeIndex.
    name_values = player_names.to_numpy()

    for label, file_name, imputer in strategies:
        print(f"Performing {label} Imputation for {dataset_name}...")
        imputed_df = pd.DataFrame(imputer.fit_transform(numeric_df), columns=numeric_df.columns)
        imputed_df.insert(0, 'player_name', name_values)
        save_imputed_data(imputed_df, save_path, dataset_name, file_name)

    print(f"Imputation completed and files saved for {dataset_name} dataset.")

# Run the full imputation suite once per platform (PC, PS4, Xbox), pairing
# each platform's original DataFrame with its output directory.
for dataset_name in imputed_dataset_paths:
    dataset_path = imputed_dataset_paths[dataset_name]
    dataset_df = original_datasets[dataset_name]  # un-imputed source data
    perform_imputation(df=dataset_df, dataset_name=dataset_name, save_path=dataset_path)


Starting imputation processes for PC dataset...
Performing Mode Imputation for PC...
Performing Logistic Regression Imputation for PC...
Performing Random Forest Imputation for PC...
Performing KNN Imputation for PC...
Performing Mean Imputation for PC...
Performing Median Imputation for PC...
Performing XGBoost Imputation for PC...
Performing MICE Imputation for PC...
Imputation completed and files saved for PC dataset.
Starting imputation processes for PS4 dataset...
Performing Mode Imputation for PS4...
Performing Logistic Regression Imputation for PS4...
Performing Random Forest Imputation for PS4...


KeyboardInterrupt: 

In [None]:
# Import necessary libraries
import pandas as pd

# Define paths to the imputed datasets.
# The shared root is spelled out once so relocating the project needs one edit.
IMPUTED_DATASETS_DIR = r'C:\Users\altaa\Documents\GitHub\Apex-Legends-Research\Data_Retrieval\CSV_files\Stats Per Legend\Data Imputation\Imputated Datasets'
imputed_dataset_paths = {
    "PC": rf'{IMPUTED_DATASETS_DIR}\PC',
    "PS4": rf'{IMPUTED_DATASETS_DIR}\PS',  # the output folder is named "PS", not "PS4"
    "Xbox": rf'{IMPUTED_DATASETS_DIR}\Xbox',
}

# Imputation methods: file-name suffixes of the imputed CSVs to analyse.
# NOTE(review): 'Legend_Stats_mode_imputed.csv' is not listed; confirm that
# mode-imputed files are meant to be left out of the correlation analysis.
imputation_methods = [
    'Legend_Stats_lr_imputed.csv',
    'Legend_Stats_rf_imputed.csv',
    'Legend_Stats_knn_imputed.csv',
    'Legend_Stats_mean_imputed.csv',
    'Legend_Stats_median_imputed.csv',
    'Legend_Stats_xgb_imputed.csv',
    'Legend_Stats_mice_imputed.csv',
]

# Dictionary to store correlation results, keyed by dataset name
correlation_results = {}

# Function to compute correlation matrices for each imputed dataset
def compute_correlation(dataset_name, dataset_path):
    """Compute, save and print a correlation matrix for each imputed file of a dataset.

    For every entry in the module-level ``imputation_methods`` list, the file
    ``<dataset_name>_<method>`` is read from ``dataset_path``, the correlation
    matrix of its numeric columns is computed, and the matrix is written next
    to the input as ``<dataset_name>_<method stem>_correlation.csv``.

    Args:
        dataset_name: Prefix of the imputed file names (e.g. ``"PC"``).
        dataset_path: Directory that holds the imputed CSV files and receives
            the correlation CSV files.

    Returns:
        dict mapping each method file name to its correlation DataFrame.
    """
    # Local import: the import list visible for this cell only brings in pandas.
    import os

    print(f"Computing correlation matrices for {dataset_name} dataset...")

    # Dictionary to store the correlation results for this dataset
    dataset_correlations = {}

    for method in imputation_methods:
        # Load the imputed dataset. os.path.join replaces the former
        # f'...\{...}' literal, whose '\{' is an invalid escape sequence.
        imputed_file_path = os.path.join(dataset_path, f"{dataset_name}_{method}")
        imputed_df = pd.read_csv(imputed_file_path)

        # Correlate numeric columns only: text columns such as player_name
        # make DataFrame.corr() raise on pandas >= 2.0, where non-numeric
        # columns are no longer dropped silently.
        correlation_matrix = imputed_df.select_dtypes(include="number").corr()

        # Store the correlation matrix
        dataset_correlations[method] = correlation_matrix

        # Save the correlation matrix as a CSV file for future reference
        method_stem = method.split(".")[0]
        correlation_matrix.to_csv(
            os.path.join(dataset_path, f"{dataset_name}_{method_stem}_correlation.csv")
        )

        # Print the correlation matrix for verification; the third
        # underscore-separated token of the file name is the method tag.
        print(f"\nCorrelation matrix for {dataset_name} using {method.split('_')[2]} Imputation:")
        print(correlation_matrix)

    return dataset_correlations

# Build the correlation matrices platform by platform (PC, PS4, Xbox) and
# collect them under the platform's name.
for dataset_name in imputed_dataset_paths:
    dataset_path = imputed_dataset_paths[dataset_name]
    correlation_results[dataset_name] = compute_correlation(
        dataset_name=dataset_name,
        dataset_path=dataset_path,
    )

print("Correlation matrices have been computed and saved.")
