# Data Preprocessing

In [2]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
import lightgbm as lgb
import numpy as np
import pandas as pd
import warnings
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Data Import

In [3]:
Career_Stats_df = pd.read_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/CSV_files/Career_Stats.csv', delimiter=',')

# Preprocessing

In [4]:
Career_Stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   player_name     1949 non-null   object
 1   career_kills    1949 non-null   object
 2   career_wins     1949 non-null   object
 3   career_revives  1949 non-null   object
dtypes: object(4)
memory usage: 61.0+ KB


# Converting Objects into Numerical Values

In [27]:
# Convert the relevant columns to numeric
Career_Stats_df['career_kills'] = pd.to_numeric(Career_Stats_df['career_kills'], errors='coerce')
Career_Stats_df['career_wins'] = pd.to_numeric(Career_Stats_df['career_wins'], errors='coerce')
Career_Stats_df['career_revives'] = pd.to_numeric(Career_Stats_df['career_revives'], errors='coerce')
#Randomize data
Career_Stats_df = Career_Stats_df.sample(frac=1).reset_index(drop=True)
# Print the data types to verify the changes
Career_Stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   player_name     1949 non-null   object 
 1   career_kills    1904 non-null   float64
 2   career_wins     973 non-null    float64
 3   career_revives  896 non-null    float64
dtypes: float64(3), object(1)
memory usage: 61.0+ KB


In [6]:
df_report = ProfileReport(Career_Stats_df, minimal=True)
df_report.to_file(output_file='Career_Stats.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Drop Unnecessary Columns

In [28]:
# Replace 0s with NaN to treat them as missing values
Career_Stats_df.replace(0, np.nan, inplace=True)

# Drop the player_name column
player_names = Career_Stats_df['player_name']
numeric_df = Career_Stats_df.drop(columns=['player_name'])

# Data Imputation(filling in missing values using different techniques)

# Auto Encoder

In [23]:
# Fill initial missing values with column mean for training
numeric_df_imputed = numeric_df.fillna(numeric_df.mean())

# Define an autoencoder model
def build_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(64, activation='relu')(input_layer)
    encoded = Dense(32, activation='relu')(encoded)
    encoded = Dense(16, activation='relu')(encoded)
    decoded = Dense(32, activation='relu')(encoded)
    decoded = Dense(64, activation='relu')(decoded)
    output_layer = Dense(input_dim, activation='linear')(decoded)
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    return autoencoder

# Prepare the data for the autoencoder
X_train_ae = numeric_df_imputed.values

# Build and train the autoencoder
input_dim = X_train_ae.shape[1]
autoencoder = build_autoencoder(input_dim)
autoencoder.fit(X_train_ae, X_train_ae, epochs=100, batch_size=32, validation_split=0.2, verbose=2)

# Use the trained autoencoder to fill missing values
encoded_data = autoencoder.predict(X_train_ae)

# Create the imputed dataframe
df_autoencoder_imputed = pd.DataFrame(encoded_data, columns=numeric_df.columns)
df_autoencoder_imputed.insert(0, 'player_name', player_names)

# Save the imputed dataframe to a CSV file
df_autoencoder_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_autoencoder_imputed.csv', index=False)

print("Imputed dataset saved to 'Data_Retrieval/CSV_files/Career_Stats_autoencoder_imputed.csv'")

Epoch 1/100
49/49 - 2s - 35ms/step - loss: 2140393344.0000 - val_loss: 96302120.0000
Epoch 2/100
49/49 - 0s - 2ms/step - loss: 29485300.0000 - val_loss: 6191005.5000
Epoch 3/100
49/49 - 0s - 2ms/step - loss: 8587297.0000 - val_loss: 8043790.5000
Epoch 4/100
49/49 - 0s - 2ms/step - loss: 7987022.5000 - val_loss: 6086750.5000
Epoch 5/100
49/49 - 0s - 2ms/step - loss: 7434284.0000 - val_loss: 6652602.0000
Epoch 6/100
49/49 - 0s - 2ms/step - loss: 6542262.5000 - val_loss: 5193589.5000
Epoch 7/100
49/49 - 0s - 2ms/step - loss: 5649806.5000 - val_loss: 4374761.0000
Epoch 8/100
49/49 - 0s - 2ms/step - loss: 3770512.5000 - val_loss: 991496.3125
Epoch 9/100
49/49 - 0s - 2ms/step - loss: 1071908.3750 - val_loss: 582182.6875
Epoch 10/100
49/49 - 0s - 2ms/step - loss: 852976.1250 - val_loss: 585525.9375
Epoch 11/100
49/49 - 0s - 2ms/step - loss: 793351.2500 - val_loss: 581620.0000
Epoch 12/100
49/49 - 0s - 2ms/step - loss: 755521.6875 - val_loss: 531897.3750
Epoch 13/100
49/49 - 0s - 2ms/step - lo

In [29]:
# Suppress iteration warnings for logistic regression convergence
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

print("Starting imputation processes...")

# Mode Imputation
print("Performing Mode Imputation...")
mode_imputer = SimpleImputer(strategy="most_frequent")
df_mode_imputed = pd.DataFrame(mode_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_mode_imputed.insert(0, 'player_name', player_names)
print("Mode Imputation completed.")

# Logistic Regression Imputation
print("Performing Logistic Regression Imputation...")
lr_imputer = IterativeImputer(estimator=LogisticRegression(solver='lbfgs'), random_state=42, max_iter=10, verbose=2)
df_lr_imputed = pd.DataFrame(lr_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_lr_imputed.insert(0, 'player_name', player_names)
print("Logistic Regression Imputation completed.")

# Random Forest Imputation
print("Performing Random Forest Imputation...")
rf_imputer = IterativeImputer(estimator=RandomForestClassifier(n_estimators=10), random_state=42, max_iter=10, verbose=2)
df_rf_imputed = pd.DataFrame(rf_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_rf_imputed.insert(0, 'player_name', player_names)
print("Random Forest Imputation completed.")

# KNN Imputation
print("Performing KNN Imputation...")
knn_imputer = KNNImputer(n_neighbors=5)
df_knn_imputed = pd.DataFrame(knn_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_knn_imputed.insert(0, 'player_name', player_names)
print("KNN Imputation completed.")

# Mean Imputation
print("Performing Mean Imputation...")
mean_imputer = SimpleImputer(strategy='mean')
df_mean_imputed = pd.DataFrame(mean_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_mean_imputed.insert(0, 'player_name', player_names)
print("Mean Imputation completed.")

# Median Imputation
print("Performing Median Imputation...")
median_imputer = SimpleImputer(strategy='median')
df_median_imputed = pd.DataFrame(median_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_median_imputed.insert(0, 'player_name', player_names)
print("Median Imputation completed.")

# XGBoost Imputation
print("Performing XGBoost Imputation...")
xgb_imputer = IterativeImputer(estimator=XGBRegressor(), random_state=42, max_iter=10, verbose=2)
df_xgb_imputed = pd.DataFrame(xgb_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_xgb_imputed.insert(0, 'player_name', player_names)
print("XGBoost Imputation completed.")

# MICE Imputation
print("Performing MICE Imputation...")
mice_imputer = IterativeImputer(max_iter=10, random_state=42)
df_mice_imputed = pd.DataFrame(mice_imputer.fit_transform(numeric_df), columns=numeric_df.columns)
df_mice_imputed.insert(0, 'player_name', player_names)
print("MICE Imputation completed.")

# Dictionary of imputed dataframes
imputed_dataframes = {
    "Mode Imputation": df_mode_imputed,
    "Logistic Regression Imputation": df_lr_imputed,
    "Random Forest Imputation": df_rf_imputed,
    "KNN Imputation": df_knn_imputed,
    "Mean Imputation": df_mean_imputed,
    "Median Imputation": df_median_imputed,
    "XGBoost Imputation": df_xgb_imputed,
    "MICE Imputation": df_mice_imputed
}

# Check that there are no missing values
print("\nVerifying no missing values after imputation:")
for name, df in imputed_dataframes.items():
    missing = df.isnull().sum().sum()
    print(f"{name}: {missing} missing values")

print("\nOriginal dataframe missing values (should be unchanged):")
print(Career_Stats_df.isnull().sum())

# Optionally save the imputed dataframes to CSV files
df_mode_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_mode_imputed.csv', index=False)
df_lr_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_lr_imputed.csv', index=False)
df_rf_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_rf_imputed.csv', index=False)
df_knn_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_knn_imputed.csv', index=False)
df_mean_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_mean_imputed.csv', index=False)
df_median_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_median_imputed.csv', index=False)
df_xgb_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_xgb_imputed.csv', index=False)
df_mice_imputed.to_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_mice_imputed.csv', index=False)

Starting imputation processes...
Performing Mode Imputation...
Mode Imputation completed.
Performing Logistic Regression Imputation...
[IterativeImputer] Completing matrix with shape (1949, 3)
[IterativeImputer] Ending imputation round 1/10, elapsed time 11.60
[IterativeImputer] Change: 51138.98274505395, scaled tolerance: 371.099 
[IterativeImputer] Ending imputation round 2/10, elapsed time 24.02
[IterativeImputer] Change: 50193.0, scaled tolerance: 371.099 
[IterativeImputer] Ending imputation round 3/10, elapsed time 36.15
[IterativeImputer] Change: 49867.0, scaled tolerance: 371.099 
[IterativeImputer] Ending imputation round 4/10, elapsed time 49.74
[IterativeImputer] Change: 54446.0, scaled tolerance: 371.099 
[IterativeImputer] Ending imputation round 5/10, elapsed time 61.86
[IterativeImputer] Change: 69801.0, scaled tolerance: 371.099 
[IterativeImputer] Ending imputation round 6/10, elapsed time 74.59
[IterativeImputer] Change: 57528.0, scaled tolerance: 371.099 
[IterativeI

# Training the model on each imputed dataset to see which dataset is the best and running for each stat as the target variable

In [30]:

# Define the regression models to be used
models = {
    "Random Forest": RandomForestRegressor(),
    "Extra Trees Regressor": ExtraTreesRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Hist Gradient Boosting": HistGradientBoostingRegressor(),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "LightGBM Regressor": lgb.LGBMRegressor(),
    "XGBoost Regressor": XGBRegressor()
}

# Function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

# Function to run model prediction
def run_model_prediction(target_column, df):
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    model_scores = fit_and_score(models=models, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    return model_scores

# Target columns to predict
target_columns = ['career_kills', 'career_wins', 'career_revives']

# Dictionary to store results for each imputation method and each target column
all_results = {}

# Running models on each imputed dataset for each target column
for imputation_method, imputed_df in imputed_dataframes.items():
    print(f"\nRunning models on {imputation_method} data:")
    # Drop the player_name column
    imputed_df = imputed_df.drop(columns=['player_name'])
    
    imputation_results = {}
    for target_column in target_columns:
        print(f"  Predicting {target_column}...")
        model_scores = run_model_prediction(target_column, imputed_df)
        imputation_results[target_column] = model_scores
    all_results[imputation_method] = imputation_results

# Results
for imputation_method, imputation_results in all_results.items():
    print(f"\nResults for {imputation_method}:")
    for target_column, model_scores in imputation_results.items():
        print(f"  Target: {target_column}")
        for model_name, score in model_scores.items():
            print(f"    {model_name}: {score:.4f}")

# Find the best model and imputation method for each target
best_combinations = {}
for target_column in target_columns:
    best_score = float('-inf')  # Initialize to the lowest possible score
    best_model = ""
    best_imputation = ""
    for imputation_method, imputation_results in all_results.items():
        for model_name, score in imputation_results[target_column].items():
            if score > best_score:
                best_score = score
                best_model = model_name
                best_imputation = imputation_method
    best_combinations[target_column] = (best_imputation, best_model, best_score)

print("\nBest Combinations:")
for target, (imputation, model, score) in best_combinations.items():
    print(f"{target}: {imputation} with {model} (Score: {score:.4f})")



Running models on Mode Imputation data:
  Predicting career_kills...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1559, number of used features: 2
[LightGBM] [Info] Start training from score 95757.116100
  Predicting career_wins...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1559, number of used features: 2
[LightGBM] [Info] Start training from score 5735.182168
  Predicting career_revives...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

In [11]:
print("Career Kills Distribution:")
print(Career_Stats_df['career_kills'].describe())
print("\nCareer Wins Distribution:")
print(Career_Stats_df['career_wins'].describe())
print("\nCareer Revives Distribution:")
print(Career_Stats_df['career_revives'].describe())


Career Kills Distribution:
count      1904.000000
mean      96842.709559
std       29643.591753
min        5374.000000
25%       78829.000000
50%       89994.000000
75%      107428.000000
max      371099.000000
Name: career_kills, dtype: float64

Career Wins Distribution:
count      973.00000
mean      6507.21480
std       2563.48393
min         52.00000
25%       4721.00000
50%       6099.00000
75%       7716.00000
max      21010.00000
Name: career_wins, dtype: float64

Career Revives Distribution:
count      896.000000
mean     13735.802455
std       5901.073875
min       3288.000000
25%      10250.000000
50%      12159.500000
75%      15508.250000
max      67615.000000
Name: career_revives, dtype: float64


# Tuning Existing Imputation Methods

# Tuning KNN Imputer

In [24]:
from sklearn.impute import KNNImputer

# Tune the number of neighbors
knn_imputer_5 = KNNImputer(n_neighbors=5)
df_knn_imputed_5 = pd.DataFrame(knn_imputer_5.fit_transform(numeric_df), columns=numeric_df.columns)
df_knn_imputed_5.insert(0, 'player_name', player_names)

knn_imputer_10 = KNNImputer(n_neighbors=10)
df_knn_imputed_10 = pd.DataFrame(knn_imputer_10.fit_transform(numeric_df), columns=numeric_df.columns)
df_knn_imputed_10.insert(0, 'player_name', player_names)

knn_imputer_15 = KNNImputer(n_neighbors=15)
df_knn_imputed_15 = pd.DataFrame(knn_imputer_15.fit_transform(numeric_df), columns=numeric_df.columns)
df_knn_imputed_15.insert(0, 'player_name', player_names)


# Deep Learning-Based Imputation
Using Autoencoders for Imputation

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Define an autoencoder model
def build_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(64, activation='relu')(input_layer)
    encoded = Dense(32, activation='relu')(encoded)
    encoded = Dense(16, activation='relu')(encoded)
    decoded = Dense(32, activation='relu')(encoded)
    decoded = Dense(64, activation='relu')(decoded)
    output_layer = Dense(input_dim, activation='linear')(decoded)
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    return autoencoder

# Prepare the data for the autoencoder
numeric_df_imputed = numeric_df.fillna(numeric_df.mean())  # Initial imputation to handle NaNs
X_train_ae = numeric_df_imputed.values

# Build and train the autoencoder
input_dim = X_train_ae.shape[1]
autoencoder = build_autoencoder(input_dim)
autoencoder.fit(X_train_ae, X_train_ae, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

# Use the trained autoencoder to fill missing values
encoded_data = autoencoder.predict(X_train_ae)
df_autoencoder_imputed = pd.DataFrame(encoded_data, columns=numeric_df.columns)
df_autoencoder_imputed.insert(0, 'player_name', player_names)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


# Cross-Validation for Imputation Evaluation
Cross-Validation Setup

In [26]:
from sklearn.model_selection import KFold

def cross_validate_imputation(imputed_df, target_column, models, n_splits=5):
    X = imputed_df.drop(columns=[target_column])
    y = imputed_df[target_column]
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = {name: [] for name in models.keys()}
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        for name, model in models.items():
            model.fit(X_train, y_train)
            score = model.score(X_test, y_test)
            cv_scores[name].append(score)
    
    # Average scores for each model
    avg_scores = {name: np.mean(scores) for name, scores in cv_scores.items()}
    return avg_scores

# Evaluate imputed datasets
imputed_dataframes = {
    "Mode Imputation": df_mode_imputed,
    "Logistic Regression Imputation": df_lr_imputed,
    "Random Forest Imputation": df_rf_imputed,
    "KNN Imputation (5 neighbors)": df_knn_imputed_5,
    "KNN Imputation (10 neighbors)": df_knn_imputed_10,
    "KNN Imputation (15 neighbors)": df_knn_imputed_15,
    "Mean Imputation": df_mean_imputed,
    "Median Imputation": df_median_imputed,
    "XGBoost Imputation": df_xgb_imputed,
    "MICE Imputation": df_mice_imputed,
    "Autoencoder Imputation": df_autoencoder_imputed
}

# Evaluate using cross-validation
all_cv_results = {}
for imputation_method, imputed_df in imputed_dataframes.items():
    print(f"\nCross-validating {imputation_method} data...")
    imputed_df = imputed_df.drop(columns=['player_name'])
    cv_results = {}
    for target_column in target_columns:
        print(f"  Predicting {target_column}...")
        cv_scores = cross_validate_imputation(imputed_df, target_column, models)
        cv_results[target_column] = cv_scores
    all_cv_results[imputation_method] = cv_results

# Print cross-validation results
for imputation_method, cv_results in all_cv_results.items():
    print(f"\nCross-validation results for {imputation_method}:")
    for target_column, model_scores in cv_results.items():
        print(f"  Target: {target_column}")
        for model_name, score in model_scores.items():
            print(f"    {model_name}: {score:.4f}")

# Find the best imputation method based on cross-validation
best_imputation_methods = {}
for target_column in target_columns:
    best_score = float('-inf')
    best_method = ""
    best_model = ""
    for imputation_method, cv_results in all_cv_results.items():
        for model_name, score in cv_results[target_column].items():
            if score > best_score:
                best_score = score
                best_method = imputation_method
                best_model = model_name
    best_imputation_methods[target_column] = (best_method, best_model, best_score)

print("\nBest imputation methods based on cross-validation:")
for target, (method, model, score) in best_imputation_methods.items():
    print(f"{target}: {method} with {model} (Score: {score:.4f})")



Cross-validating Mode Imputation data...
  Predicting career_kills...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1559, number of used features: 2
[LightGBM] [Info] Start training from score 96214.240539
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1559, number of used features: 2
[LightGBM] [Info] Start training from score 96511.686337
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the t

In [4]:
from pandas_profiling import ProfileReport

Career_Stats_LR_Imputed_Data = pd.read_csv('C:/Users/altaa/Documents/GitHub/Apex-Legends-Research/Data_Retrieval/py_files/Notebook_For_Data_Preprocessing/Career_Stats_lr_imputed.csv', delimiter=',')
df_report = ProfileReport(Career_Stats_LR_Imputed_Data, minimal=True)
df_report.to_file(output_file='Career_Stats_For_LR_Imputed_Data.html')

  from pandas_profiling import ProfileReport


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]