# Mutual Information Check With Features and Target

## 1- Initial Preprocessing

In [1]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing

In [2]:
# Read the data
train = pd.read_csv('../Data/train.csv', index_col='Id')

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
train = initial_preprocessing(train)

In [4]:
X = train.drop(columns=["num_wins_agent1", "num_draws_agent1", "num_losses_agent1", "utility_agent1"], axis=1)
y = train["utility_agent1"]
y1 = train["num_wins_agent1"]
y2 = train["num_draws_agent1"]
y3 = train["num_losses_agent1"]

In [5]:
# Preprocess the data
preprocessor = Preprocessor(normalize=True, one_hot_encode=True)

X = preprocessor.fit_transform(X)

# Convert back to pandas dataframe
X = pd.DataFrame(X, columns=preprocessor.get_column_names())

## 2- Mutual Information Check with Target

### 2.1- utility_agent1

In [6]:
import numpy as np
import pandas as pd
from collections import Counter

def calculate_mutual_information(X, y):
    def entropy(values):
        """Calculate the entropy of a dataset."""
        # For continuous variables, use binning
        if values.dtype.kind in ['f', 'i']:
            values = pd.qcut(values, q=20, labels=False, duplicates='drop')
        
        total = len(values)
        counts = Counter(values)
        probabilities = np.array([count / total for count in counts.values()])
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))
    
    def conditional_entropy(feature, target):
        """Calculate the conditional entropy of target given feature."""
        if feature.dtype.kind in ['f', 'i']:
            feature = pd.qcut(feature, q=20, labels=False, duplicates='drop')
            
        total = len(feature)
        unique_values = np.unique(feature)
        cond_entropy = 0
        
        for value in unique_values:
            indices = np.where(feature == value)[0]
            subset = target[indices]
            prob = len(indices) / total
            cond_entropy += prob * entropy(subset)
            
        return cond_entropy
    
    # Input validation
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X must be a pandas DataFrame")
    if not isinstance(y, (pd.Series, np.ndarray)):
        raise TypeError("y must be a pandas Series or numpy array")
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of samples")
        
    # Drop categorical columns and store column names
    numeric_X = X.select_dtypes(include=['int64', 'float64'])
    colnames = numeric_X.columns
    
    # Convert to numpy arrays
    X_values = numeric_X.values
    y_values = y.values if isinstance(y, pd.Series) else y
    
    # Calculate mutual information for each feature
    mutual_info = []
    target_entropy = entropy(y_values)
    
    for i in range(X_values.shape[1]):
        feature = X_values[:, i]
        cond_entropy = conditional_entropy(feature, y_values)
        mi = target_entropy - cond_entropy
        mutual_info.append(mi)
    
    # Create a DataFrame with results
    mutual_info_df = pd.DataFrame({
        "Feature": colnames,
        "Mutual Information": mutual_info
    }).sort_values(by="Mutual Information", ascending=False)
    
    return mutual_info_df

In [7]:
mutual_info_utility_agent1 = calculate_mutual_information(X, y)

In [8]:
mutual_info_utility_agent1.head(10)

Unnamed: 0,Feature,Mutual Information
387,AdvantageP1,0.236968
388,Balance,0.229491
390,Drawishness,0.159779
389,Completion,0.135249
53,NumDirections,0.104664
377,DurationMoves,0.093722
62,NumVertices,0.091477
391,Timeouts,0.090111
392,OutcomeUniformity,0.087115
382,GameTreeComplexity,0.085692


### 2.1- num_wins_agent1

In [9]:
mutual_info_num_wins_agent1 = calculate_mutual_information(X, y1)

In [10]:
mutual_info_num_wins_agent1.head(10)

Unnamed: 0,Feature,Mutual Information
387,AdvantageP1,0.228046
388,Balance,0.204862
120,StepDecisionFrequency,0.181651
122,StepDecisionToEmptyFrequency,0.173052
391,Timeouts,0.157918
425,MoveDistanceMaximum,0.156529
49,NumPlayableSitesOnBoard,0.155699
435,PieceNumberMaximum,0.154638
549,MovesPerSecond,0.152408
62,NumVertices,0.146846


### 2.2- num_draws_agent1

In [11]:
mutual_info_num_draws_agent1 = calculate_mutual_information(X, y2)

In [12]:
mutual_info_num_draws_agent1.head(10)

Unnamed: 0,Feature,Mutual Information
390,Drawishness,0.254014
389,Completion,0.247993
378,DurationTurns,0.197171
377,DurationMoves,0.192472
388,Balance,0.178688
391,Timeouts,0.169072
382,GameTreeComplexity,0.133907
376,DurationActions,0.123147
392,OutcomeUniformity,0.121789
379,DurationTurnsStdDev,0.11889


### 2.3- num_losses_agent1

In [13]:
mutual_info_num_lossess_agent1 = calculate_mutual_information(X, y3)

In [14]:
mutual_info_num_lossess_agent1.head(10)

Unnamed: 0,Feature,Mutual Information
387,AdvantageP1,0.252817
388,Balance,0.220147
549,MovesPerSecond,0.17749
62,NumVertices,0.17344
74,NumPlayableSites,0.17124
53,NumDirections,0.160639
50,NumColumns,0.157141
435,PieceNumberMaximum,0.150564
49,NumPlayableSitesOnBoard,0.146034
392,OutcomeUniformity,0.140932
