# Correlation Check Between Features and Targets

## 1- Initial Preprocessing

In [1]:
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../'))

from Models.LinearRegression import LinearRegression
from Utils.Preprocessor import Preprocessor
from Utils.Utils import root_mean_squared_error, train_test_split, initial_preprocessing

In [2]:
# Read the training data; the 'Id' column becomes the DataFrame index.
train = pd.read_csv('../Data/train.csv', index_col='Id')

In [3]:
# Remove unnecessary features based on exploratory data analysis part 1.
# NOTE(review): initial_preprocessing is imported from Utils.Utils and is not
# shown here - confirm exactly which columns it removes.
train = initial_preprocessing(train)

In [4]:
# Predictors: everything except the four agent-1 outcome columns.
X = train.drop(
    columns=[
        "num_wins_agent1",
        "num_draws_agent1",
        "num_losses_agent1",
        "utility_agent1",
    ]
)

# Targets: y is the utility, y1/y2/y3 are the win/draw/loss counts.
y, y1, y2, y3 = (
    train[target]
    for target in (
        "utility_agent1",
        "num_wins_agent1",
        "num_draws_agent1",
        "num_losses_agent1",
    )
)

In [5]:
# Preprocess the data
# NOTE(review): Preprocessor is a project class (Utils.Preprocessor); the flags
# presumably enable feature scaling and one-hot encoding - confirm there.
preprocessor = Preprocessor(normalize=True, one_hot_encode=True)

# Fit on the features and transform them in one step.
X = preprocessor.fit_transform(X)

# Convert back to pandas dataframe
# The column names are taken from the fitted preprocessor. The rebuilt frame
# gets a default integer index, so it no longer carries the 'Id' index of
# `train`.
X = pd.DataFrame(X, columns=preprocessor.get_column_names())

## 2- Correlation Check with Target

### 2.1- utility_agent1

In [7]:
def calculate_feature_correlations(X, y):
    """Compute the Pearson correlation of each non-categorical feature with a target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns of ``category`` dtype are dropped; every
        remaining column must be convertible to float (bool and integer
        columns are accepted).
    y : pandas.Series or array-like
        Target values, one per row of ``X``. Values are matched to rows by
        position, not by index label.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"Feature"`` and ``"Correlation"``, sorted by
        correlation in descending order. The index holds each feature's
        position among the retained (non-categorical) columns. A feature or
        target with zero variance yields a correlation of 0.

    Raises
    ------
    ValueError
        If the number of target values differs from the number of rows.
    """
    # Drop categorical columns
    X = X.drop(columns=X.select_dtypes(include=['category']).columns)

    # Store column names for building the result frame
    colnames = X.columns

    # Coerce to float arrays. A frame mixing bool and float columns would
    # otherwise become an object array, on which np.sqrt fails.
    features = X.to_numpy(dtype=float)
    target = np.asarray(y, dtype=float).ravel()

    if target.shape[0] != features.shape[0]:
        raise ValueError(
            f"X has {features.shape[0]} rows but y has {target.shape[0]} values"
        )

    # Centre once and reuse for both the norms and the cross term.
    centered_features = features - features.mean(axis=0)
    centered_target = target - target.mean()

    # Root sums of squares; the 1/n factors of covariance and standard
    # deviation cancel in the ratio, so they are omitted.
    feature_norms = np.sqrt(np.sum(centered_features ** 2, axis=0))
    target_norm = np.sqrt(np.sum(centered_target ** 2))

    numerator = np.sum(centered_features * centered_target.reshape(-1, 1), axis=0)
    denominator = feature_norms * target_norm

    # Where the denominator is zero (constant feature or constant target) the
    # correlation is undefined; report 0 instead of dividing by zero.
    feature_correlations = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    # Create a DataFrame with results
    correlation_df = pd.DataFrame({
        "Feature": colnames,
        "Correlation": feature_correlations
    }).sort_values(by="Correlation", ascending=False)

    return correlation_df

In [8]:
# Correlate every non-categorical feature with the utility_agent1 target.
correlation_utility_agent1 = calculate_feature_correlations(X, y)

In [9]:
# First ten rows of the descending-sorted result: the highest correlations.
correlation_utility_agent1.head(10)

Unnamed: 0,Feature,Correlation
387,AdvantageP1,0.442847
93,PiecesPlacedOutsideBoard,0.089204
286,Phase,0.079615
550,p1_exploration,0.076133
72,Hand,0.075179
73,NumContainers,0.063279
287,NumPlayPhase,0.06033
554,p1_selection_UCB1Tuned,0.059548
154,FromToDecision,0.059189
33,ThreeMensMorrisBoard,0.056606


In [10]:
# Last ten rows of the descending-sorted result: the lowest correlations.
correlation_utility_agent1.tail(10)

Unnamed: 0,Feature,Correlation
519,TaflComponent,-0.049401
12,Tiling,-0.050846
11,PolygonShape,-0.051112
526,ShowPieceState,-0.052241
555,p1_playout_NST,-0.052393
475,Conjunction,-0.052703
92,PiecesPlacedOnBoard,-0.054233
3,Shape,-0.054465
560,p2_selection_UCB1Tuned,-0.058114
551,p2_exploration,-0.076252


### 2.2- num_wins_agent1

In [11]:
# Correlate every non-categorical feature with the num_wins_agent1 target.
correlation_num_wins_agent1 = calculate_feature_correlations(X, y1)

In [12]:
# First ten rows of the descending-sorted result: the highest correlations.
correlation_num_wins_agent1.head(10)

Unnamed: 0,Feature,Correlation
387,AdvantageP1,0.389072
389,Completion,0.314977
110,SingleSiteMoves,0.110007
115,RemoveDecision,0.080307
116,RemoveDecisionFrequency,0.07211
111,AddDecision,0.069312
1,AsymmetricPiecesType,0.068292
112,AddDecisionFrequency,0.065406
52,NumCorners,0.062774
550,p1_exploration,0.060513


In [13]:
# Last ten rows of the descending-sorted result: the lowest correlations.
correlation_num_wins_agent1.tail(10)

Unnamed: 0,Feature,Correlation
388,Balance,-0.124447
99,NumStartComponents,-0.130388
374,DrawFrequency,-0.134934
376,DurationActions,-0.164809
392,OutcomeUniformity,-0.195809
382,GameTreeComplexity,-0.22614
377,DurationMoves,-0.243908
378,DurationTurns,-0.252856
391,Timeouts,-0.282967
390,Drawishness,-0.312753


### 2.3- num_draws_agent1

In [14]:
# Correlate every non-categorical feature with the num_draws_agent1 target.
correlation_num_draws_agent1 = calculate_feature_correlations(X, y2)

In [15]:
# First ten rows of the descending-sorted result: the highest correlations.
correlation_num_draws_agent1.head(10)

Unnamed: 0,Feature,Correlation
390,Drawishness,0.677952
391,Timeouts,0.61471
378,DurationTurns,0.555967
377,DurationMoves,0.532691
382,GameTreeComplexity,0.486117
392,OutcomeUniformity,0.462496
376,DurationActions,0.360513
374,DrawFrequency,0.314832
99,NumStartComponents,0.260798
388,Balance,0.225678


In [16]:
# Last ten rows of the descending-sorted result: the lowest correlations.
correlation_num_draws_agent1.tail(10)

Unnamed: 0,Feature,Correlation
476,Disjunction,-0.109217
241,Group,-0.109456
14,HexTiling,-0.109519
402,BoardSitesOccupiedMaxDecrease,-0.11218
115,RemoveDecision,-0.122399
381,DecisionMoves,-0.128625
112,AddDecisionFrequency,-0.147525
111,AddDecision,-0.157628
110,SingleSiteMoves,-0.214452
389,Completion,-0.676826


### 2.4- num_losses_agent1

In [17]:
# Correlate every non-categorical feature with the num_losses_agent1 target.
correlation_num_losses_agent1 = calculate_feature_correlations(X, y3)

In [18]:
# First ten rows of the descending-sorted result: the highest correlations.
correlation_num_losses_agent1.head(10)

Unnamed: 0,Feature,Correlation
389,Completion,0.273189
110,SingleSiteMoves,0.078352
193,RemoveEffectFrequency,0.073616
12,Tiling,0.071929
11,PolygonShape,0.071534
3,Shape,0.071314
111,AddDecision,0.066895
241,Group,0.064701
10,RegularShape,0.063672
547,Trigger,0.063236


In [19]:
# Last ten rows of the descending-sorted result: the lowest correlations.
correlation_num_losses_agent1.tail(10)

Unnamed: 0,Feature,Correlation
286,Phase,-0.117593
374,DrawFrequency,-0.139882
376,DurationActions,-0.145386
382,GameTreeComplexity,-0.197588
392,OutcomeUniformity,-0.209482
377,DurationMoves,-0.217724
378,DurationTurns,-0.230183
391,Timeouts,-0.250679
390,Drawishness,-0.276366
387,AdvantageP1,-0.395224
