# Testing for Collinearity with One Hot Encoding Classes
Andrew D'Amico  
MSDS 422, Northwestern University  
January 11, 2025

In [285]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

## Create dataset

In [286]:
def create_dataset (n=100, seed=10, classes=('Apples', 'Oranges', 'Bananas'), sparse=True, verbose=True):

    """
    Creates a toy dataset with two continuous variables and one categorical variable.

    Args:
        n (int): number of observations
        seed (int): seed of the random number generator, for reproducibility
        classes (sequence of str): classes to include in the categorical variable. The sequence is copied and never modified.
        sparse (bool): if True, an empty string ("") is added as an extra class so that some observations carry no class label.
        verbose (bool): if verbose, the data output will include a describe() call as well as header observations.

    Returns:
        a DataFrame containing the continuous variables Value_A and Value_B, and the categorical variable Class.

    """

    # Set the seed for reproducibility
    np.random.seed(seed)

    # Generate Value_A between 0 and 1
    value_a = np.random.rand(n)

    # Generate Value_B between -1 and 1
    value_b = np.random.uniform(-1, 1, n)

    # Work on a private copy: appending to the argument itself would leak the
    # extra "" class into the caller's list (or into the shared default) and
    # make repeated calls produce different class sets.
    labels = list(classes)
    if sparse:
        labels.append("")

    # Randomly choose classes for each row.
    weights = np.random.rand(len(labels)) # randomly determine distribution of classes
    weights /= weights.sum() # normalize the weights so that they sum to 1
    class_values = np.random.choice(labels, size=n, p=weights)

    # Combine into a pandas DataFrame
    df = pd.DataFrame({
        "Value_A": value_a,
        "Value_B": value_b,
        "Class": class_values
    })

    if verbose:
        print (df.describe())
        print ()
        print (df.head())

    return df

In [287]:
# Build the toy dataset with the function's defaults (n=100, seed=10, sparse=True, verbose=True).
data = create_dataset()

          Value_A     Value_B
count  100.000000  100.000000
mean     0.485349   -0.082480
std      0.276362    0.550355
min      0.003948   -0.970730
25%      0.295171   -0.634496
50%      0.496778   -0.046194
75%      0.716371    0.395260
max      0.987625    0.955799

    Value_A   Value_B    Class
0  0.771321  0.156273  Oranges
1  0.020752  0.707868   Apples
2  0.633648 -0.863805         
3  0.748804 -0.070938         
4  0.498507  0.563898  Oranges


## Data Preparation

### Scale Data

In [312]:
def scale_data (data, verbose=True):
    """
    Scales all numerical columns within a dataframe with a StandardScaler.

    Non-numeric columns are passed through unscaled. In the result the scaled
    numeric columns come first, followed by the remaining columns, and the row
    index of the input is preserved.

    Args:
        data: a pandas dataframe
        verbose (bool): if verbose, the data output will include a describe() call as well as header observations.

    Returns:
        a new dataframe containing the scaled variables; the input is not modified.

    """

    data = data.copy()

    scaler = StandardScaler()

    # Select the columns to scale based on the datatype
    numeric_features = data.select_dtypes(include=[np.number]).columns.tolist()

    # Everything that is not numeric is treated as categorical and passed through
    categorical_features = [column for column in data.columns if column not in numeric_features]

    # Column order of the transformed array: scaled columns first, passthrough columns after
    columns = numeric_features + categorical_features

    # Create a data pipeline to scale numeric features
    preprocesser = ColumnTransformer(
        transformers = [("scaler", scaler, numeric_features)],
        remainder = 'passthrough'
    )

    pipeline = Pipeline([("preprocessor", preprocesser)])

    # Scale the data
    data_scaled = pipeline.fit_transform(data)

    # Reintroduce the original column names and row index. Without index=...
    # the result would silently get a fresh 0..n-1 index and no longer line up
    # with the input frame.
    data_scaled = pd.DataFrame(data_scaled, columns=columns, index=data.index)

    # Mixing numeric and string columns yields an object array, so the scaled
    # columns come back as 'object' and need converting to numeric again.
    for col in data_scaled.columns:
        if data_scaled[col].dtype == 'object':
            try:
                data_scaled[col] = pd.to_numeric(data_scaled[col], errors='raise')
            except (ValueError, TypeError):
                # Best effort: the column isn't purely numeric, so leave it untouched
                pass

    if verbose:
        print (data_scaled.describe())
        print ()
        print ("#Sample Data")
        print (data_scaled.head(5))

    return data_scaled
    

In [289]:
# Scale the numeric columns of the toy dataset; verbose defaults to True, so a summary is printed.
data_scaled = scale_data(data)

            Value_A       Value_B
count  1.000000e+02  1.000000e+02
mean   2.498002e-18 -6.883383e-17
std    1.005038e+00  1.005038e+00
min   -1.750696e+00 -1.622089e+00
25%   -6.916153e-01 -1.008071e+00
50%    4.156365e-02  6.626431e-02
75%    8.401511e-01  8.724306e-01
max    1.826616e+00  1.896065e+00

#Sample Data
    Value_A   Value_B    Class
0  1.039986  0.436001  Oranges
1 -1.689587  1.443302   Apples
2  0.539317 -1.426827         
3  0.958100  0.021077         
4  0.047852  1.180391  Oranges


### Feature Engineering: Perform One Hot Encoding

In [290]:
def encode_data (data, column, dtype=int, drop=True, verbose=True):
    """
    Performs one hot encoding of one or more categorical columns.

    Args:
        data: a dataframe
        column (str or list of str): the column(s) containing the categorical variable(s). Passing a list (e.g. ['Class'])
            produces dummy columns prefixed with the source column name (e.g. Class_Apples); passing a bare string
            produces dummy columns named after the class values only.
        dtype (type): data type of the dummy columns. Default is int, which yields 1 if true, and 0 if false.
        drop (bool): if True, the first dummy column of each encoded variable is dropped.
        verbose (bool): if verbose, the data output will include a describe() call as well as header observations.

    Returns:
        a new dataframe containing the remaining original columns followed by the dummy variables. The original categorical variable is always dropped.

    """

    # Create the encoded dummies. drop_first removes the first level of *every*
    # encoded variable, so each categorical column loses its redundant dummy
    # even when several columns are encoded at once.
    data_encoded = pd.get_dummies(data[column], dtype=dtype, drop_first=bool(drop))

    # Drop the categorical variable from the original dataset
    data = data.drop(columns=column)

    # Both frames share the input's index, so place them side by side.
    # (An index join would multiply rows whenever the index has duplicates.)
    data_merged = pd.concat([data, data_encoded], axis=1)

    if verbose:
        print (data_merged.describe())
        print ()
        print ('Example:')
        print (data_merged.head(5))

    return data_merged

In [291]:
# One-hot encode the Class column while keeping every dummy column (drop=False).
data_prepared = encode_data(data_scaled, column=['Class'], drop=False)

            Value_A       Value_B      Class_  Class_Apples  Class_Bananas  \
count  1.000000e+02  1.000000e+02  100.000000    100.000000     100.000000   
mean   2.498002e-18 -6.883383e-17    0.330000      0.270000       0.120000   
std    1.005038e+00  1.005038e+00    0.472582      0.446196       0.326599   
min   -1.750696e+00 -1.622089e+00    0.000000      0.000000       0.000000   
25%   -6.916153e-01 -1.008071e+00    0.000000      0.000000       0.000000   
50%    4.156365e-02  6.626431e-02    0.000000      0.000000       0.000000   
75%    8.401511e-01  8.724306e-01    1.000000      1.000000       0.000000   
max    1.826616e+00  1.896065e+00    1.000000      1.000000       1.000000   

       Class_Oranges  
count     100.000000  
mean        0.280000  
std         0.451261  
min         0.000000  
25%         0.000000  
50%         0.000000  
75%         1.000000  
max         1.000000  

Example:
    Value_A   Value_B  Class_  Class_Apples  Class_Bananas  Class_Oranges
0  1.0

### Create Datasets

In [292]:
def create_experiment_sets (n: int, sparse: bool, sanity_check=True, seed=100, verbose=False):
    """
    Creates two dataframes of identical observations: one where the first dummy class is dropped and one where every class is retained.

    Args:
        n (int): number of observations in dataframe
        sparse (bool): determines if the categorical variable should contain empty ("") values
        sanity_check (bool): determines if output should show describe() and header observations to check data.
        seed (int): seed for replicability
        verbose (bool): passed to the dataset, scaling and encoding steps; if verbose, each step prints a describe() call as well as header observations.

    Returns:
        a tuple (data_prepared_dropped, data_prepared_retained) of two datasets containing encoded dummy variables combined with the scaled
        continuous variables; the first set has the redundant class dropped, the second retains every class.

    """

    data = create_dataset (n=n, seed=seed, classes=['Apples', 'Oranges', 'Bananas'], sparse=sparse, verbose=verbose)
    data_scaled = scale_data(data, verbose=verbose)
    print ("Creating Set 1...")
    data_prepared_dropped = encode_data(data_scaled, column=['Class'], drop=True, verbose=verbose)
    print ("Creating Set 2...")
    data_prepared_retained = encode_data(data_scaled, column=['Class'], drop=False, verbose=verbose)
    print ("Dataset creation complete.")

    if sanity_check:
        print ("#Set 1")
        print (data_prepared_dropped.describe(include='all'))
        print ("\nSet 1 Header")
        print (data_prepared_dropped.head())
        print ("\nSet 2")
        print (data_prepared_retained.describe(include='all'))
        # This header belongs to the second set (it was previously mislabeled as Set 1)
        print ("\nSet 2 Header")
        print (data_prepared_retained.head())

    return (data_prepared_dropped, data_prepared_retained)

In [293]:
# Build both experiment datasets from 10,000 sparse observations and print the sanity-check summaries.
data_prepared_dropped, data_prepared_retained = create_experiment_sets(n=10000, sparse=True, sanity_check=True)

Creating Set 1...
Creating Set 2...
Dataset creation complete.
#Set 1
            Value_A       Value_B  Class_Apples  Class_Bananas  Class_Oranges
count  1.000000e+04  1.000000e+04  10000.000000   10000.000000   10000.000000
mean   1.723066e-16  1.421085e-18      0.403400       0.213900       0.354800
std    1.000050e+00  1.000050e+00      0.490604       0.410078       0.478477
min   -1.721637e+00 -1.735411e+00      0.000000       0.000000       0.000000
25%   -8.696711e-01 -8.703599e-01      0.000000       0.000000       0.000000
50%   -5.349509e-03  9.432134e-03      0.000000       0.000000       0.000000
75%    8.649521e-01  8.602954e-01      1.000000       0.000000       1.000000
max    1.737735e+00  1.728971e+00      1.000000       1.000000       1.000000

Set 1 Header
    Value_A   Value_B  Class_Apples  Class_Bananas  Class_Oranges
0  0.158222  1.272722             0              1              0
1 -0.758669  1.708736             1              0              0
2 -0.253069  1.5

## Experiment


### Experiment A: Variance Inflation Factor Analysis
Given the dataset containing a categorical variable with three options, does collinearity exist between the dummy variables converted to boolean values (i.e., using One Hot Encoding)? A Variance Inflation Factor (VIF) analysis is performed: a sufficiently high VIF suggests that a feature is collinear with the other features.

Results: The dataset that retains every class column has VIF scores approaching infinity for its dummy variables, which suggests perfect collinearity.

In [294]:
def VIF_analysis (data):
    """
    Performs Variance Inflation Factor Analysis on a dataframe.

    Rows containing missing values are dropped and a constant "Intercept"
    column is added before the VIF of every column is computed.

    Args:
        data (DataFrame): a Pandas Dataframe containing the observations to be analyzed. All columns must be convertible to float.

    Returns:
        A DataFrame with one row per column (including the added Intercept) and the columns "feature" and "VIF".
        The same table is printed. Values approaching infinity or generating the "divide-by-zero" warning are considered perfectly collinear.

    """
    # Work on a copy without missing values so the caller's frame is untouched
    df_numeric = data.dropna().copy()

    # Add intercept
    df_numeric["Intercept"] = 1

    # Build the numeric design matrix once; converting to float also lets
    # boolean dummies through and fails early on non-numeric columns.
    design = df_numeric.to_numpy(dtype=float)

    # Create an empty DataFrame to hold VIF values
    vif_data = pd.DataFrame()
    vif_data["feature"] = df_numeric.columns

    # Calculate VIF for each feature
    vif_data["VIF"] = [
        variance_inflation_factor(design, i)
        for i in range(design.shape[1])
    ]

    print(vif_data)

    return vif_data

In [295]:
def experiment_1 (dataset1, dataset2):
    """
    Runs the VIF analysis on two dataframes in turn, printing a heading before each report.

    Args:
        dataset1 (DataFrame): first dataframe to analyze, reported under "Dataset 1"
        dataset2 (DataFrame): second dataframe to analyze, reported under "Dataset 2"

    Returns:
        None. The headings are printed and each dataframe is handed to VIF_analysis.

    """
    sections = (
        ("Dataset 1", dataset1),
        ("\nDataset 2", dataset2),
    )
    for heading, frame in sections:
        print (heading)
        print ("---------------------")
        VIF_analysis(frame)

In [296]:
# Run the VIF comparison: Dataset 1 is the set with a class dropped, Dataset 2 retains every class.
experiment_1(data_prepared_dropped, data_prepared_retained)

Dataset 1
---------------------
         feature        VIF
0        Value_A   1.000574
1        Value_B   1.000555
2   Class_Apples   9.224879
3  Class_Bananas   6.814141
4  Class_Oranges   8.852363
5      Intercept  35.850008

Dataset 2
---------------------
         feature       VIF
0        Value_A  1.000574
1        Value_B  1.000555
2         Class_       inf
3   Class_Apples       inf
4  Class_Bananas       inf
5  Class_Oranges       inf
6      Intercept  0.000000


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


### Experiment B: Eigenvalues of Covariance Matrix
Given the dataset containing a categorical variable with three options, does collinearity exist between the dummy variables converted to boolean values (i.e., using One Hot Encoding)?

By comparing the eigenvalues of the covariance matrix, we can check for collinearity. Eigenvalues which approach zero suggest collinearity exists.

Results: the covariance matrix of the dataset which has retained all of the class columns has an eigenvalue that approaches zero, whereas the eigenvalues of the dataset with a column dropped remain positive. This suggests collinearity in the dataset that retains every class column.

In [297]:
def determine_eigenvalues (data, return_matrix=False, verbose=True):
    """
    Computes the covariance matrix of a dataframe together with its eigenvalues and eigenvectors.

    Args:
        data (DataFrame): a Pandas Dataframe containing the observations to be analyzed.
        return_matrix (bool): if True, the covariance matrix and the eigenvectors are returned for use in other analysis.
        verbose (bool): if True, the covariance matrix, eigenvalues, and eigenvectors are printed.

    Returns:
        A tuple (covariance matrix, eigenvectors) when return_matrix is True, otherwise None.

    """
    covariance = data.cov()
    values, vectors = np.linalg.eig(covariance)

    if verbose:
        # Each report section is a heading followed by the matching result
        report = (
            ("Covariance Matrix:", covariance),
            ("\nEigenvalues:", values),
            ("\nEigenvectors:", vectors),
        )
        for heading, payload in report:
            print(heading)
            print(payload)

    if not return_matrix:
        return None

    return (covariance, vectors)

In [298]:
def experiment_2 (dataset1, dataset2):
    """
    Runs the eigenvalue analysis on two dataframes in turn, printing a heading before each report.

    Args:
        dataset1 (DataFrame): first dataframe to analyze, reported under "Dataset 1"
        dataset2 (DataFrame): second dataframe to analyze, reported under "Dataset 2"

    Returns:
        None. The headings are printed and each dataframe is handed to determine_eigenvalues.

    """
    sections = (
        ("Dataset 1", dataset1),
        ("\nDataset 2", dataset2),
    )
    for heading, frame in sections:
        print (heading)
        print ("---------------------")
        determine_eigenvalues (frame)


In [299]:
# Run the eigenvalue comparison: Dataset 1 is the set with a class dropped, Dataset 2 retains every class.
experiment_2(data_prepared_dropped, data_prepared_retained)

Dataset 1
---------------------
Covariance Matrix:
                Value_A   Value_B  Class_Apples  Class_Bananas  Class_Oranges
Value_A        1.000100 -0.006939     -0.004235       0.009390      -0.004470
Value_B       -0.006939  1.000100      0.003896      -0.006603       0.005055
Class_Apples  -0.004235  0.003896      0.240693      -0.086296      -0.143141
Class_Bananas  0.009390 -0.006603     -0.086296       0.168164      -0.075899
Class_Oranges -0.004470  0.005055     -0.143141      -0.075899       0.228940

Eigenvalues:
[1.00731128 0.99316625 0.00901941 0.24985119 0.37864787]

Eigenvectors:
[[-7.08369846e-01  7.05685031e-01 -5.08812436e-04 -1.48128329e-02
  -1.05793189e-03]
 [ 7.05586591e-01  7.08521201e-01 -1.29093813e-03  1.19807339e-02
  -2.56248765e-04]
 [ 7.56362849e-03 -6.61924906e-04  5.72785303e-01 -3.60331201e-01
  -7.36220603e-01]
 [-1.50456746e-02  2.38911921e-03  5.84603542e-01  8.09049904e-01
   5.86930225e-02]
 [ 8.72597834e-03  4.45833761e-04  5.74590088e-01 -4.63

### Experiment C: Condition Number
Given the dataset containing a categorical variable with three options, does collinearity exist between the dummy variables converted to boolean values (i.e., using One Hot Encoding)?

To test this, the condition number is calculated. Numbers which are sufficiently large (approaching infinity) are considered to suggest collinearity.

Result: the condition number of the dataset with all of the columns retained approaches infinity, whereas the condition number of the dataset where a column was dropped remains low. Collinearity is detected.

In [311]:
def Condition_number_analysis (data_with, data_without):
    """
    Calculates the condition number of the covariance matrix of two dataframes (one with all of the columns retained, and one with one of the columns dropped)

    Args:
        data_with (DataFrame): a Pandas Dataframe containing the observations to be analyzed with all columns retained
        data_without (DataFrame): a Pandas Dataframe containing the observations to be analyzed with one column dropped

    Returns:
        A tuple (cond_with, cond_without) holding the condition number of each dataframe's covariance matrix. Both values are also printed.

    """

    # Only the covariance matrices are needed here, so compute them directly
    # instead of running an eigendecomposition whose results are discarded.
    cov_with = data_with.cov()
    cov_without = data_without.cov()

    cond_with = np.linalg.cond(cov_with)
    print("\nCondition Number (WITH):", cond_with)
    cond_without = np.linalg.cond(cov_without)
    print("Condition Number (WITHOUT):", cond_without)

    return (cond_with, cond_without)

In [310]:
# Compare condition numbers: data_with retains every class column, data_without has one class dropped.
Condition_number_analysis (data_with=data_prepared_retained, data_without=data_prepared_dropped)


Condition Number (WITH): 1421210016005100.5
Condition Number (WITHOUT): 111.68263132975727
