# Drop Highly Correlated Features

Preliminaries

In [1]:
# Load libraries
import pandas as pd
import numpy as np

# **Load Data**


In [8]:
# Assemble the feature matrix column by column. Columns 0 and 1 are
# identical except in the last two rows, so they are highly correlated.
X = np.column_stack([
    [1, 2, 3, 4, 5, 6, 7, 8, 9],  # feature 0
    [1, 2, 3, 4, 5, 6, 7, 7, 7],  # feature 1 (tracks feature 0)
    [1, 0, 1, 0, 1, 0, 1, 0, 1],  # feature 2
    [3, 2, 1, 0, 2, 5, 3, 4, 0],  # feature 3
    [0, 1, 0, 1, 0, 1, 0, 1, 1],  # feature 4
])

# Wrap the array in a DataFrame (default integer row/column labels)
df = pd.DataFrame(data=X)

# Show the resulting frame
df

Unnamed: 0,0,1,2,3,4
0,1,1,1,3,0
1,2,2,0,2,1
2,3,3,1,1,0
3,4,4,0,0,1
4,5,5,1,2,0
5,6,6,0,5,1
6,7,7,1,3,0
7,8,7,0,4,1
8,9,7,1,0,1


# Identify Highly Correlated Features

In [9]:
# Absolute pairwise Pearson correlations between all columns of df
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# feature pair is inspected once and a column is never compared with itself.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with any earlier column exceeds the threshold.
# Masked (NaN) entries compare as False, so they never trigger a drop.
CORRELATION_THRESHOLD = 0.95
to_drop = upper.columns[(upper > CORRELATION_THRESHOLD).any(axis=0)].tolist()

# **Drop Marked Features**

In [10]:
# Drop the flagged columns by label. `drop` returns a new frame and leaves
# `df` untouched, so this cell can be re-run safely. Passing the label list
# directly (rather than the sub-frame df[to_drop]) avoids relying on
# DataFrame iteration yielding column names.
df_reduced = df.drop(columns=to_drop)

# Show the reduced frame
df_reduced

Unnamed: 0,0,2,3,4
0,1,1,3,0
1,2,0,2,1
2,3,1,1,0
3,4,0,0,1
4,5,1,2,0
5,6,0,5,1
6,7,1,3,0
7,8,0,4,1
8,9,1,0,1
