We can use **PCA** to *speed up Algorithms*.

Using this example:

https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

# Using PCA to Capture the most important features from this dataset

https://archive.ics.uci.edu/ml/datasets/QSAR+fish+toxicity

In [None]:
import pandas as pd

# Reading data and adding columns (only data in the CSV)
raw_data = pd.read_csv('qsar_fish_toxicity.csv', header=None, delimiter=';')
columns = ['CIC0','SM1_Dz(Z)','GATS1i','NdsCH','NdssC','MLOGP','Quantitative response']
raw_data.columns = columns

In [None]:
# Visualizing Data
raw_data.info()
raw_data.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CIC0                   908 non-null    float64
 1   SM1_Dz(Z)              908 non-null    float64
 2   GATS1i                 908 non-null    float64
 3   NdsCH                  908 non-null    int64  
 4   NdssC                  908 non-null    int64  
 5   MLOGP                  908 non-null    float64
 6   Quantitative response  908 non-null    float64
dtypes: float64(5), int64(2)
memory usage: 49.8 KB


CIC0                     502
SM1_Dz(Z)                186
GATS1i                   557
NdsCH                      5
NdssC                      7
MLOGP                    559
Quantitative response    827
dtype: int64

In [None]:
raw_data.head()

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP,Quantitative response
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


In [None]:
# Separating features and Output values.
x = raw_data[raw_data.columns[:-1]]
labels = raw_data['Quantitative response']

## Normalize the data

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
x_norm = pd.DataFrame(scaler.fit_transform(x), columns = x.columns)
x_norm

Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP
0,0.493060,0.381852,0.507132,0.00,0.166667,0.461432
1,0.289409,0.267158,0.185024,0.00,0.000000,0.450261
2,0.277239,0.293874,0.172345,0.00,0.000000,0.450261
3,0.448755,0.152464,0.426307,0.25,0.000000,0.499096
4,0.271344,0.380930,0.183835,0.00,0.000000,0.507501
...,...,...,...,...,...,...
903,0.405781,0.335329,0.725040,0.00,0.333333,0.385147
904,0.567598,0.401658,0.186609,0.50,0.500000,0.730610
905,0.588705,0.421925,0.190967,0.00,1.000000,0.617300
906,0.411485,0.641640,0.269810,0.00,0.166667,0.403234


## Use PCA from SciKit

In [None]:
from sklearn.decomposition import PCA

n=3
pca = PCA(n_components=n)
principalComponents = pca.fit_transform(x_norm)
principalDf = pd.DataFrame(data = principalComponents
             , columns = [[f'PCA {c}' for c in range(1,n+1)]])

In [None]:
# Showing the total variance of the PCA 
print(pca.explained_variance_ratio_)
print(f'Total variance of the {n} PC: {pca.explained_variance_ratio_.sum()}')

#showing the variance of all features with regards of each PCA
pd.DataFrame(pca.components_,columns=x_norm.columns,
             index = [f'PC-{r}' for r in range(1,n+1)])

[0.30186829 0.23857445 0.18906648]
Total variance of the 3 PC: 0.7295092192686011


Unnamed: 0,CIC0,SM1_Dz(Z),GATS1i,NdsCH,NdssC,MLOGP
PC-1,-0.133841,0.842962,-0.362548,-0.136897,0.08492,0.3378
PC-2,0.57646,-0.250202,-0.289308,0.314353,0.185565,0.62301
PC-3,0.251936,0.330675,0.569564,0.273462,0.633699,-0.162553


In [145]:
import numpy as np

PCA_values = np.asarray(
    [[c,value] for value, c in zip(
        np.max(np.abs(pca.components_), axis=0), x_norm.columns)])
print(PCA_values)

PCA_names = np.sort(PCA_values, axis=0)[::-1]
print(f'The main {n} features on this example are {PCA_names[:n,0].tolist()}')

[['CIC0' '0.5764600904560264']
 ['SM1_Dz(Z)' '0.8429617335991899']
 ['GATS1i' '0.5695638503552872']
 ['NdsCH' '0.31435326584363565']
 ['NdssC' '0.6336992597103818']
 ['MLOGP' '0.6230099326493193']]
The main 3 features on this example are ['SM1_Dz(Z)', 'NdssC', 'NdsCH']
