## Dim Reduction

In [1]:
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.metrics import precision_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Read the census feature table (previsores) and the target table (classes).
previsores = pd.read_csv('../data/previsores_census.csv')
classes = pd.read_csv('../data/classe_census.csv')

# Preview the first rows of the feature table.
previsores.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country
0,0.030671,2.150579,-1.063611,-0.335437,1.134739,0.921634,-1.317809,-0.277805,0.393668,0.703071,0.148453,-0.21666,-0.035429,0.291569
1,0.837109,1.463736,-1.008707,-0.335437,1.134739,-0.406212,-0.608387,-0.900181,0.393668,0.703071,-0.14592,-0.21666,-2.222153,0.291569
2,-0.042642,0.09005,0.245079,0.181332,-0.42006,-1.734058,-0.135438,-0.277805,0.393668,0.703071,-0.14592,-0.21666,-0.035429,0.291569
3,1.057047,0.09005,0.425801,-2.402511,-1.197459,-0.406212,-0.135438,-0.900181,-1.962621,0.703071,-0.14592,-0.21666,-0.035429,0.291569
4,-0.775768,0.09005,1.408176,-0.335437,1.134739,-0.406212,0.810458,2.211698,-1.962621,-1.422331,-0.14592,-0.21666,-0.035429,-4.054223


In [3]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    previsores,
    classes,
    test_size=0.15,
    random_state=0,
)

In [4]:
# Linear PCA down to 6 components. The projection is fitted on the training
# split only and then applied unchanged to the test split.
pca = PCA(n_components=6)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.151561  , 0.10109701, 0.08980379, 0.08076277, 0.07627678,
       0.07357646])

In [5]:
# Random forest trained on the PCA-reduced features. The forest is seeded so
# the reported accuracy is repeatable across runs; without a seed, a 10-tree
# forest gives a different score on every execution, which makes the
# comparison between the reduction techniques in this notebook unreliable.
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)

# y_train is a DataFrame; .values.ravel() flattens it to the 1-D label array.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

print("accuracy", accuracy_score(y_test, y_pred))

accuracy 0.8216990788126919


## Kernel PCA

Indicado para problemas não linearmente separáveis.

In [6]:
# Re-split from the original tables: the PCA section above replaced
# X_train / X_test with their projections. Same seed, so the rows match.
X_train, X_test, y_train, y_test = train_test_split(
    previsores,
    classes,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Kernel PCA with an RBF kernel: a non-linear projection onto 6 components,
# fitted on the training split only and then applied to the test split.
# NOTE(review): KernelPCA works on an n_train x n_train kernel matrix, so
# memory grows quadratically with the number of training rows -- confirm it
# fits in memory for this dataset before running.
pca = KernelPCA(n_components=6, kernel='rbf')
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# KernelPCA has no `explained_variance_ratio_` attribute (accessing it raises
# AttributeError). The variance of each projected training column is
# proportional to that component's kernel eigenvalue, so report each
# component's share of the variance *among the 6 retained components*.
# Unlike PCA's ratio, this is not a share of the total variance.
component_variance = X_train.var(axis=0)
component_variance / component_variance.sum()

In [None]:
# Random forest trained on the Kernel-PCA-reduced features. The forest is
# seeded so the reported accuracy is repeatable across runs and can be
# compared with the scores obtained from the other reduction techniques.
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)

# y_train is a DataFrame; .values.ravel() flattens it to the 1-D label array.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

print("accuracy", accuracy_score(y_test, y_pred))

## LDA

LDA (Linear Discriminant Analysis) funciona como um 'PCA supervisionado': a redução de dimensionalidade das features leva em conta a classe.

In [None]:
# Re-split from the original tables: the earlier sections replaced
# X_train / X_test with their projections. Same seed, so the rows match.
X_train, X_test, y_train, y_test = train_test_split(
    previsores,
    classes,
    test_size=0.15,
    random_state=0,
)

In [None]:
# LDA is supervised, so it needs the training labels as a 1-D array.
train_labels = y_train.values.ravel()

# LDA yields at most min(n_features, n_classes - 1) discriminant axes and
# scikit-learn rejects a larger n_components, so cap the requested 6 at that
# limit (e.g. a two-class target allows only a single component).
max_components = min(X_train.shape[1], len(pd.unique(train_labels)) - 1)

lda = LinearDiscriminantAnalysis(n_components=min(6, max_components))
X_train = lda.fit_transform(X_train, train_labels)  # fit on the training split only
X_test = lda.transform(X_test)

# Share of between-class variance explained by each retained discriminant axis.
lda.explained_variance_ratio_

In [None]:
# Random forest trained on the LDA-reduced features. The forest is seeded so
# the reported accuracy is repeatable across runs and can be compared with
# the scores obtained from the other reduction techniques.
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)

# y_train is a DataFrame; .values.ravel() flattens it to the 1-D label array.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

print("accuracy", accuracy_score(y_test, y_pred))