In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso


In [11]:
# Load the data.
bio = pd.read_csv('biodegradable_a.csv')

# Drop rows with missing values so that Lasso regression can be applied.
bio = bio.dropna(axis=0)

# Number of predictor columns; they are the leading columns of the frame.
N_FEATURES = 41

# Feature matrix. Selecting the columns with iloc before converting to NumPy
# avoids building the matrix from the whole frame, whose categorical
# 'Biodegradable' column would force an object-dtype array.
X = bio.iloc[:, :N_FEATURES].to_numpy()

# Target (dependent variable): it is categorical, so it is converted to dummy
# (one-hot) columns in order to apply Lasso regression. Rows with missing
# values were already dropped from `bio`, so no further dropna is needed.
y_bio = pd.get_dummies(bio['Biodegradable'])

# Feature names, aligned one-to-one with the columns of X (the original took
# 42 names for 41 columns, which presumably included the target label).
features = np.array(bio.columns)[:N_FEATURES]

# Same names as a plain Python list.
featuresl = list(features)





In [12]:
# Split the data set into a training set and a test set (33% held out for
# testing); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bio,
    test_size=0.33,
    random_state=42,
)


In [13]:
# Standardise the data, because it has not been normalised yet.
# scikit-learn provides a Pipeline object that chains the scaler and the
# model into a single estimator.
pipeline_steps = [('scaler', StandardScaler()), ('model', Lasso())]
pipeline = Pipeline(pipeline_steps)
print(pipeline)

Pipeline(steps=[('scaler', StandardScaler()), ('model', Lasso())])


In [14]:
# Goal: tune the alpha parameter of the Lasso regression with a 5-fold
# cross-validated grid search scored by negative mean squared error.
#
# The grid holds 99 evenly spaced values from 1e-5 to 9.9e-4. np.linspace is
# used instead of np.arange because arange with a floating-point step has a
# rounding-dependent length and accumulates error in the generated values.
alpha_grid = np.linspace(0.00001, 0.00099, 99)

# verbose=1 reports overall progress only; a line per fit would flood the
# notebook output for 99 candidates x 5 folds.
search = GridSearchCV(
    pipeline,
    {'model__alpha': alpha_grid},
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=1,
)


In [23]:
# Fit the grid search on the training data. This call must run: the cells
# below read search.best_params_ and search.best_estimator_, which only exist
# after fitting, so leaving it commented out breaks a fresh-kernel run.
search.fit(X_train, y_train)

In [16]:
# Best alpha found by the grid search (about 0.00096, i.e. roughly 0.001, in the recorded run).
search.best_params_

{'model__alpha': 0.0009600000000000001}

In [17]:
# With alpha chosen, read the coefficients of the Lasso model from the best
# pipeline found by the grid search.
best_pipeline = search.best_estimator_
lasso_model = best_pipeline.named_steps['model']
coefficients = lasso_model.coef_
print(coefficients)

[[ 0.1220188   0.03612272  0.01719315  0.04732871  0.00663032  0.08193625
   0.07249524 -0.15594364  0.05800591 -0.02145448  0.00762294 -0.00720335
   0.11210142 -0.06594598 -0.13435029  0.0294343   0.09197105 -0.0265986
  -0.01888669  0.04113896 -0.          0.06773263  0.05873283  0.04897446
   0.08297718 -0.01929874 -0.00600967 -0.00533039 -0.0376385  -0.04636005
   0.         -0.01658712 -0.00915659  0.10415969 -0.01758548 -0.02983524
  -0.10021833  0.01937341  0.         -0.02571369 -0.07645547]
 [-0.1220188  -0.03612272 -0.01719315 -0.04732871 -0.00663032 -0.08193625
  -0.07249524  0.15594364 -0.05800591  0.02145448 -0.00762294  0.00720335
  -0.11210142  0.06594598  0.13435029 -0.0294343  -0.09197105  0.0265986
   0.01888669 -0.04113896  0.         -0.06773263 -0.05873283 -0.04897446
  -0.08297718  0.01929874  0.00600967  0.00533039  0.0376385   0.04636005
  -0.          0.01658712  0.00915659 -0.10415969  0.01758548  0.02983524
   0.10021833 -0.01937341 -0.          0.02571369  

In [18]:
# Feature importance is measured by the absolute value of each coefficient.
importance = np.abs(coefficients)
print(importance)

# Only the first row is used; in the recorded output both rows of
# `importance` are identical (presumably because the two dummy target columns
# are complementary).
lista = list(importance[0])
print(lista)

# Positions of the features whose importance exceeds 0.1. enumerate() is used
# instead of list.index(), which returns the first match only and would repeat
# an index whenever two features share the same importance value.
lista2 = [position for position, value in enumerate(lista) if value > 0.1]
print(lista2)

# Names of the selected features.
bfeatures = [features[position] for position in lista2]
print(bfeatures)

[[0.1220188  0.03612272 0.01719315 0.04732871 0.00663032 0.08193625
  0.07249524 0.15594364 0.05800591 0.02145448 0.00762294 0.00720335
  0.11210142 0.06594598 0.13435029 0.0294343  0.09197105 0.0265986
  0.01888669 0.04113896 0.         0.06773263 0.05873283 0.04897446
  0.08297718 0.01929874 0.00600967 0.00533039 0.0376385  0.04636005
  0.         0.01658712 0.00915659 0.10415969 0.01758548 0.02983524
  0.10021833 0.01937341 0.         0.02571369 0.07645547]
 [0.1220188  0.03612272 0.01719315 0.04732871 0.00663032 0.08193625
  0.07249524 0.15594364 0.05800591 0.02145448 0.00762294 0.00720335
  0.11210142 0.06594598 0.13435029 0.0294343  0.09197105 0.0265986
  0.01888669 0.04113896 0.         0.06773263 0.05873283 0.04897446
  0.08297718 0.01929874 0.00600967 0.00533039 0.0376385  0.04636005
  0.         0.01658712 0.00915659 0.10415969 0.01758548 0.02983524
  0.10021833 0.01937341 0.         0.02571369 0.07645547]]
[0.12201879632716901, 0.03612271805446277, 0.01719315317257332, 0.047

In [179]:
# Select the most important features: those whose importance exceeds 0.05.
# Only the first row of `importance` is used.
lista = list(importance[0])

# enumerate() yields each position directly; list.index() would return the
# first match only and repeat an index for tied importance values.
lista2 = [position for position, value in enumerate(lista) if value > 0.05]

# Names of the selected features, followed by how many were selected.
bfeatures = [features[position] for position in lista2]
print(bfeatures)
len(bfeatures)

['SpMax_L', 'NssssC', 'nCb', 'C', 'nCp', 'HyWi_B', 'LOC', 'SM6_L', 'Me', 'SpPosA_B', 'nCIR', 'B03', 'F02_CN', 'Psi_i_A', 'nX']


15

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import datasets

In [35]:
# Ridge regression baseline: standardise the features, then fit Ridge with
# alpha=1.0 on the training data.
# NOTE(review): this rebinds the name `pipeline`, previously the Lasso pipeline.
ridge_scaler = StandardScaler()
ridge_model = Ridge(alpha=1.0)
pipeline = make_pipeline(ridge_scaler, ridge_model)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [36]:

# Predict on both splits with the fitted pipeline.
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Mean squared error on the training and test sets.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (mse_train, mse_test))

# Coefficient of determination on the training and test sets.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (r2_train, r2_test))

MSE train: 0.046, test: 0.097
R^2 train: 0.657, test: 0.325
