In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
import string
import pandas as pd
from base64 import b64decode
import numpy as np

DATA_PATH="/notebook/nas-trainings/arne/DGFISMA/DATA/doc_classifier/processed_eurlex_15_06/train_data.tsv"


data=pd.read_csv(  DATA_PATH  , sep='\t' , header=None ) 

train_data=data[0].tolist()
train_labels=data[2].tolist()

train_data=[ b64decode( doc ).decode().translate(str.maketrans('', '', string.punctuation+'0123456789'  )) for doc in train_data  ]

#feature selection parameters:
penalty_feature_selection='l1'

#classifier parameters
penalty='l2'
loss='squared_hinge'
dual=False
class_weight = 'balanced'

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                         stop_words= 'english' )

#also balanced class weight here...
feature_selection=SelectFromModel(LinearSVC(penalty=penalty_feature_selection, dual=False,
                                                          tol=1e-3, class_weight=class_weight )) 

feature_selection=None

classifier=LinearSVC(penalty=penalty, loss=loss , dual=dual, class_weight=class_weight )

#calibrated the classifier (for predict_proba): 
calibrated_classifier = CalibratedClassifierCV(classifier , cv=5 ) 
    
clf=Pipeline([
( 'vectorizer', vectorizer)  ,
('feature_selection',  feature_selection  )   ,
('classification', calibrated_classifier  )
])

param_grid = {
    'vectorizer__max_df': [0.3,0.4,0.5,0.6,0.7 ] ,
    'feature_selection__estimator__C': [1.0] ,
    'classification__base_estimator__C': list(np.logspace(-3, 1, 5)) }

scoring=['f1', 'precision', 'recall']

search = GridSearchCV(clf, param_grid, scoring=scoring , n_jobs=-1, cv=5, refit=scoring[0], return_train_score=True   )

assert len( train_data ) == len(train_labels)

search.fit(train_data, train_labels)


AttributeError: 'NoneType' object has no attribute 'set_params'

In [2]:
select=search.best_estimator_.named_steps[ 'feature_selection' ]
vec=search.best_estimator_.named_steps[ 'vectorizer' ]
#select.get_support()

i=0
assert( select.get_support().shape[0]  ==  len(  vec.get_feature_names() ) ) 
for selected, feature in zip( select.get_support(), vec.get_feature_names()  ):
    if selected:
        i=i+1
        print(feature)

print( f"{i} selected features" )

adopt
agricultural
aid
arrangements
central
committee
community
competent
credit
delegated
eec
entities
financial
funds
insurance
investment
legal
markets
person
public
regulatory
relevant
reporting
resolution
securities
standards
supervisory
trading
zimbabwe
29 selected features


In [7]:
search.best_index_

2

In [6]:
search.cv_results_['mean_fit_time'][ search.best_index_]

0.5786046028137207

In [5]:
search.cv_results_

{'mean_fit_time': array([0.58215785, 0.58701806, 0.5786046 , 0.58697052, 0.59009318,
        0.57319589, 0.57831411, 0.5692872 , 0.57576685, 0.59532046,
        0.57885499, 0.5940268 , 0.56988211, 0.58117619, 0.57654152,
        0.57936745, 0.59194579, 0.56919346, 0.60071406, 0.57870612,
        0.58755093, 0.59600701, 0.56916709, 0.59031301, 0.51133885]),
 'std_fit_time': array([0.01652619, 0.01878703, 0.01415699, 0.01235206, 0.03311548,
        0.00593881, 0.0294944 , 0.01899819, 0.01108265, 0.03275223,
        0.01957887, 0.02040534, 0.01637712, 0.02033766, 0.01024169,
        0.02949718, 0.04880166, 0.01175549, 0.03795534, 0.00698926,
        0.02613467, 0.04080158, 0.01526601, 0.03606427, 0.06496163]),
 'mean_score_time': array([0.43978467, 0.43769469, 0.43682823, 0.44307761, 0.43354254,
        0.43722115, 0.42885761, 0.4249475 , 0.43938012, 0.42724848,
        0.42183018, 0.43807235, 0.44397459, 0.42537675, 0.42697868,
        0.42488766, 0.42617249, 0.43163643, 0.43242569, 0.42

In [4]:
search.cv_results_['mean_time'][ search.best_index_]

KeyError: 'mean_time'

In [50]:
search.cv_results_['params'][search.best_index_]

{'classification__base_estimator__C': 0.001,
 'feature_selection__estimator__C': 1.0,
 'vectorizer__max_df': 0.5}

In [51]:
search.cv_results_['mean_train_f1'][ search.best_index_ ]
search.cv_results_['std_train_f1'][ search.best_index_ ]

print(search.cv_results_['mean_test_f1'][ search.best_index_ ])
print(search.cv_results_['std_test_f1'][ search.best_index_ ])

0.9897743922743922
0.009607676967506806


In [52]:
clf.get_params()

{'memory': None,
 'steps': [('vectorizer',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.5, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words='english', strip_accents=None, sublinear_tf=True,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('feature_selection',
   SelectFromModel(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
        verbose=0),
           max_features=None, norm_order=1, prefit=False, threshold=None)),
  ('classification',
   CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_int

In [4]:
search.best_params_

{'classification__base_estimator__C': 0.01,
 'feature_selection__estimator__C': 1.0,
 'vectorizer__max_df': 0.5}

In [14]:
search.best_index_

0

In [15]:
search.cv_results_



{'mean_fit_time': array([0.58425369, 0.59101958, 0.57284784, 0.57434216, 0.55385809]),
 'std_fit_time': array([0.01231868, 0.01457803, 0.01254693, 0.0157331 , 0.09113937]),
 'mean_score_time': array([0.14300337, 0.14191375, 0.14165807, 0.14373369, 0.12868338]),
 'std_score_time': array([0.01559061, 0.01535501, 0.01452347, 0.01537027, 0.0178605 ]),
 'param_classification__base_estimator__C': masked_array(data=[0.01, 0.05623413251903491, 0.31622776601683794,
                    1.7782794100389228, 10.0],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_feature_selection__estimator__C': masked_array(data=[1.0, 1.0, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vectorizer__max_df': masked_array(data=[0.5, 0.5, 0.5, 0.5, 0.5],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param

In [48]:
search.named_steps['feature_selection']

AttributeError: 'GridSearchCV' object has no attribute 'named_steps'

In [45]:
clf.get_params()

{'memory': None,
 'steps': [('vectorizer',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.5, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words='english', strip_accents=None, sublinear_tf=True,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('feature_selection',
   SelectFromModel(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
        verbose=0),
           max_features=None, norm_order=1, prefit=False, threshold=None)),
  ('classification',
   CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_int

In [44]:
search.get_params()

{'cv': 'warn',
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('vectorizer',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.5, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words='english', strip_accents=None, sublinear_tf=True,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('feature_selection',
   SelectFromModel(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
        verbose=0),
           max_features=None, norm_order=1, prefit=False, threshold=None)),
  ('classification',
   CalibratedClassifierCV(ba