# Support Vector Machine - SVM

In [1]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer

# SVM --> Support Vector Classification
from sklearn.svm import SVC

# pipeline
from sklearn.pipeline import Pipeline

# Numpy
import numpy as np

In [2]:


categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

### Usando TfidfVectorizer se crea una matriz con la relevancia de las palabras en relación a los diferentes documentos del dataset de entrenamiento

**use_idf : boolean (default=True): Enable inverse-document-frequency reweighting.(???)** 

In [3]:
X_train_tf = TfidfVectorizer(use_idf=False).fit_transform(twenty_train.data)
X_train_tf.shape

(2257, 35788)

In [4]:
X_train_tf.dtype

dtype('float64')

**El objetivo de está clasificación es usar el dataset de noticias de diferentes categorias (['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'])
Obtener la relevancia de las palabras y clasificarlas por categoria
Se usa un Pipeline para entrenar el Tfidf (Term Frequency Inverse Document Frequency) y luego realizar la predicción con SVC (Support Vector Classifier)**

### Pipeline: 
**Secuencialmente aplica transformaciones y finalmente un estimador. Los pasos intermedios deben implementar los métodos _fit_ y _transform_, el estimador final solo requiere implementar el método _fit_.**

In [5]:
text_clf = Pipeline([('tfidf' , TfidfVectorizer()), ('classifier', SVC(kernel='linear'))])

In [6]:
# Entrenando (tfidf o classifier ????)
text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(twenty_test.data)

**Calcular accuracy:**

In [7]:
np.mean(predicted == twenty_test.target)

0.9207723035952063

In [8]:
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(twenty_train.data)
X_test = tf_idf.transform(twenty_test.data)
y_train = twenty_train.target
y_test = twenty_test.target

# Voting

In [9]:
from sklearn.metrics import accuracy_score

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

### Soft Voting (allowing probability)

In [10]:
svc_clf = SVC(kernel='linear', probability=True)
sgd_clf = SGDClassifier(loss='log')
tree_clf = DecisionTreeClassifier()
voting_clf = VotingClassifier(estimators=[('svc', svc_clf), ('sgd', sgd_clf), ('tree', tree_clf)], voting='soft')

for clf in (svc_clf, sgd_clf, tree_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_predict))

SVC 0.9207723035952063
SGDClassifier 0.9181091877496671
DecisionTreeClassifier 0.7137150466045273
VotingClassifier 0.8994673768308922


### Hard voting (doesn't allow probability)

In [11]:
svc_clf = SVC(kernel='linear')
sgd_clf = SGDClassifier()
tree_clf = DecisionTreeClassifier()
voting_clf = VotingClassifier(estimators=[('svc', svc_clf), ('sgd', sgd_clf), ('tree', tree_clf)], voting='hard')

for clf in (svc_clf, sgd_clf, tree_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_predict))

SVC 0.9207723035952063
SGDClassifier 0.9267643142476698
DecisionTreeClassifier 0.7210386151797603
VotingClassifier 0.9254327563249002


# Bagging (Bootstrap Agreggating)

In [12]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7217043941411452

**Implements several classifiers (the same type e.g. _DecisionTree - multiple trees makes a forest a RandomForest_), each classifier is trained using a bunch of training instances, then the next classifier is trained with the same or different instances, finally a Voting is done.**

In [13]:
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=200, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7989347536617842

# Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8135818908122503

# Boosting

## AdaBoost
**Boosting multiple _weak_ estimators learn from the error of the previous estimator** 

In [15]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=1500, algorithm='SAMME.R')
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=1500, random_state=None)

In [16]:
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7782956058588548

## XgBoost (Extreme Gradient Boost)

In [17]:
# pip install xgboost
from xgboost import XGBClassifier

In [18]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [19]:
y_pred = xgb_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8681757656458056

## LigthGBM (Ligth Gradient Boosting Machine - Microsoft)

In [20]:
# conda install -c conda-forge lightgbm
import lightgbm as lgb

In [21]:
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])

[1]	valid_0's multi_logloss: 1.27964
[2]	valid_0's multi_logloss: 1.19555
[3]	valid_0's multi_logloss: 1.12404
[4]	valid_0's multi_logloss: 1.06169
[5]	valid_0's multi_logloss: 1.00366
[6]	valid_0's multi_logloss: 0.952655
[7]	valid_0's multi_logloss: 0.909478
[8]	valid_0's multi_logloss: 0.869212
[9]	valid_0's multi_logloss: 0.8343
[10]	valid_0's multi_logloss: 0.80072
[11]	valid_0's multi_logloss: 0.771599
[12]	valid_0's multi_logloss: 0.743307
[13]	valid_0's multi_logloss: 0.71649
[14]	valid_0's multi_logloss: 0.691686
[15]	valid_0's multi_logloss: 0.667471
[16]	valid_0's multi_logloss: 0.64493
[17]	valid_0's multi_logloss: 0.625319
[18]	valid_0's multi_logloss: 0.607054
[19]	valid_0's multi_logloss: 0.589882
[20]	valid_0's multi_logloss: 0.573306
[21]	valid_0's multi_logloss: 0.557896
[22]	valid_0's multi_logloss: 0.545648
[23]	valid_0's multi_logloss: 0.531141
[24]	valid_0's multi_logloss: 0.518963
[25]	valid_0's multi_logloss: 0.50745
[26]	valid_0's multi_logloss: 0.495493
[27]	v

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [22]:
y_pred = gbm.predict(X_test)
accuracy_score(y_test, y_pred)

0.8808255659121171

# Stacking

**uses the results from multiple estimators as features, to process it and then make it's own predictions**

In [23]:
# pip install vecstack

In [24]:
from vecstack import StackingTransformer

estimators = [('xgb', xgb_clf), ('ada', ada_clf)]

# StackingTransformer
stack = StackingTransformer(estimators, regression=False, verbose=2)

# Fit
stack = stack.fit(X_train, y_train)

# stacked features
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

task:         [classification]
n_classes:    [4]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [2]

estimator  0: [xgb: XGBClassifier]
    fold  0:  [0.90619469]
    fold  1:  [0.92730496]
    fold  2:  [0.90780142]
    fold  3:  [0.90070922]
    ----
    MEAN:     [0.91050257] + [0.01005092]

estimator  1: [ada: AdaBoostClassifier]
    fold  0:  [0.85309735]
    fold  1:  [0.86170213]
    fold  2:  [0.81382979]
    fold  3:  [0.84929078]
    ----
    MEAN:     [0.84448001] + [0.01825815]

Train set was detected.
Transforming...

estimator  0: [xgb: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [ada: AdaBoostClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

Transforming...

estimator  0: [xgb: XGBClassifier]
    model from fold  0: done
    model from fold  

In [25]:
gbm = lgb.LGBMClassifier()
gbm.fit(S_train, y_train, eval_set=[(S_test, y_test)])

[1]	valid_0's multi_logloss: 1.24018
[2]	valid_0's multi_logloss: 1.12659
[3]	valid_0's multi_logloss: 1.03325
[4]	valid_0's multi_logloss: 0.955259
[5]	valid_0's multi_logloss: 0.889316
[6]	valid_0's multi_logloss: 0.833101
[7]	valid_0's multi_logloss: 0.7849
[8]	valid_0's multi_logloss: 0.743402
[9]	valid_0's multi_logloss: 0.707574
[10]	valid_0's multi_logloss: 0.676589
[11]	valid_0's multi_logloss: 0.649767
[12]	valid_0's multi_logloss: 0.626259
[13]	valid_0's multi_logloss: 0.60591
[14]	valid_0's multi_logloss: 0.588312
[15]	valid_0's multi_logloss: 0.573117
[16]	valid_0's multi_logloss: 0.559916
[17]	valid_0's multi_logloss: 0.548567
[18]	valid_0's multi_logloss: 0.538765
[19]	valid_0's multi_logloss: 0.530392
[20]	valid_0's multi_logloss: 0.523268
[21]	valid_0's multi_logloss: 0.51724
[22]	valid_0's multi_logloss: 0.512056
[23]	valid_0's multi_logloss: 0.507702
[24]	valid_0's multi_logloss: 0.504089
[25]	valid_0's multi_logloss: 0.501119
[26]	valid_0's multi_logloss: 0.498708
[2

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [26]:
y_pred = gbm.predict(S_test)
accuracy_score(y_test, y_pred)

0.8615179760319573