In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import plot_confusion_matrix

### USING BOOSTED kNN AND LogReg ON REGULAR TOKENIZED DATASETS

In [2]:
token_self = pd.read_csv('../data/tokenself.csv')
token_title = pd.read_csv('../data/tokentitle.csv')
token_total = pd.read_csv('../data/tokentotal.csv')

token_title.drop('Unnamed: 0', axis = 1, inplace= True)
token_self.drop('Unnamed: 0', axis = 1, inplace = True)
token_total.drop('Unnamed: 0', axis = 1, inplace = True)

In [2]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
GradientBoostingClassifier()

knn_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

logreg_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

vote = VotingClassifier([
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
    ('gb', GradientBoostingClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('knn_pipe', knn_pipe)
])

vote2 = VotingClassifier([
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
    ('gb', GradientBoostingClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('logreg_pipe', logreg_pipe)
])

params = {
    'gb__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [2],
    'knn_pipe__knn__n_neighbors': [3, 4]
}

params2 = {
    'gb__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [2]   
}
gs = GridSearchCV(vote, param_grid=params)
gs2 = GridSearchCV(vote2, param_grid=params2)

In [4]:
X1 = token_title.drop('lib_or_neolib', axis = 1)
y1 = token_title['lib_or_neolib']
X2 = token_self.drop('lib_or_neolib', axis=1)
y2 = token_self['lib_or_neolib']
X3 = token_total.drop('lib_or_neolib', axis=1)
y3 = token_total['lib_or_neolib']

In [15]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state = 42)
gs.fit(X1_train, y1_train)
gs2.fit(X1_train, y1_train)
gs.best_score_, gs2.best_score_

(0.6203833808444255, 0.6264191600637472)

In [16]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 50,
 'knn_pipe__knn__n_neighbors': 3}

In [17]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 100}

In [18]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 42)
gs.fit(X2_train, y2_train)
gs2.fit(X2_train, y2_train)
gs.best_score_, gs2.best_score_

(0.6877237851662404, 0.6849104859335039)

In [19]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 50,
 'knn_pipe__knn__n_neighbors': 3}

In [20]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 100}

In [21]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state = 42)
gs.fit(X3_train, y3_train)
gs2.fit(X3_train, y3_train)
gs.best_score_, gs2.best_score_

(0.6539359385872372, 0.6619806513882965)

In [22]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 100,
 'knn_pipe__knn__n_neighbors': 4}

In [23]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 50}

### USING BOOSTED kNN AND LogReg ON IDEAL TOKENIZED DATASETS

In [3]:
token_total = pd.read_csv('../data/tokentotalideal.csv')
token_title = pd.read_csv('../data/tokentitleideal.csv')
token_self = pd.read_csv('../data/tokenselfideal.csv')

token_title.drop('Unnamed: 0', axis = 1, inplace= True)
token_self.drop('Unnamed: 0', axis = 1, inplace = True)
token_total.drop('Unnamed: 0', axis = 1, inplace = True)

In [4]:
AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
GradientBoostingClassifier()

knn_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

logreg_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

vote = VotingClassifier([
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
    ('gb', GradientBoostingClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('knn_pipe', knn_pipe)
])

vote2 = VotingClassifier([
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
    ('gb', GradientBoostingClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('logreg_pipe', logreg_pipe)
])

params = {
    'gb__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [2],
    'knn_pipe__knn__n_neighbors': [3, 4]
}

params2 = {
    'gb__n_estimators': [50, 100],
    'ada__base_estimator__max_depth': [2]   
}
gs = GridSearchCV(vote, param_grid=params)
gs2 = GridSearchCV(vote2, param_grid=params2)

In [5]:
X1 = token_title.drop('lib_or_neolib', axis = 1)
y1 = token_title['lib_or_neolib']
X2 = token_self.drop('lib_or_neolib', axis=1)
y2 = token_self['lib_or_neolib']
X3 = token_total.drop('lib_or_neolib', axis=1)
y3 = token_total['lib_or_neolib']

In [6]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state = 42)
gs.fit(X1_train, y1_train)
gs2.fit(X1_train, y1_train)
gs.best_score_, gs2.best_score_

(0.6237570424906287, 0.6277636865614689)

In [7]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 50,
 'knn_pipe__knn__n_neighbors': 3}

In [8]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 100}

In [9]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 42)
gs.fit(X2_train, y2_train)
gs2.fit(X2_train, y2_train)
gs.best_score_, gs2.best_score_

(0.638235294117647, 0.6528132992327367)

In [10]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 100,
 'knn_pipe__knn__n_neighbors': 3}

In [11]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 100}

In [12]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state = 42)
gs.fit(X3_train, y3_train)
gs2.fit(X3_train, y3_train)
gs.best_score_, gs2.best_score_

(0.6163565352068416, 0.627079077910709)

In [13]:
gs.best_params_

{'ada__base_estimator__max_depth': 2,
 'gb__n_estimators': 100,
 'knn_pipe__knn__n_neighbors': 4}

In [14]:
gs2.best_params_

{'ada__base_estimator__max_depth': 2, 'gb__n_estimators': 100}