In [2]:
import numpy as np
from nltk.stem import SnowballStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# nltk.download('stopwords')
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier

In [15]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits."""

    # One English Snowball stemmer shared by all instances of the class.
    stemmer = SnowballStemmer("english", ignore_stopwords=True)

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer, then stems each token.

        The callable takes a single document and returns a list of stemmed
        tokens.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [self.stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [16]:
def train_test_pipeline(pipe, pipe_parameters):
    """Grid-search ``pipe`` on the 20 newsgroups training split and report the winner.

    Args:
        pipe: estimator/pipeline handed to ``GridSearchCV``.
        pipe_parameters: parameter grid handed to ``GridSearchCV``.

    Prints the best cross-validated score and the matching parameters,
    followed by an empty line. Nothing is returned, and the test split is
    not touched here.
    """
    newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)

    # 5-fold cross-validation, using every available core.
    search = GridSearchCV(pipe, pipe_parameters, cv=5, n_jobs=-1)
    search.fit(newsgroups_train.data, newsgroups_train.target)

    print("Best score: %s" % (search.best_score_,))
    print("Best param: %s" % (search.best_params_,))
    print()


In [19]:
# Baseline pipeline: raw token counts -> TF-IDF weighting -> random forest.
# The forest is seeded: with the default random_state=None every fit grows a
# different forest, so the grid-search scores could not be reproduced.
pipe_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('rf', RandomForestClassifier(random_state=42))
])

In [20]:
# Grid for pipe_rf: unigrams+bigrams versus bigrams only.
# A tree-depth search was tried and is currently disabled:
#   'rf__max_depth': (None, 20, 40)
parameters = dict(vect__ngram_range=[(1, 2), (2, 2)])

In [21]:
# Cross-validated search over the n-gram range for the baseline forest.
train_test_pipeline(pipe_rf, parameters)

Best score: 0.6407123531194447
Best param: {'vect__ngram_range': (1, 2)}



In [17]:
# Random-forest pipeline with the n-gram range fixed to unigrams + bigrams.
# The forest is seeded so fits and the accuracy computed from them are
# reproducible (the default random_state=None is not). max_depth is left at
# its default of None, i.e. trees are grown without a depth limit.
pipe_rf_2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('rf', RandomForestClassifier(random_state=42))
])

In [12]:
# An empty grid means a single candidate, so this is plain 5-fold
# cross-validation of pipe_rf_2 as configured.
train_test_pipeline(pipe_rf_2, {})

KeyboardInterrupt: 

In [23]:
# Fit the bigram random-forest pipeline on the whole training split and score
# it on the held-out test split.
train = fetch_20newsgroups(subset='train', shuffle=True)
test = fetch_20newsgroups(subset='test', shuffle=True)
pipe_rf_2.fit(train.data, train.target)
# The variable name is kept for compatibility with any later cells, but
# pipe_rf_2 is built on a plain CountVectorizer: no stemming happens here.
predicted_stemmed = pipe_rf_2.predict(test.data)

# The label previously claimed "after stemming", which did not match the pipeline.
print('Test accuracy (random forest, unigrams + bigrams, no stemming): %s'
      % np.mean(predicted_stemmed == test.target))

Accuracy after stemming: 0.7612851832182688


# More flexible approach

In [6]:
from sklearn.datasets import fetch_20newsgroups

# Training split of the corpus (the loader's default subset, spelled out here).
# NOTE(review): `groups` is not referenced by the other cells visible in this
# notebook - confirm whether it is still needed.
groups = fetch_20newsgroups(subset='train')

In [7]:
# Load both splits with the same fixed shuffle seed and keep the label arrays
# under their own names.
data_train, data_test = (
    fetch_20newsgroups(subset=split, random_state=21) for split in ('train', 'test')
)
train_label, test_label = data_train.target, data_test.target

# Sanity check: number of training documents, test documents and test labels.
len(data_train.data), len(data_test.data), len(test_label)

(11314, 7532, 7532)

In [8]:
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names

all_names = names.words()
WNL = WordNetLemmatizer()


def clean(data):
    """Normalise raw documents into space-joined, lemmatized lowercase tokens.

    Each document is split on whitespace. A token is kept only if it is purely
    alphabetic and is not listed in the NLTK names corpus; the name check is
    case-sensitive and runs on the token as written, before lowercasing. Kept
    tokens are lowercased and lemmatized with ``WNL``.

    Args:
        data: iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
        A document with no surviving tokens yields an empty string.
    """
    # A set gives constant-time membership tests; scanning the names list for
    # every token of every document dominated the runtime.
    known_names = set(all_names)

    cleaned_docs = []
    for doc in data:
        kept = [
            WNL.lemmatize(token.lower())
            for token in doc.split()
            if token.isalpha() and token not in known_names
        ]
        cleaned_docs.append(' '.join(kept))
    return cleaned_docs

In [9]:
# Cleaned text for each split, in the same document order as the raw data.
x_train, x_test = (clean(split.data) for split in (data_train, data_test))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stand-alone TF-IDF view of the cleaned text with a vocabulary capped at
# 1000 terms. The grid-search cells in this notebook pass the cleaned text
# (x_train / x_test) to pipelines that contain their own vectorizer, so they
# do not consume these matrices.
tf = TfidfVectorizer(max_features=1000, stop_words='english')
X_train = tf.fit_transform(x_train)
X_test = tf.transform(x_test)  # reuse what was fitted on the training text

# Show (documents, features) for both splits.
tuple(matrix.shape for matrix in (X_train, X_test))

((11314, 1000), (7532, 1000))

In [11]:
def search_train_accuracy(pipeline, parameters):
    """Grid-search ``pipeline`` on the cleaned training text and score it on the test text.

    Reads the notebook-level ``x_train`` / ``train_label`` for fitting and
    ``x_test`` / ``test_label`` for the final score.

    Args:
        pipeline: estimator/pipeline handed to ``GridSearchCV``.
        parameters: parameter grid handed to ``GridSearchCV``.

    Prints the best parameters, the best cross-validated score, and the score
    of the best estimator on the test data. Nothing is returned.
    """
    # 3-fold cross-validation on all cores, with progress output.
    searcher = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)
    searcher.fit(x_train, train_label)

    print(searcher.best_params_)
    print(searcher.best_score_)

    test_accuracy = searcher.best_estimator_.score(x_test, test_label)
    print(f'Accuracy: {test_accuracy}')

# LINEAR SVC

In [117]:
# TF-IDF features (English stop words removed) feeding a linear SVM.
pipeline_linsvc = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('linsvc', LinearSVC()),
])

# Only the vocabulary size is searched; every other entry is pinned to a
# single value.
parameters_linsvc = {
    'tf_id__max_features': [10000, 15000],
    'tf_id__max_df': [0.3],
    'tf_id__ngram_range': [(1, 2)],
    'tf_id__smooth_idf': [False],
    'tf_id__sublinear_tf': [True],
}

In [119]:
# Search the LinearSVC grid and report cross-validated and test accuracy.
search_train_accuracy(pipeline_linsvc, parameters_linsvc)

{'tf_id__max_df': 0.3, 'tf_id__max_features': 10000, 'tf_id__ngram_range': (1, 2), 'tf_id__smooth_idf': False, 'tf_id__sublinear_tf': True}
0.8710451353415336
Accuracy: 0.7830589484864577


# SVC (linear, poly and sigmoid kernels)

In [123]:
# TF-IDF features feeding a kernel SVM; the kernel itself is part of the search.
pipeline_svc = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('svc', SVC()),
])

# Two vocabulary sizes x three kernels; the remaining TF-IDF options are fixed.
parameter_svc = {
    'tf_id__max_features': [10000, 15000],
    'tf_id__max_df': [0.3],
    'tf_id__ngram_range': [(1, 2)],
    'tf_id__smooth_idf': [False],
    'tf_id__sublinear_tf': [True],
    'svc__kernel': ['linear', 'poly', 'sigmoid'],
}

search_train_accuracy(pipeline_svc, parameter_svc)

{'svc__kernel': 'linear', 'tf_id__max_df': 0.3, 'tf_id__max_features': 15000, 'tf_id__ngram_range': (1, 2), 'tf_id__smooth_idf': False, 'tf_id__sublinear_tf': True}
0.871487385967907
Accuracy: 0.7826606479022836


# Random Forest Classifier

In [120]:
# TF-IDF features feeding a random forest. The forest is seeded: with the
# default random_state=None each fit grows a different forest, so the
# cross-validated and test scores printed by this cell could not be reproduced.
pipeline_rfc = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('rfc', RandomForestClassifier(random_state=42)),
])

# Every entry holds a single value, so this is one configuration evaluated
# with cross-validation rather than a real search.
parameter_rfc = {
    'tf_id__max_features': [10000],
    'tf_id__max_df': [0.3],
    'tf_id__ngram_range': [(1, 2)],
    'tf_id__smooth_idf': [False],
    'tf_id__sublinear_tf': [True],
    'rfc__max_depth': [None],
}

search_train_accuracy(pipeline=pipeline_rfc, parameters=parameter_rfc)

{'rfc__max_depth': None, 'tf_id__max_df': 0.3, 'tf_id__max_features': 10000, 'tf_id__ngram_range': (1, 2), 'tf_id__smooth_idf': False, 'tf_id__sublinear_tf': True}
0.772582762405397
Accuracy: 0.6776420605416887


# XGBoost Classifier

In [None]:
# TF-IDF features feeding a gradient-boosted tree classifier.
pipeline_xgb = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('xgb', XGBClassifier()),
])

# TF-IDF options are fixed; the booster grid has 3 * 3 * 3 * 2 = 54 candidates,
# each fitted once per cross-validation fold by search_train_accuracy.
parameter_xgb = {
    'tf_id__max_features': [10000],
    'tf_id__max_df': [0.3],
    'tf_id__ngram_range': [(1, 2)],
    'tf_id__smooth_idf': [False],
    'tf_id__sublinear_tf': [True],
    'xgb__max_depth': [3, 6, 10],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__n_estimators': [100, 500, 1000],
    'xgb__colsample_bytree': [0.3, 0.7],
}

search_train_accuracy(pipeline_xgb, parameter_xgb)