# Variables to change

In [1]:
import pandas as pd
from pathlib import Path
current_epsilon = 1.0
data_path = Path(f"data/train_{current_epsilon}.dat")

# Creating train and test targets

In [2]:
import sklearn.datasets as skd
import pandas as pd

In [3]:
a = skd.fetch_20newsgroups(data_home='.', remove=('headers'), random_state=0)
test = skd.fetch_20newsgroups(subset='test', data_home='.', remove=('headers'), random_state=0)

In [4]:
import os
train_names = [os.path.basename(f) for f in a.filenames]
test_names  = [os.path.basename(f) for f in test.filenames]

# Processing output from _Parma_ to be able to train

In [5]:
def build_vocabulary_map():
    with open('vocabulary.txt', 'r') as vf:
        m = {v: i for i,v in enumerate(vf.read().splitlines())}
    return m

def extract_docId(key):
    split = key.split('_')
    doc, word = split[0], split[1]
    return doc

def extract_wordId(key, vocab):
    split = key.split('_')
    doc, word = split[0], split[1]
    return vocab.get(word, -1)

In [6]:
df = pd.read_csv(data_path, header=0, names=['key','count'], sep='\t', index_col=False)
vocab = build_vocabulary_map()
df['docId'] = df['key'].apply(lambda r: extract_docId(r)).astype(str)
df['wordId'] = df['key'].apply(lambda r: extract_wordId(r, vocab))
print(df.head())

              key     count  docId  wordId
0     10000_chong  3.016004  10000   19438
1  10000_frampton  2.229746  10000   19142
2        10000_in  2.822721  10000      29
3       10000_not -1.216612  10000     721
4     10000_steve  0.733624  10000    3645


In [7]:
df.nunique()

key       1027120
count     1025774
docId        9818
wordId      41667
dtype: int64

In [8]:
df[['docId','wordId','count']].to_csv('processed.csv', index=False, header=False)
df[['docId','wordId','count']].to_csv('processed_no_header.tsv', sep='\t', index=False, header=False)

In [9]:
df.describe()

Unnamed: 0,count,wordId
count,1027120.0,1027120.0
mean,1.871309,5414.886
std,3.852283,9839.369
min,-6.599269,0.0
25%,0.5466165,455.0
50%,1.309618,1234.0
75%,2.419032,5148.0
max,106.1066,54901.0


# Creating training data

In [10]:
import pandas as pd
from scipy.sparse import lil_matrix
import numpy as np

In [11]:
train_df = df[['docId','wordId','count']]

In [12]:
unique_docids = set(train_df['docId'].values.tolist())

In [13]:
docid_to_ix = {docid: i for i, docid in enumerate(unique_docids)}

In [14]:
def create_csr_matrix_from_output(dataframe, vocab, docid_to_ix_mapping):
    M = lil_matrix( (len(docid_to_ix_mapping), len(vocab)), dtype=np.float)
    for row in dataframe.itertuples():
        ix = docid_to_ix_mapping[row.docId]
        M[ix, row.wordId] = row.count
    return M

In [15]:
csr = create_csr_matrix_from_output(train_df, vocab, docid_to_ix)

In [16]:
import os
target = np.full((csr.shape[0],), fill_value=-1, dtype=np.int)
train_names = [os.path.basename(f) for f in a.filenames]
docid_to_target = dict(zip(train_names, a.target))

target = np.full((csr.shape[0],), fill_value=-1, dtype=np.int)
for docId, ix_in_csr in docid_to_ix.items():
    catId = docid_to_target.get(docId)
    if catId is not None:
        target[ix_in_csr] = catId

In [17]:
assert csr.shape[0] == len(target) and target.min() >= 0

In [18]:
target

array([ 9, 13, 19, ..., 15,  8, 19])

# Training loop
Taken from https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

In [19]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
# categories = [
#     'alt.atheism',
#     'talk.religion.misc',
# ]
# Uncomment the following to do the analysis on all the categories
categories = None

# print("Loading 20 newsgroups dataset for categories:")
# print(categories)

In [20]:
# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [21]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
#     'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

In [22]:
# multiprocessing requires the fork to happen in a __main__ protected
# block

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(csr, target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   52.4s finished


done in 57.884s

Best score: 0.714
Best parameters set:
	clf__alpha: 1e-06
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l1'
	tfidf__use_idf: True




In [23]:
# Save the model for this epsilon
import pickle
with open('best_clf_sgd_{}'.format(current_epsilon), 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

# Assessment
## Creating data that compatible with the pipeline

In [24]:
def clean_text(s):
    r = s.replace('\n', ' ').lower().replace(',',' ').replace('  ', ' ')
    if r == '':
        r = ' '
    return r

In [25]:
import os
# We need to recreate a CSR matrix from the test data
def create_test_data(sklearn_data, vocab):
    
    test_names  = set(os.path.basename(f) for f in sklearn_data.filenames)
    docid_to_ix_mapping = {docid: i for i, docid in enumerate(test_names)}
    preprocessed_strings = dict()
    
    # Preprocess the target's text as if it would come from Parma
    for docPath, d in zip(sklearn_data.filenames, sklearn_data.data):
        docId = os.path.basename(docPath)
        clean_content = clean_text(d)
        if docId not in preprocessed_strings:
            preprocessed_strings[docId] = clean_content
    
    # Create test word count mapping
    M = lil_matrix( (len(docid_to_ix_mapping), len(vocab)), dtype=np.float)
    for docId, csr_ix in docid_to_ix_mapping.items():
        associated_text = preprocessed_strings.get(docId)
        if associated_text is None:
            raise ValueError("Should not happen")
            
        for token in associated_text.split(' '):
            token_ix = vocab.get(token)
            if token_ix is not None:
                M[csr_ix, token_ix] += 1

    # Create test target data
    test_target = np.full((M.shape[0],), fill_value=-1, dtype=np.int)
#     assert len(test_target) == len(sklearn_data.target) and len(test_target) == M.shape[0]
    
    test_names = [os.path.basename(f) for f in sklearn_data.filenames]
    docid_to_target = dict(zip(test_names, sklearn_data.target))

    for docId, csr_ix in docid_to_ix_mapping.items():
        catId = docid_to_target.get(docId)
        if catId is not None:
            test_target[csr_ix] = catId
        else:
            print(docId, csr_ix)
            print
            raise ValueError("Should not happen")
            
    return M, test_target

In [26]:
test_csr, test_target = create_test_data(test, vocab)

In [27]:
test_target.shape

(6871,)

## Metrics

In [28]:
best_clf = grid_search.best_estimator_

In [29]:
from sklearn.metrics import accuracy_score

pred = best_clf.predict(test_csr)

In [30]:
accuracy_score(y_true=test_target, y_pred=pred)

0.647358463105807

In [31]:
best_clf.predict_log_proba(test_csr[:10])

AttributeError: probability estimates are not available for loss='hinge'