## Indexing with sklearn


## Loading a dataset

In [None]:
import urllib.request
import os

def download_file(url,local_file, force=False):
    """
    Helper function to download a file and store it locally.

    The content is fetched and decoded completely before anything is written,
    and it is written to a temporary sibling file that is renamed onto
    ``local_file`` only on success. A failed download therefore never leaves
    an empty or truncated ``local_file`` behind that a later call would
    report as already downloaded.

    Parameters
    ----------
    url : str
        Address of the resource; its content must be UTF-8 encoded text.
    local_file : str or path-like
        Path the content is stored at.
    force : bool, default False
        Download even if ``local_file`` already exists.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be retrieved.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    if not os.path.exists(local_file) or force:
        print('Downloading',url,'to',local_file)
        with urllib.request.urlopen(url) as opener:
            text = opener.read().decode('utf-8')
        target = os.fspath(local_file)
        partial_file = target + (b'.part' if isinstance(target, bytes) else '.part')
        try:
            # newline='' stores line endings exactly as they were received
            with open(partial_file, mode='w', encoding='utf-8', newline='') as outfile:
                outfile.write(text)
            os.replace(partial_file, target)
        finally:
            # the temporary file is still around only if writing or renaming failed
            if os.path.exists(partial_file):
                os.remove(partial_file)
    else:
        print(local_file,'already downloaded')

In [None]:
# remote location of the two splits
train_url = 'http://www.esuli.it/demo/data/news_en_train.csv'
test_url = 'http://www.esuli.it/demo/data/news_en_test.csv'

# local copies, read back later with the field delimiter below
train_file = 'news_en_train.txt'
test_file = 'news_en_test.txt'
delimiter = ','

# fetch each split (skipped when the local copy already exists)
download_file(url=train_url, local_file=train_file)
download_file(url=test_url, local_file=test_file)

In [None]:
import csv
def load_labeled_csv(path, delimiter=','):
    """
    Read a delimited text file and return its first two columns.

    Parameters
    ----------
    path : str
        File to read; it is decoded as UTF-8.
    delimiter : str, default ','
        Field separator handed to ``csv.reader``.

    Returns
    -------
    (list, list)
        The values of the first column and of the second column, in file
        order. In this notebook the first column is used as the document
        text and the second one as its label.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two fields.
    """
    first_column = list()
    second_column = list()
    # newline='' lets the csv module handle line endings inside quoted fields
    with open(path, encoding='utf-8', newline='') as infile:
        for row in csv.reader(infile, delimiter=delimiter):
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            first_column.append(row[0])
            second_column.append(row[1])
    return first_column, second_column


x_train, y_train = load_labeled_csv(train_file, delimiter)
x_test, y_test = load_labeled_csv(test_file, delimiter)


In [None]:
# sizes of the four lists: texts and labels must match within each split
tuple(map(len, (x_train, y_train, x_test, y_test)))

In [None]:
# distinct values found in the second column (the labels) of the training data
set(y_train)

In [None]:
# position of one training example to inspect; its label is reused further
# down as the reference label of the binary task
sample_idx = 10
x_train[sample_idx]

In [None]:
# label paired with the example at sample_idx
y_train[sample_idx]

# Binary classification

This is a multi-class single-label dataset.
We start with a simpler binary classification problem, e.g., economy vs not economy.

To pick one, we use the label of the example shown in the cell above as the reference (positive) label.

In [None]:
import numpy as np

# Turn the multi-class labels into binary ones. Comparing a numpy array with
# a single value is done element by element, so each comparison below yields
# a True/False vector: True where the label equals the reference label
# (the label of the training example at sample_idx), False elsewhere.
y_train_bin, y_test_bin = (
    np.asarray(split_labels) == y_train[sample_idx]
    for split_labels in (y_train, y_test)
)
y_train_bin,y_test_bin

## Building the pipeline by hand

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

## Tokenization

Try re-running the following two cells after removing the `min_df` parameter, and compare the resulting vocabulary size.

In [None]:
# Per the scikit-learn docs, an integer min_df makes the vectorizer ignore
# terms that appear in fewer than that many documents.
vect = CountVectorizer(min_df=5)  # tokenization and frequency count

# The vocabulary is learned from the training texts only; the prints mark
# the start of each step so that their duration can be observed.
print('fit')
vect.fit(x_train)
print('transform')
X_train_tok = vect.transform(x_train)
print('done')

# the two steps above can be condensed in a single step that processes train
# data only once.

# print('fit_transform')
# X_train_tok = vect.fit_transform(x_train)
# print('done')

# the test texts are only transformed, reusing the vocabulary fitted above
X_test_tok =vect.transform(x_test)

In [None]:
# number of entries in the fitted vocabulary
len(vect.vocabulary_)

In [None]:
# the fitted vocabulary (per the scikit-learn docs, a mapping from each term
# to its feature index); note that this displays the whole mapping
vect.vocabulary_

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The old name is kept as a
# fallback for releases that predate the new method.
vect.get_feature_names_out() if hasattr(vect, 'get_feature_names_out') else vect.get_feature_names()

In [None]:
# first row of the transformed training data, i.e. the first training text;
# as a bare expression only its summary representation is shown
X_train_tok[0,:]

In [None]:
# printing the same row shows its content instead of the summary
print(X_train_tok[0,:])

Some scikit-learn modules implement an `inverse_transform` method to reconstruct the input from their output.
Let's print out the feature names for a document. Note that the frequency information is lost: only the names of the features occurring in the document are returned.

In [None]:
# names of the features that occur in the first training text
vect.inverse_transform(X_train_tok[0,:])

Let's attach frequency data to features

In [None]:
# Pair every feature name of the first training text with its stored count.
# The two sequences are walked in parallel, which presumes that both list
# the entries of the row in the same order.
for feat, freq in zip(
    vect.inverse_transform(X_train_tok[0, :])[0],
    X_train_tok[0, :].data,
):
    print(feat, freq)

## Feature selection

This is the first step of the pipeline that uses the labels, because feature selection is a supervised method.

In [None]:
# Feature selection: keep the k=5000 best features according to the chi2
# score computed against the binary labels. The selector is fitted on the
# training data only and then applied unchanged to both splits.
bin_sel = SelectKBest(chi2, k=5000)
X_train_sel_bin = bin_sel.fit(X_train_tok, y_train_bin).transform(X_train_tok)
X_test_sel_bin = bin_sel.transform(X_test_tok)

In [None]:
# mask over the input features telling which ones the selector kept
bin_sel.get_support()

In [None]:
# the training data after feature selection (summary representation)
X_train_sel_bin

In [None]:
# first training text after feature selection (summary representation)
X_train_sel_bin[0,:]

In [None]:
# content of the first training text after feature selection
print(X_train_sel_bin[0,:])

The feature selection module has an `inverse_transform` method, so that we can map the selected features back to the original, larger feature space.

In [None]:
# the first training text mapped back to the feature space the selector was fitted on
bin_sel.inverse_transform(X_train_sel_bin[0,:])

In [None]:
# chaining both inverse transforms gives the names of the selected features
# that occur in the first training text
print(vect.inverse_transform(bin_sel.inverse_transform(X_train_sel_bin[0,:])))

## Weighting

In [None]:
# Weighting: the transformer is fitted on the selected training features
# only, then the same fitted transformer converts both splits.
tfidf = TfidfTransformer()
X_train_vec_bin = tfidf.fit(X_train_sel_bin).transform(X_train_sel_bin)
X_test_vec_bin = tfidf.transform(X_test_sel_bin)

In [None]:
# content of the first training text after weighting
print(X_train_vec_bin[0,:])

In [None]:
# For the first training text, show every selected feature next to its
# weight (after weighting) and its count (before weighting). The three
# sequences are walked in parallel, which presumes that they all list the
# entries of the row in the same order.
for feat, weight, freq in zip(
    vect.inverse_transform(bin_sel.inverse_transform(X_train_vec_bin[0, :]))[0],
    X_train_vec_bin[0, :].data,
    X_train_sel_bin[0, :].data,
):
    print(feat, weight, freq)