In [1]:
import numpy as np
from sklearn.datasets import load_files

In [2]:
# The training data folder must be passed as first argument
try:
  dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
  print(ex)
  print("Couldn't import the data, did you unzip the wikidata.zip folder?")
  exit(-1)

In [4]:
docs = dataset.data
labels = dataset.target

In [6]:
docs[:5]

[b'par les diff\xc3\xa9rentes communaut\xc3\xa9s de langue,',
 b"l'episodio. I suoi redattori hanno",
 b'especie de legitimidad democr\xc3\xa1tica. Por',
 b'Britannica; sin embargo, ninguna ha',
 b'el mundo,[4] y pr\xc3\xa1cticamente cualquier']

In [7]:
labels[:5]

array([4, 5, 3, 3, 3])

In [8]:
# TASK: Split the dataset in training and test set
# Use 20% of the data for test

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
docs_train, docs_test, y_train, y_test = train_test_split(docs, labels, test_size=0.2, random_state=42)

In [11]:
# TASK: Build a an vectorizer that splits
# strings into sequence of 1 to 3
# characters instead of word tokens
# using the class TfidfVectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vec = TfidfVectorizer(ngram_range=(1,3), analyzer="char")

In [19]:
# TASK: Use the function make_pipeline to build a
#       vectorizer / classifier pipeline
#       using the previous analyzer
#       and a classifier of choice.
#       The pipeline instance should be
#       stored in a variable named model

In [22]:
# WITHOUT A PIPELINE IT WILL BE
vec.fit(docs_train)
vec.transform(docs_test[:5])

<5x55115 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [26]:
# WITH A PIPELINE WE WILL CREATE A PIPELINE (A BLOCK) WITH [VECTORIZER + MODEL ESTIMATOR] WHICH RECEIVE PHRASES AND RETURN OUTPUTS
from sklearn.pipeline import make_pipeline
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

model = make_pipeline(vec, clf)

In [27]:
# TASK: Fit the pipeline on the training set
model.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_i...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [28]:
# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted

y_predicted = model.predict(docs_test)

In [38]:
# TASK: Print the classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          0       1.00      0.60      0.75        62
          1       0.93      0.91      0.92       193
          2       0.93      0.91      0.92       221
          3       0.79      0.80      0.80       213
          4       0.91      0.83      0.87       223
          5       0.83      0.81      0.82       198
          6       0.91      0.55      0.69        78
          7       0.90      0.66      0.76       112
          8       0.81      0.86      0.83       207
          9       0.62      0.98      0.76       185

avg / total       0.85      0.83      0.83      1692



In [34]:
# TASK: Print the confusion matrix. Bonus points if you make it pretty.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_predicted)

In [35]:
import pandas as pd

In [40]:
print(pd.DataFrame(cm, index=dataset.target_names, columns=["p_" + c for c in dataset.target_names]))

    p_ar  p_de  p_en  p_es  p_fr  p_it  p_nl  p_pl  p_pt  p_ru
ar    37     1     0     0     0     0     2     0     0    22
de     0   176     0     2     3     5     0     0     1     6
en     0     3   202     0     2     6     2     2     0     4
es     0     1     0   170     1     8     0     0    21    12
fr     0     0     1    18   184     4     0     0     9     7
it     0     3     1     6     8   161     0     0     4    15
nl     0     4     6    12     0     7    43     1     2     3
pl     0     1     2     0     1     1     0    74     3    30
pt     0     0     3     6     3     1     0     5   177    12
ru     0     0     3     0     0     0     0     0     1   181


In [41]:
def analyze (vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3)), classifier = RandomForestClassifier()) :
    """Build a language detector model

    The goal of this exercise is to train a linear classifier on text features
    that represent sequences of up to N consecutive characters so as to be
    recognize natural languages by using the frequencies of short character
    sequences as 'fingerprints'.

    The script saves the trained model to disk for later use
    """
    # TASK: Split the dataset in training and test set
    # (use 20% of the data for test):
    docs_train, docs_test, y_train, y_test = \
        train_test_split(docs, labels, test_size=0.2, random_state=42)


    # TASK: Build a an vectorizer that splits
    # strings into sequence of 1 to 3
    # characters instead of word tokens
    # using the class TfidfVectorizer
    vec = vectorizer


    # TASK: Use the function make_pipeline to build a
    #       vectorizer / classifier pipeline
    #       using the previous analyzer
    #       and a classifier of choice.
    #       The pipeline instance should be
    #       stored in a variable named model
    clf = classifier
    model = make_pipeline(vec, clf)

    # TASK: Fit the pipeline on the training set
    model.fit(docs_train, y_train)

    # TASK: Predict the outcome on the testing set.
    # Store the result in a variable named y_predicted
    y_predicted = model.predict(docs_test)

    # TASK: Print the classification report
    print(classification_report(y_test, y_predicted))

    # TASK: Print the confusion matrix. Bonus points if you make it pretty.
    cm = confusion_matrix(y_test, y_predicted)

    idx = dataset.target_names
    cols = ['p_'+c for c in dataset.target_names]
    print(pd.DataFrame(cm, index=idx, columns=cols))


In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [48]:
vectorizers = [
#     TfidfVectorizer(analyzer='char', ngram_range=(1, 3)),
#     TfidfVectorizer(analyzer='char', ngram_range=(2, 3)),
    TfidfVectorizer(analyzer='char', ngram_range=(2, 4)),
#     TfidfVectorizer(analyzer='char', ngram_range=(1, 4))
]

models = [
#     RandomForestClassifier(max_depth=20),
#     DecisionTreeClassifier(max_depth=20),
#     MLPClassifier(),
    LogisticRegression(),
    LogisticRegression(C=10)
]

for vectorizer in vectorizers:
    for model in models:
        analyze(vectorizer=vectorizer, classifier=model)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        62
          1       0.97      0.98      0.98       193
          2       0.93      0.99      0.96       221
          3       0.94      0.94      0.94       213
          4       0.97      0.99      0.98       223
          5       0.97      0.97      0.97       198
          6       1.00      0.83      0.91        78
          7       1.00      0.97      0.99       112
          8       0.95      0.94      0.95       207
          9       1.00      0.98      0.99       185

avg / total       0.97      0.97      0.97      1692

    p_ar  p_de  p_en  p_es  p_fr  p_it  p_nl  p_pl  p_pt  p_ru
ar    62     0     0     0     0     0     0     0     0     0
de     0   190     2     1     0     0     0     0     0     0
en     0     1   219     0     1     0     0     0     0     0
es     0     0     0   201     0     4     0     0     8     0
fr     0     0     1     1   221     0     0  