# Test features with sklearn

In [15]:
from sklearn.feature_extraction.text import *
from sklearn.pipeline import Pipeline

In [2]:
# Toy corpus: three short French-language titles. Each string is treated as
# one document by the vectorizers fitted in the cells below.
data = [
    "Le Barbecue Disney - La chanson de Frédéric Fromet",
    "Le Roi et l'Oiseau - La Chronique de Christine Gonzalez",
    "L'amour du lac - La chronique d'Hippolyte Girardot",
]

In [3]:
# Baseline bag-of-words model. The defaults (word tokens, unigrams only) are
# spelled out so the contrast with the n-gram variants further down is explicit.
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1))
X = vectorizer.fit_transform(data)
# Learned vocabulary, listed in the same order as the columns of X.
vectorizer.get_feature_names_out()

array(['amour', 'barbecue', 'chanson', 'christine', 'chronique', 'de',
       'disney', 'du', 'et', 'fromet', 'frédéric', 'girardot', 'gonzalez',
       'hippolyte', 'la', 'lac', 'le', 'oiseau', 'roi'], dtype=object)

In [4]:
# Dense view of the unigram count matrix: one row per document in `data`,
# one column per vocabulary term of `vectorizer`.
X.toarray()

array([[0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0]],
      dtype=int64)

In [10]:
# Word-level trigrams only: ngram_range=(3, 3) sets both the minimum and the
# maximum n-gram length to 3, so no unigrams or bigrams are produced.
vectorizer2 = CountVectorizer(
    ngram_range=(3, 3),
    analyzer="word",
)
X2 = vectorizer2.fit_transform(data)
# Learned trigram vocabulary, in column order of X2.
vectorizer2.get_feature_names_out()

array(['amour du lac', 'barbecue disney la', 'chanson de frédéric',
       'chronique de christine', 'chronique hippolyte girardot',
       'de christine gonzalez', 'de frédéric fromet', 'disney la chanson',
       'du lac la', 'et oiseau la', 'la chanson de', 'la chronique de',
       'la chronique hippolyte', 'lac la chronique', 'le barbecue disney',
       'le roi et', 'oiseau la chronique', 'roi et oiseau'], dtype=object)

In [6]:
# Dense view of the word-trigram count matrix: one row per document in `data`,
# one column per feature of `vectorizer2`.
X2.toarray()

array([[0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1],
       [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]],
      dtype=int64)

In [14]:
# Character trigrams restricted to word boundaries ("char_wb"): n-grams are
# taken inside each word and padded with a space at the word edges, which is
# why features such as ' ba' and 'ac ' appear in the vocabulary.
vectorizer3 = CountVectorizer(
    ngram_range=(3, 3),
    analyzer="char_wb",
)
X3 = vectorizer3.fit_transform(data)
# Learned character-trigram vocabulary, in column order of X3.
vectorizer3.get_feature_names_out()

array([' - ', ' ba', ' ch', " d'", ' de', ' di', ' du', ' et', ' fr',
       ' gi', ' go', " l'", ' la', ' le', ' ro', "'am", "'hi", "'oi",
       'ac ', 'ale', 'amo', 'ans', 'arb', 'ard', 'au ', 'bar', 'bec',
       'cha', 'chr', 'cue', "d'h", 'de ', 'dis', 'dot', 'du ', 'dér',
       'eau', 'ecu', 'et ', 'ey ', 'ez ', 'fro', 'fré', 'gir', 'gon',
       'han', 'hip', 'hri', 'hro', 'ic ', 'ine', 'ipp', 'iqu', 'ira',
       'ise', 'isn', 'ist', "l'a", "l'o", 'la ', 'lac', 'le ', 'lez',
       'lyt', 'met', 'mou', 'ne ', 'ney', 'niq', 'nso', 'nza', 'oi ',
       'ois', 'oly', 'ome', 'on ', 'oni', 'onz', 'ot ', 'our', 'pol',
       'ppo', 'que', 'rar', 'rbe', 'rdo', 'ric', 'ris', 'roi', 'rom',
       'ron', 'réd', 'sea', 'sne', 'son', 'sti', 'te ', 'tin', 'ue ',
       'ur ', 'yte', 'zal', 'édé', 'éri'], dtype=object)

In [16]:
# Two-stage pipeline: raw character-trigram counts, then TF-IDF re-weighting.
# NOTE: the 'count' step is the very same `vectorizer3` object fitted above;
# fitting the pipeline refits that object in place on `data`.
pipe = Pipeline(
    steps=[
        ("count", vectorizer3),
        ("tfid", TfidfTransformer()),
    ]
)
# fit() returns the pipeline itself, so re-binding keeps `pipe` unchanged
# while avoiding a displayed cell output.
pipe = pipe.fit(data)

In [17]:
# Raw character-trigram counts produced by the pipeline's first step alone,
# i.e. before any TF-IDF weighting is applied.
pipe.named_steps['count'].transform(data).toarray()

array([[1, 1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1],
       [1, 0, 2, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 2, 0, 0, 1, 1, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0

In [21]:
# Inverse-document-frequency weight learned for each feature by the TF-IDF
# step. With the default smoothing, idf = ln((1 + n) / (1 + df)) + 1, so for
# n = 3 documents a trigram found in all three gets 1.0, in two ~1.288, and
# in one ~1.693.
pipe.named_steps['tfid'].idf_

array([1.        , 1.69314718, 1.        , 1.69314718, 1.28768207,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.28768207, 1.        , 1.28768207, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.28768207, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.69314718, 1.28768207, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.        ,
       1.69314718, 1.28768207, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314

In [19]:
# Shape of the pipeline output for `data`: (number of documents, number of
# character-trigram features).
pipe.transform(data).shape

(3, 104)