In [21]:
import pandas as pd
import os
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
file_path = os.path.join(os.getcwd(), 'tutorial', 'quotes.json')

In [3]:
quotes = pd.read_json(file_path)

In [4]:
quotes.head()

Unnamed: 0,text,author,tags
0,Be yourself; everyone else is already taken.,Oscar Wilde,"[attributed-no-source, be-yourself, gilbert-pe..."
1,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"[attributed-no-source, best, life, love, misat..."
2,"So many books, so little time.",Frank Zappa,"[books, humor]"
3,Two things are infinite: the universe and huma...,Albert Einstein,"[attributed-no-source, human-nature, humor, in..."
4,A room without books is like a body without a ...,Marcus Tullius Cicero,"[attributed-no-source, books, simile, soul]"


In [5]:
quotes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3000 non-null   object
 1   author  3000 non-null   object
 2   tags    3000 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


Prepare the features and labels

In [6]:
text = quotes['text']
text.head()

0         Be yourself; everyone else is already taken.
1    I'm selfish, impatient and a little insecure. ...
2                       So many books, so little time.
3    Two things are infinite: the universe and huma...
4    A room without books is like a body without a ...
Name: text, dtype: object

In [7]:
tags = quotes['tags']
tags.head()

0    [attributed-no-source, be-yourself, gilbert-pe...
1    [attributed-no-source, best, life, love, misat...
2                                       [books, humor]
3    [attributed-no-source, human-nature, humor, in...
4          [attributed-no-source, books, simile, soul]
Name: tags, dtype: object

In [8]:
mlb = MultiLabelBinarizer()

In [9]:
tags_binarized = mlb.fit_transform(tags)
tags_binarized

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(3000, 2061))

In [10]:
mlb.classes_

array(['1-corinthians-13', '1929', '1931', ..., 'zoe-nightshade',
       'zombies', 'حب'], shape=(2061,), dtype=object)

In [11]:
vectorizer = TfidfVectorizer(max_features=3000,
                             stop_words='english',
                             ngram_range=(1, 2),
                             min_df=1,
                             lowercase=True)

In [12]:
text_vector = vectorizer.fit_transform(text)
text_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22527 stored elements and shape (3000, 3000)>

In [13]:
vectorizer.get_feature_names_out()

array(['000', 'ability', 'able', ..., 'هو', 'هي', 'ولا'],
      shape=(3000,), dtype=object)

In [22]:
model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100,))

In [23]:
model.fit(text_vector, tags_binarized)

0,1,2
,estimator,RandomForestClassifier()
,n_jobs,
,verbose,0

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
predictions = model.predict(text_vector)

In [25]:
print(classification_report(tags_binarized,
                            predictions,
                            target_names=mlb.classes_,
                            zero_division=0))

                                  precision    recall  f1-score   support

                1-corinthians-13       1.00      1.00      1.00         1
                            1929       1.00      1.00      1.00         1
                            1931       1.00      1.00      1.00         1
                            1946       1.00      1.00      1.00         1
                            1955       1.00      1.00      1.00         1
                            1970       1.00      1.00      1.00         1
                            1993       1.00      1.00      1.00         1
                            1997       1.00      1.00      1.00         1
                            2008       1.00      1.00      1.00         1
                            2013       1.00      1.00      1.00         1
        a-court-of-mist-and-fury       1.00      1.00      1.00         4
                    aaron-warner       1.00      1.00      1.00         1
                             abe     

In [26]:
joblib.dump(model, os.path.join(os.getcwd(), 'models', 'quotes_model.pkl'))
joblib.dump(vectorizer, os.path.join(os.getcwd(), 'models', 'vectorizer.pkl'))
joblib.dump(mlb, os.path.join(os.getcwd(), 'models', 'mlb.pkl'))

['/Users/corentindupriez/Documents/Python/scraper_test/models/mlb.pkl']