In [None]:
import pandas as pd

# quotechar='"' together with quoting=0 (csv.QUOTE_MINIMAL) spells out
# pandas' default quote handling explicitly.
corpus = pd.read_csv(
    'quotes.csv',
    quotechar='"',
    quoting=0,
)

# 1. Load

In [None]:
# Keep only rows whose author is an actual string that starts with an
# uppercase character; missing values and other non-string entries are dropped.
corpus = corpus[
    corpus['author'].map(lambda name: type(name) is str and name[0].isupper())
]

In [None]:
# Fold the comma out of the "Jr." suffix first, so that it is not mistaken
# for the separator between the author and the work.
corpus['author'] = corpus['author'].str.replace(', Jr.', ' Jr.', regex=False)
# Everything after the first remaining comma goes into the 'work' column.
corpus[['author', 'work']] = corpus['author'].str.split(',', n=1, expand=True)
# Discard rows whose author text is longer than 30 characters.
corpus = corpus.loc[corpus['author'].str.len() <= 30]

In [None]:
# Number of distinct author values currently in the corpus.
corpus['author'].nunique()

In [None]:
# The 20 authors with the most rows in the corpus.
corpus['author'].value_counts().head(20)

In [None]:
# This second dataset is read with ';' as the field delimiter.
corpus_classical = pd.read_csv(
    'quotes_all.csv',
    sep=';',
)

In [None]:
# Collapse duplicate quotes into one row each: keep the first author seen
# and gather every category the quote was listed under into a list.
corpus_classical = (
    corpus_classical
    .groupby(['quote'])
    .agg(
        author=('author', 'first'),
        category=('category', list),
    )
    .reset_index()
)

In [None]:
# Spot check: show the rows attributed to Victor Hugo.
corpus[corpus['author'].eq('Victor Hugo')]

In [None]:
# From here on `corpus` refers to the aggregated quotes_all.csv frame; the
# frame loaded from quotes.csv is no longer reachable through this name.
# This rebinds the name to the same DataFrame object (no copy is made).
corpus = corpus_classical

# 2. Compute

## Disambiguate author names

In [None]:
import textdistance as td
from tqdm import tqdm

# Print every pair of distinct author names whose normalised Jaccard
# similarity exceeds 0.85. Each name is compared only against the names
# seen before it, so a pair is reported once.
authors = {}
for author in tqdm(corpus['author']):
    if author in authors:
        continue
    for known_author in authors:
        similarity = td.jaccard.normalized_similarity(known_author, author)
        if similarity > 0.85:
            print(author, known_author)
    authors[author] = True


## Remove profanity

In [None]:
from profanity_check import predict_prob

# Drop every row whose quote text OR category labels are scored as profane
# (probability above 0.8). Previously the `category` column was scored twice
# and the quote text itself was never checked.
corpus = corpus.reset_index(drop=True)
profane_quote = predict_prob(corpus['quote'].astype('U').values) > 0.8
profane_category = predict_prob(corpus['category'].astype('U').values) > 0.8
# The scores are positional arrays; attach the freshly reset index so the
# mask lines up with the frame row by row.
corpus = corpus.loc[~pd.Series(profane_quote | profane_category, index=corpus.index)]

## Filter out recent/unknown authors

In [None]:
import requests
import re
from tqdm import tqdm

def confirm_birth_year(author_request, cutoff_year=1940):
    """Accept an author whose page summary points to a year before ``cutoff_year``.

    The first run of digits found in the summary's ``description`` is read as
    the birth year. This is a heuristic: any leading number in the
    description is taken as the year, whatever it actually denotes.

    Parameters
    ----------
    author_request : dict
        Decoded JSON payload of a page-summary lookup. The keys ``title`` and
        ``description`` are read; both may be absent.
    cutoff_year : int, default 1940
        The author is accepted only when the extracted year is strictly
        smaller than this value.

    Returns
    -------
    tuple of (str, str) or None
        ``(title, description)`` for an accepted author, otherwise ``None``
        (failed lookup, missing title or description, no digits, or a year
        at or after the cutoff).
    """
    title = author_request.get('title')
    # "Not found." is treated as the marker of a failed lookup; a payload
    # without any title cannot be attributed to an author either.
    if title is None or title == "Not found.":
        return None

    description = author_request.get('description')
    if not isinstance(description, str):
        return None

    date_search = re.search('[0-9]+', description)
    if date_search is None:
        return None

    if int(date_search.group(0)) < cutoff_year:
        return title, description
    return None

# Authors rejected by the lookup; filled in by the lookup loop.
eliminated_authors = []
# Accepted authors mapped to a short description. 'Anonymous' is seeded by
# hand so that it is accepted without a lookup.
good_old_authors = {
    'Anonymous': "Someone said this one... We don't know who.",
}

In [None]:
# Look every author up on Wikipedia and sort them into `good_old_authors`
# (accepted by confirm_birth_year) or `eliminated_authors` (everything else,
# including failed lookups). Authors already present in either collection are
# skipped, so the cell can be re-run after an interruption.
authors = corpus['author'].unique()
for author in tqdm(authors):
    if author in good_old_authors or author in eliminated_authors:
        continue

    # Reset for each author: a failed request must not reuse the verdict
    # obtained for the previous author.
    result_tuple = None
    try:
        author_request = requests.get(
            f'https://en.wikipedia.org/api/rest_v1/page/summary/{author.replace(" ", "_")}',
            timeout=2,
        ).json()
        result_tuple = confirm_birth_year(author_request)
    except Exception as error:
        # Best effort: report the failure and treat the author as rejected.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Fail for author", author, "-", repr(error))

    if result_tuple is None:
        eliminated_authors.append(author)
    else:
        # Key by the corpus spelling rather than the returned page title: the
        # skip test above and the later `isin` filter both compare against
        # corpus['author'], so a differing page title would drop the author.
        good_old_authors[author] = result_tuple[1]

In [None]:
# List the accepted author keys that contain the substring "e."
# (presumably a manual check for odd entries - confirm intent).
[name for name in good_old_authors if "e." in name]

In [None]:
# Display the accepted authors as a Series: the dict keys become the index
# and the stored descriptions become the values.
pd.Series(good_old_authors)

In [None]:
# Keep only the quotes whose author is one of the accepted keys.
accepted_names = list(good_old_authors)
corpus = corpus.loc[corpus['author'].isin(accepted_names)]

In [None]:
# Number of authors that were not accepted by the lookup.
len(eliminated_authors)

# 3. Export

In [None]:
# Write the filtered corpus to disk with '|' as the field delimiter; the
# DataFrame index is not written.
corpus.to_csv(
    'quotes_classical_clean.csv',
    sep='|',
    index=False,
)

# End of notebook