In [None]:
# default_exp data

Ideally the DATA_PATH would be configurable

In [None]:
#export
from pathlib import Path
import pickle
import pandas as pd
from urllib.request import urlretrieve
import urllib.parse
from tqdm.auto import tqdm

import typing as T

# Default cache directory for downloaded datasets: ~/.cache/mlzero
DATA_PATH = Path.home().joinpath(".cache", "mlzero")

In [None]:
# Notebook-only display settings: show up to 200 columns and 100 characters per cell.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_colwidth", 100)
# MathJax off -- presumably so "$" in text columns (e.g. salaries) is not rendered as math.
pd.set_option("display.html.use_mathjax", False)

# Data Source Fetchers

## Kaggle

We don't use this in CI because it requires Kaggle credentials, but it is the original source of some of the datasets.

Note that we move the import inside the function because even the import will fail if there are no API credentials.

In [None]:
def fetch_from_kaggle(dataset: str, filename: str, data_path: Path = DATA_PATH, force: bool = False) -> pd.DataFrame:
    """Loads a single file of a Kaggle dataset as a DataFrame, downloading it if necessary.

    Requires Kaggle API credentials: https://github.com/Kaggle/kaggle-api#api-credentials

    dataset  : Kaggle dataset identifier, e.g. 'zynicide/wine-reviews'
    filename : name of the file inside the dataset
    data_path: directory where the downloaded archive is stored
    force    : download again even if the archive is already in data_path
    """
    # Imported lazily: even importing the Kaggle client fails without API credentials.
    from kaggle.api.kaggle_api_extended import KaggleApi
    # The archive is looked up under the URL-quoted file name with a '.zip' suffix.
    dest = Path(data_path) / (urllib.parse.quote(filename) + '.zip')
    if force or not dest.exists():
        Path(data_path).mkdir(exist_ok=True, parents=True)
        kaggle = KaggleApi()
        kaggle.authenticate()
        # Forward `force`: otherwise the client may skip the download when a local
        # copy already exists, and a forced refresh would silently do nothing.
        kaggle.dataset_download_file(dataset, filename, path=data_path, force=force)
    return pd.read_csv(dest)

Progress Bar
Taken from https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py

In [None]:
class TqdmUpTo(tqdm):
    """Progress bar that can be driven with absolute progress values.

    Adds `update_to`, which turns "blocks so far / block size / total size"
    into the incremental `tqdm.update(delta_n)` call.
    Adapted from the tqdm `tqdm_wget.py` example.
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to `b * bsize` units.

        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() takes a delta, so subtract what has already been counted.
        return self.update(transferred - self.n)

TODO: Don't allow partial/corrupted downloads

In [None]:
#export
def fetch_dataset(filename: str, data_path: Path = DATA_PATH, force: bool = False, data_loader=pd.read_csv) -> pd.DataFrame:
    """Loads a stored dataset, downloading it with a progress bar if it is not cached.

    filename   : file name under https://skeptric.com/datasets/; it is inserted
                 verbatim into the URL, so it must already be URL-quoted
    data_path  : directory used to cache the download
    force      : download again even if a cached copy exists
    data_loader: callable that receives the cached file's path and returns the data

    If the download fails the exception propagates and nothing is written to the
    final cache location (an existing cached file is left untouched).
    """
    dest = Path(data_path) / filename
    if force or not dest.exists():
        Path(data_path).mkdir(exist_ok=True, parents=True)
        url = f'https://skeptric.com/datasets/{filename}'
        # Download next to the target and rename only on success, so an interrupted
        # transfer cannot leave a partial file that later calls would try to load.
        partial = dest.with_name(dest.name + '.part')
        try:
            with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=filename) as t:
                urlretrieve(url, partial, reporthook=t.update_to)
            partial.replace(dest)
        finally:
            # Still present only when the download or the rename failed.
            if partial.exists():
                partial.unlink()
    return data_loader(dest)

In [None]:
#export
def pickle_loader(filename):
    """Open `filename` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

# Text Data Sources

## AU Jobs: Export this somewhere

Data can be automatically extracted using [job-advert-analysis](https://github.com/EdwardJRoss/job-advert-analysis); just need somewhere suitable to store the output.

In [None]:
#export
def data_au_jobs(data_path: Path = DATA_PATH, force: bool = False) -> pd.DataFrame:
    """Australian job ads, read from the pickled `au_jobs.pkl` dataset.

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC BY-NC-SA 4.0
    See https://github.com/EdwardJRoss/job-advert-analysis
    """
    jobs = fetch_dataset('au_jobs.pkl', data_path, data_loader=pickle_loader, force=force)
    return jobs

In [None]:
# Fetch the job ads (downloaded only if not already cached) and display them.
data_au_jobs()

Unnamed: 0,title,description,uri,view_date,org,salary_raw,salary_min,salary_max,salary_hours,location_raw,loc_id,loc_continent,loc_country,loc_county,loc_empire,loc_localadmin,loc_locality,loc_macrocounty,loc_region,processor,source,loc_neighbourhood,salary_valid,salary_hours_inferred,salary_annual
0,Outbound phone consultant,About the Role:\nDavidson are partnered with a client that have two upcoming campaigns requiring...,https://www.davidsonwp.com/job/100562133233487/outbound-phone-consultant/,2020-06-06 06:29:05+00:00,Davidson,Competitive,,,,"Brisbane, AU",101934019.0,Oceania,Australia,Brisbane,Australia,Brisbane City,Brisbane City,Brisbane,Queensland,davidsonwp,CC-MAIN-2020-24,,False,,
1,Business Analyst,About the Company\nAre you ready to start your next challenge within one of the leading financia...,https://www.davidsonwp.com/job/100562133234964/business-analyst-16/,2020-06-03 00:11:31+00:00,Davidson,Competitive,,,,"Sydney, AU",101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,davidsonwp,CC-MAIN-2020-24,,False,,
2,Senior Database Administrator - 3 month initial contract,About the Company\nAn opportunity to join a leading organisation working as a Database Administr...,https://www.davidsonwp.com/job/100562133234550/senior-database-administrator/,2020-05-30 20:04:38+00:00,Davidson,$100000 per annum,100000.0,,2000.0,"Melbourne, AU",101933229.0,Oceania,Australia,Melbourne,Australia,Melbourne,Melbourne,Melbourne,Victoria,davidsonwp,CC-MAIN-2020-24,,True,2000.0,100000.0
3,GIS Supervisor,At Davidson our vision is quite simply to change the face of the human resources and recruitment...,https://www.davidsonwp.com/job/100562133234519/gis-supervisor/,2020-06-06 07:00:26+00:00,Davidson,$70000 - $81000 per annum,70000.0,81000.0,2000.0,"Moss Vale, AU",101939833.0,Oceania,Australia,Wingecarribee,Australia,Moss Vale,Moss Vale,Bowral - Mittagong,New South Wales,davidsonwp,CC-MAIN-2020-24,,True,2000.0,70000.0
4,Project Manager,The client is seeking an experienced Infrastructure Project Manager.\nThe successful candidate w...,https://www.davidsonwp.com/job/100562133233911/project-manager-27/,2020-05-28 04:26:08+00:00,Davidson,Competitive,,,,"Brisbane, AU",101934019.0,Oceania,Australia,Brisbane,Australia,Brisbane City,Brisbane City,Brisbane,Queensland,davidsonwp,CC-MAIN-2020-24,,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73421,UX/UI Designer,Opportunity to design and deliver high quality user experience for MAAS online visitors\nTempora...,https://iworkfor.nsw.gov.au/job/ux-ui-designer-213266,2020-10-21 22:06:41+00:00,Trustees of the Museum of Applied Arts and Sciences,"Salary package up to $119,200. Package includes salary ($97,152 - $107,541) employer's contribut...",97152.0,107541.0,2000.0,Sydney Region / Sydney City,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,True,2000.0,97152.0
73422,Membership of the Independent Metropolitan Water Advisory Panel,EXPRESSIONS OF INTEREST FOR MEMBERSHIP OF THE INDEPENDENT METROPOLITAN WATER ADVISORY PANEL\nThe...,https://iworkfor.nsw.gov.au/job/membership-of-the-independent-metropolitan-water-advisory-panel-...,2020-10-21 22:46:21+00:00,Water,-,,,,Sydney Region / Sydney City,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,False,,
73423,Aboriginal Identified - Clerk General Scale - Cadetship Program - Various Locations - Temporary,"ABOUT LEGAL AID NSW\nLegal Aid NSW is the largest legal aid agency in Australia, comprising of a...",https://iworkfor.nsw.gov.au/job/aboriginal-identified-clerk-general-scale-cadetship-program-vari...,2020-10-21 22:35:08+00:00,"Legal Aid Commission, Office of the","• Salary package of up to $27,700 including study and book allowances for a 60-day placement",27700.0,,8.0,Sydney Region,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,False,8.0,
73424,"Teacher Community Services, Orange","TAFE NSW Teacher – Community Services\n\nLocation: West Region, Orange\nPosition: Permanent Full...",https://iworkfor.nsw.gov.au/job/teacher-community-services-orange-212294,2020-10-21 21:43:25+00:00,TAFE NSW,"$110,799 Package includes salary ($84,664 - $100,407), employer's contribution to superannuation...",84664.0,100407.0,2000.0,Regional NSW / Bathurst & Central West NSW,102049305.0,Oceania,Australia,Bathurst Regional,Australia,,,,New South Wales,iworkfornsw,CC-MAIN-2020-45,,True,2000.0,84664.0


Should be quick on the second load

In [None]:
# The file is cached by now, so this times only loading it from disk.
%time data_au_jobs()

CPU times: user 732 ms, sys: 406 ms, total: 1.14 s
Wall time: 1.18 s


Unnamed: 0,title,description,uri,view_date,org,salary_raw,salary_min,salary_max,salary_hours,location_raw,loc_id,loc_continent,loc_country,loc_county,loc_empire,loc_localadmin,loc_locality,loc_macrocounty,loc_region,processor,source,loc_neighbourhood,salary_valid,salary_hours_inferred,salary_annual
0,Outbound phone consultant,About the Role:\nDavidson are partnered with a client that have two upcoming campaigns requiring...,https://www.davidsonwp.com/job/100562133233487/outbound-phone-consultant/,2020-06-06 06:29:05+00:00,Davidson,Competitive,,,,"Brisbane, AU",101934019.0,Oceania,Australia,Brisbane,Australia,Brisbane City,Brisbane City,Brisbane,Queensland,davidsonwp,CC-MAIN-2020-24,,False,,
1,Business Analyst,About the Company\nAre you ready to start your next challenge within one of the leading financia...,https://www.davidsonwp.com/job/100562133234964/business-analyst-16/,2020-06-03 00:11:31+00:00,Davidson,Competitive,,,,"Sydney, AU",101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,davidsonwp,CC-MAIN-2020-24,,False,,
2,Senior Database Administrator - 3 month initial contract,About the Company\nAn opportunity to join a leading organisation working as a Database Administr...,https://www.davidsonwp.com/job/100562133234550/senior-database-administrator/,2020-05-30 20:04:38+00:00,Davidson,$100000 per annum,100000.0,,2000.0,"Melbourne, AU",101933229.0,Oceania,Australia,Melbourne,Australia,Melbourne,Melbourne,Melbourne,Victoria,davidsonwp,CC-MAIN-2020-24,,True,2000.0,100000.0
3,GIS Supervisor,At Davidson our vision is quite simply to change the face of the human resources and recruitment...,https://www.davidsonwp.com/job/100562133234519/gis-supervisor/,2020-06-06 07:00:26+00:00,Davidson,$70000 - $81000 per annum,70000.0,81000.0,2000.0,"Moss Vale, AU",101939833.0,Oceania,Australia,Wingecarribee,Australia,Moss Vale,Moss Vale,Bowral - Mittagong,New South Wales,davidsonwp,CC-MAIN-2020-24,,True,2000.0,70000.0
4,Project Manager,The client is seeking an experienced Infrastructure Project Manager.\nThe successful candidate w...,https://www.davidsonwp.com/job/100562133233911/project-manager-27/,2020-05-28 04:26:08+00:00,Davidson,Competitive,,,,"Brisbane, AU",101934019.0,Oceania,Australia,Brisbane,Australia,Brisbane City,Brisbane City,Brisbane,Queensland,davidsonwp,CC-MAIN-2020-24,,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73421,UX/UI Designer,Opportunity to design and deliver high quality user experience for MAAS online visitors\nTempora...,https://iworkfor.nsw.gov.au/job/ux-ui-designer-213266,2020-10-21 22:06:41+00:00,Trustees of the Museum of Applied Arts and Sciences,"Salary package up to $119,200. Package includes salary ($97,152 - $107,541) employer's contribut...",97152.0,107541.0,2000.0,Sydney Region / Sydney City,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,True,2000.0,97152.0
73422,Membership of the Independent Metropolitan Water Advisory Panel,EXPRESSIONS OF INTEREST FOR MEMBERSHIP OF THE INDEPENDENT METROPOLITAN WATER ADVISORY PANEL\nThe...,https://iworkfor.nsw.gov.au/job/membership-of-the-independent-metropolitan-water-advisory-panel-...,2020-10-21 22:46:21+00:00,Water,-,,,,Sydney Region / Sydney City,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,False,,
73423,Aboriginal Identified - Clerk General Scale - Cadetship Program - Various Locations - Temporary,"ABOUT LEGAL AID NSW\nLegal Aid NSW is the largest legal aid agency in Australia, comprising of a...",https://iworkfor.nsw.gov.au/job/aboriginal-identified-clerk-general-scale-cadetship-program-vari...,2020-10-21 22:35:08+00:00,"Legal Aid Commission, Office of the","• Salary package of up to $27,700 including study and book allowances for a 60-day placement",27700.0,,8.0,Sydney Region,101932003.0,Oceania,Australia,Sydney,Australia,Sydney,Sydney,Sydney,New South Wales,iworkfornsw,CC-MAIN-2020-45,,False,8.0,
73424,"Teacher Community Services, Orange","TAFE NSW Teacher – Community Services\n\nLocation: West Region, Orange\nPosition: Permanent Full...",https://iworkfor.nsw.gov.au/job/teacher-community-services-orange-212294,2020-10-21 21:43:25+00:00,TAFE NSW,"$110,799 Package includes salary ($84,664 - $100,407), employer's contribution to superannuation...",84664.0,100407.0,2000.0,Regional NSW / Bathurst & Central West NSW,102049305.0,Oceania,Australia,Bathurst Regional,Australia,,,,New South Wales,iworkfornsw,CC-MAIN-2020-45,,True,2000.0,84664.0


## Kaggle datasources

The commands ending in `_kaggle` require [Kaggle's API Credentials](https://github.com/Kaggle/kaggle-api#api-credentials) to be set up.
A mirror of the CC licenced data is provided without the `_kaggle` suffix.

### [Wine Reviews](https://www.kaggle.com/zynicide/wine-reviews)
CC BY-NC-SA 4.0

In [None]:
def data_wine_reviews_kaggle(data_path: Path = DATA_PATH, force: bool = False) -> pd.DataFrame:
    """Retrieves Wine Reviews Data from Kaggle

    Requires Kaggle Credentials

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC BY-NC-SA 4.0
    See https://www.kaggle.com/zynicide/wine-reviews
    """
    # `force` was previously accepted but never forwarded, so it had no effect.
    return fetch_from_kaggle('zynicide/wine-reviews', 'winemag-data-130k-v2.csv', data_path=data_path, force=force)

In [None]:
#export
def data_wine_reviews(data_path: Path = DATA_PATH, force: bool = False) -> pd.DataFrame:
    """Retrieves Wine Reviews Data

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC BY-NC-SA 4.0
    See https://www.kaggle.com/zynicide/wine-reviews
    """
    # `force` was previously accepted but never forwarded, so it had no effect.
    return fetch_dataset('winemag-data-130k-v2.csv.zip', data_path=data_path, force=force)

In [None]:
# Fetch the mirrored wine reviews (downloaded only if not already cached) and display them.
data_wine_reviews()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressi...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled o...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opu...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore),Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rus...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley),Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,129966,Germany,Notes of honeysuckle and cantaloupe sweeten this deliciously feather-light spätlese. It's intens...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 Brauneberger Juffer-Sonnenuhr Spätlese Riesling (M...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,129967,US,"Citation is given as much as a decade of bottle age prior to release, which means it is pre-cell...",,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,129968,France,"Well-drained gravel soil gives this wine its crisp and dry character. It is ripe and fruity, alt...",Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Alsace),Gewürztraminer,Domaine Gresser
129969,129969,France,"A dry style of Pinot Gris, this is crisp with some acidity. It also has weight and a solid, powe...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


### [Movie Summaries from Wikipedia](https://www.kaggle.com/jrobischon/wikipedia-movie-plots)
CC BY-SA 4.0

In [None]:
def data_wiki_movies_kaggle(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Wikipedia movie plot summaries, fetched directly from Kaggle

    Requires Kaggle Credentials

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC BY-SA 4.0
    See https://www.kaggle.com/jrobischon/wikipedia-movie-plots
    """
    return fetch_from_kaggle(
        dataset='jrobischon/wikipedia-movie-plots',
        filename='wiki_movie_plots_deduped.csv',
        data_path=data_path,
        force=force,
    )

In [None]:
#export
def data_wiki_movies(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Wikipedia movie plot summaries

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC BY-SA 4.0
    See https://www.kaggle.com/jrobischon/wikipedia-movie-plots
    """
    movies = fetch_dataset(filename='wiki_movie_plots_deduped.csv.zip', data_path=data_path, force=force)
    return movies

In [None]:
# Fetch the mirrored movie plots (downloaded only if not already cached) and display them.
data_wiki_movies()

wiki_movie_plots_deduped.csv.zip: 0.00B [00:00, ?B/s]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypicall..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a ..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the ba..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during wint..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow fo..."
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,"Director: Russell Crowe\n\nCast: Russell Crowe, Olga Kurylenko, Jai Courtney, Cem Yılmaz, Yılmaz...",unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War I has ended, and centres around Joshua Connor (Rus..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_%C3%87engi_%C4%B0kimiz,"Two musicians, Salih and Gürkan, described the adventures of their cousins."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü in a coastal village in Izmir, has just separated f..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fletcher and Ashley Clements",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable_(film),"The film centres around a young woman named Amy Tyler, who books a surprise holiday to Europe wi..."


### [Womens Clothing E-Commerce Reviews](https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews)
CC0

In [None]:
def data_women_clothing_reviews_kaggle(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Women's e-commerce clothing reviews, fetched directly from Kaggle

    Requires Kaggle credentials

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC0
    https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
    """
    return fetch_from_kaggle(
        dataset='nicapotato/womens-ecommerce-clothing-reviews',
        filename='Womens Clothing E-Commerce Reviews.csv',
        data_path=data_path,
        force=force,
    )

In [None]:
#export
def data_women_clothing_reviews(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Women's e-commerce clothing reviews

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC0
    https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
    """
    # The name is already URL-quoted because it is placed verbatim into the download URL.
    reviews = fetch_dataset(
        filename='Womens%20Clothing%20E-Commerce%20Reviews.csv.zip',
        data_path=data_path,
        force=force,
    )
    return reviews

In [None]:
# Fetch the mirrored clothing reviews (downloaded only if not already cached) and display them.
data_women_clothing_reviews()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comfortable,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,"Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i n...",5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and really wanted it to work for me. i initially ordered th...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get no...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to the adjustable front tie. it is the perfect length t...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...,...
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a great price! it's very easy to slip on and has a v...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stretchy, shiny material. cut is flattering and drapes...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see through. this never would have worked for me. i'm glad i...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties and we","I bought this dress for a wedding i have this summer, and it's so cute. unfortunately the fit is...",3,1,2,General,Dresses,Dresses


### [Tweets from AU 2019 election](https://www.kaggle.com/taniaj/australian-election-2019-tweets)
CC0

In [None]:
def data_au_election_2019_tweets_kaggle(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Tweets from the 2019 Australian Elections, fetched directly from Kaggle

    Collected from Twitter API keyword search between 2019-05-10 and 2019-05-20.

    Requires Kaggle credentials

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC0: Public Domain
    See https://www.kaggle.com/taniaj/australian-election-2019-tweets
    """
    return fetch_from_kaggle(
        dataset='taniaj/australian-election-2019-tweets',
        filename='auspol2019.csv',
        data_path=data_path,
        force=force,
    )

In [None]:
#export
def data_au_election_2019_tweets(data_path: Path = DATA_PATH, force:bool = False) -> pd.DataFrame:
    """Tweets from the 2019 Australian Elections

    Collected from Twitter API keyword search between 2019-05-10 and 2019-05-20.

    data_path: directory used to cache the download
    force    : download again even if a cached copy exists

    License: CC0: Public Domain
    See https://www.kaggle.com/taniaj/australian-election-2019-tweets
    """
    tweets = fetch_dataset(filename='auspol2019.csv.zip', data_path=data_path, force=force)
    return tweets

In [None]:
# Fetch the mirrored election tweets (downloaded only if not already cached) and display them.
data_au_election_2019_tweets()

Unnamed: 0,created_at,id,full_text,retweet_count,favorite_count,user_id,user_name,user_screen_name,user_description,user_location,user_created_at
0,2019-05-20 09:13:44,1130401208756187136,After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD,0.0,0.0,9.248486e+07,PIPELINEPETE,jocksjig,Retired Tradesman and Progressive Anti Conservative! Musician.,"Brisbane, Queensland",2009-11-25 09:19:45
1,2019-05-20 09:13:43,1130401205367140357,"@narendramodi @smritiirani Coverage of indian election on SBS tv channel, Australia. Jai hind 🇮🇳...",0.0,0.0,7.756474e+08,Narinder Parmar,nparmar1957,"Life coach & trainer, Motivational speaker, Mater NLP Practitioner, Author, Fellow of Institute ...","Wollongong, NSW, AUSTRALIA",2012-08-23 10:20:40
2,2019-05-20 09:13:33,1130401162782371841,@workmanalice Do you know if Facebook is releasing an election post-mortem in Australia? They lo...,0.0,0.0,5.687300e+04,Peter Wells,peterwells,Writes for @theage and @smh on technology and podcasts - works at Swinburne Uni as a Mac Admin -...,Melbourne,2006-12-11 07:38:06
3,2019-05-20 09:13:29,1130401143551434753,@vanbadham We all understand we have a compulsory preference system. Vote 1 mightn’t go to the m...,0.0,0.0,9.081660e+17,The Realist,therealist822,"Calls it as I see it. Anti PC, SJW and VS. If you want to be warm and fuzzy, grab a blanket and ...",,2017-09-14 03:10:30
4,2019-05-20 09:13:23,1130401118666809345,"Shares were mixed in Asia, with India and Australia leading gains for the region following elect...",0.0,0.0,5.260074e+08,Inquirer Business,InquirerBiz,The official Twitter account of the Inquirer Group's business news team.,Philippines,2012-03-16 03:51:59
...,...,...,...,...,...,...,...,...,...,...,...
183374,2019-05-11 03:19:57,1127050685621493760,#australiavotes; The BANKS are taking your money and giving you a pittance in return. The Reserv...,0.0,0.0,1.010011e+18,ivanparty.org,IvanpartyO,The IVAN PARTY is a new political party with a Royal solution to fix Australia and the United Ki...,,2018-06-22 04:07:26
183375,2019-05-11 02:50:09,1127043187292295169,Vote casted.... So no democracy sausage for early voters? LOL! 😂 🤣 🌭 #australiavotes #auspol2019,0.0,0.0,7.853301e+07,Ralph Michael,OrangeRafi,"Filipino living in Australia|IT Pro|Traveler|otaku|I like a cars, kpop, anime and everything abo...","Melbourne, Australia",2009-09-30 06:35:22
183376,2019-05-11 02:31:09,1127038404066045952,SINCERITY IS A #WINNING #SALES CHARACTERISTIC- #YOUR #PRIORITIES #ARE #CLEARLY #VISIBLE TO YOUR ...,0.0,0.0,1.009595e+18,LuceGluyas@gmail.com,GluyasLuce,,,2018-06-21 00:32:35
183377,2019-05-11 00:01:33,1127000757717303296,These arrived. No sign of a democracy sausage though. A true travesty if ever there was one. #au...,0.0,0.0,3.226870e+09,Sarah Hamlyn,in_deep_oceans,"Nature lover, terrible photographer, marine scientist. 🇦🇺 Adelaide uni alum. Staff biologist @Mo...","Key West, FL",2015-05-26 08:16:56


### Unimplemented

Other interesting datasets (To consider):

https://www.kaggle.com/andrewmvd/okcupid-profiles


CC0
tboyle10/medicaltranscriptions mtsamples.csv

https://www.kaggle.com/tboyle10/medicaltranscriptions


Enron Emails
https://www.cs.cmu.edu/~./enron/

https://www.kaggle.com/wcukierski/enron-email-dataset emails.csv

## Gutenberg

Texts from [Project Gutenberg](http://www.gutenberg.org/)

In [None]:
#export
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

In [None]:
#export
# Project Gutenberg etext id -> book title.
# The titles become the keys of the dictionary returned by `data_dickens_corpus`.
GUTENBERG_DICKENS = {
    98: 'A Tale of Two Cities',
    1400: 'Great Expectations',
    730: 'Oliver Twist',
    766: 'David Copperfield',
    19337: 'A Christmas Carol',
    786: 'Hard Times',
    1023: 'Bleak House',
    580: 'The Pickwick Papers',
    883: 'Our Mutual Friend',  # was misspelled 'Out Mutual Friend'
    967: 'Nicholas Nickleby',
    700: 'The Old Curiosity Shop',
    821: 'Dombey and Son',  # was misspelled 'Domeby and Son'
    963: 'Little Dorrit',
}

def data_dickens_corpus(data_path: Path = DATA_PATH, mirror:str="https://gutenberg.pglaf.org/", force: bool = False) -> T.Dict[str, str]:
    """Download a corpus of Charles Dickens's most popular books

    data_path: Where to store the cache
    mirror   : Project Gutenberg mirror to use
    force    : Rebuild the cache even if it already exists

    Returns a dictionary of {"title": "full text"}
    """
    # Accept str as well as Path, like the other loaders in this module.
    data_path = Path(data_path)
    dest = data_path / 'dickens.pkl'
    if not force and dest.exists():
        # The cache is expected to have been written by this function; unpickling
        # a file from an untrusted source would be unsafe.
        with open(dest, 'rb') as f:
            return pickle.load(f)
    # The cache directory may not exist yet if this is the first loader called.
    data_path.mkdir(exist_ok=True, parents=True)
    data = {title: strip_headers(load_etext(idx, mirror=mirror)).strip() for idx, title in GUTENBERG_DICKENS.items()}
    # Write to a temporary sibling and rename, so a failed write cannot leave a
    # truncated pickle that later calls would try to load.
    partial = dest.with_name(dest.name + '.part')
    try:
        with open(partial, 'wb') as f:
            pickle.dump(data, f)
        partial.replace(dest)
    finally:
        # Still present only when the write or the rename failed.
        if partial.exists():
            partial.unlink()
    return data

In [None]:
# Build the corpus, or load it from the cached pickle if it already exists.
dickens = data_dickens_corpus()

This requires some more cleaning!

In [None]:
# Show the first 1500 characters: illustration markers and the contents list are still present.
print(dickens['Great Expectations'][:1500])

[Illustration]




Great Expectations

[1867 Edition]

by Charles Dickens


Contents

 Chapter I.
 Chapter II.
 Chapter III.
 Chapter IV.
 Chapter V.
 Chapter VI.
 Chapter VII.
 Chapter VIII.
 Chapter IX.
 Chapter X.
 Chapter XI.
 Chapter XII.
 Chapter XIII.
 Chapter XIV.
 Chapter XV.
 Chapter XVI.
 Chapter XVII.
 Chapter XVIII.
 Chapter XIX.
 Chapter XX.
 Chapter XXI.
 Chapter XXII.
 Chapter XXIII.
 Chapter XXIV.
 Chapter XXV.
 Chapter XXVI.
 Chapter XXVII.
 Chapter XXVIII.
 Chapter XXIX.
 Chapter XXX.
 Chapter XXXI.
 Chapter XXXII.
 Chapter XXXIII.
 Chapter XXXIV.
 Chapter XXXV.
 Chapter XXXVI.
 Chapter XXXVII.
 Chapter XXXVIII.
 Chapter XXXIX.
 Chapter XL.
 Chapter XLI.
 Chapter XLII.
 Chapter XLIII.
 Chapter XLIV.
 Chapter XLV.
 Chapter XLVI.
 Chapter XLVII.
 Chapter XLVIII.
 Chapter XLIX.
 Chapter L.
 Chapter LI.
 Chapter LII.
 Chapter LIII.
 Chapter LIV.
 Chapter LV.
 Chapter LVI.
 Chapter LVII.
 Chapter LVIII.
 Chapter LIX.

[Illustration]




Chapter I.


My father’s family nam

In [None]:
#hide
# Convert the project's notebooks into Python modules (nbdev).
from nbdev.export import notebook2script; notebook2script()

Converted 000_data.ipynb.
Converted 00_core.ipynb.
Converted 01_segment.ipynb.
Converted 02_ngram.ipynb.
Converted index.ipynb.
