In [1]:
import pandas as pd
import numpy as np

## Adding Huffington Post News 2018 dataset (sampled) as Testing Data

The 2018 huffington post news was downloaded from kaggle via https://www.kaggle.com/rmisra/news-category-dataset.

In [2]:
huff_news_df = pd.read_json('../input/News_Category_Dataset_v2.json', lines=True)

In [3]:
huff_news_df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [4]:
huff_news_df.nunique()

category                 41
headline             199344
authors               27993
link                 200812
short_description    178353
date                   2309
dtype: int64

In [5]:
huff_news_df.category.value_counts().head(10)

POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
Name: category, dtype: int64

In [6]:
sub_df = huff_news_df.sample(20000).reset_index().rename({'index': 'UID',
                                                          'headline': 'Text',
                                                          'category': 'Tags'}, axis=1)

In [7]:
sub_df.head()

Unnamed: 0,UID,Tags,Text,authors,link,short_description,date
0,177378,TECH,"'OpenDyslexic,' Free Font, Helps People With D...",,https://www.huffingtonpost.com/entry/open-dysl...,The BBC reports that a recent update to Instap...,2012-10-07
1,118823,GREEN,WATCH: This Manatee Rescue Will Make Your Day,Avery Stone,https://www.huffingtonpost.com/entry/florida-m...,,2014-06-29
2,114742,HEALTHY LIVING,Wheelchair Wisdom: A Short Course on Perceptio...,"Linda Noble Topf, ContributorInspirational Aut...",https://www.huffingtonpost.com/entry/wheelchai...,"Like any prejudice, our perceptions about what...",2014-08-14
3,110397,WEIRD NEWS,BALLSY MOVE: Man Pushes Giant Testicle Across US,David Moye,https://www.huffingtonpost.com/entry/thomas-ca...,,2014-10-03
4,13389,HEALTHY LIVING,Gender-Fluid Scouts Of America,"The Atlantic, ContributorExploring the America...",https://www.huffingtonpost.com/entry/gender-fl...,There are times when it is necessary to highli...,2017-10-24


In [8]:
huff_train_df = sub_df.iloc[:10000, :].reset_index(drop=True)
huff_test_df = sub_df.iloc[10000:, :].reset_index(drop=True)

In [9]:
huff_train_df.to_json('../input/huffpost_news_train_10000_coded.json', orient='records', indent=2)
huff_test_df.to_json('../input/huffpost_news_test_10000_coded.json', orient='records', indent=2)

## Adding Stack Overflow Python Questions dataset (sampled) as Testing Data

The stack overflow python questions dataset was downloaded from Kaggle via https://www.kaggle.com/stackoverflow/pythonquestions.

In [10]:
so_question_df = pd.read_csv('../input/Questions.csv', encoding= "ISO-8859-1", usecols=['Id', 'Title', 'Body'])

In [11]:
so_question_df.shape

(607282, 3)

In [12]:
# read all the tags
tags_df = pd.read_csv('../input/Tags.csv', encoding= "ISO-8859-1")

In [13]:
tags_df.Tag.value_counts().head(20)

python               607283
django                62818
python-2.7            34616
pandas                26854
python-3.x            26814
numpy                 25848
list                  18951
matplotlib            16521
regex                 14047
dictionary            13413
tkinter               10766
string                10616
flask                 10488
google-app-engine     10286
csv                    9323
arrays                 9170
json                   8023
mysql                  7529
linux                  7121
html                   7118
Name: Tag, dtype: int64

In [14]:
# subset tags
tags_sub_df = tags_df.loc[
    ~np.isin(tags_df.Tag, ['python', 'python-2.7', 'python-3.x']), :].reset_index(drop=True)

In [15]:
tags_sub_df.Tag.value_counts().head(10)

django        62818
pandas        26854
numpy         25848
list          18951
matplotlib    16521
regex         14047
dictionary    13413
tkinter       10766
string        10616
flask         10488
Name: Tag, dtype: int64

In [16]:
# create the top freq tags from the list, with more than 10% of the 60,000~ records
top_tags = tags_sub_df.Tag.value_counts()[tags_sub_df.Tag.value_counts() > 6000].index.tolist()
uid_w_top_tags = tags_sub_df.Id[tags_sub_df.Tag.isin(top_tags)].unique()
sampled_uid = uid_w_top_tags[np.random.choice(uid_w_top_tags.shape[0], size=20000, replace=False)]

In [17]:
so_sub_questions_df = so_question_df.loc[np.isin(so_question_df.Id, sampled_uid), :].reset_index(drop=True)
so_sub_questions_df.head()

Unnamed: 0,Id,Title,Body
0,20927,updating an auto_now DateTimeField in a parent...,<p>I've got two models: Message and Attachment...
1,47207,Django: Print url of view without hardcoding t...,<p>Can i print out a url <code>/admin/manage/p...
2,64362,How can I access App Engine through a Corporat...,<p>I have corporate proxy that supports https ...
3,75829,Best way to access table instances when using ...,<p>All the docs for SQLAlchemy give <code>INSE...
4,79968,Split a string by spaces -- preserving quoted ...,<p>I have a string which is like this:</p>\n\n...


In [18]:
# subset tags
top_tags_df = tags_df.loc[
    np.isin(tags_df.Id, so_sub_questions_df.Id.unique()) & \
    np.isin(tags_df.Tag, top_tags), :].reset_index(drop=True)
top_tags_df.head()

Unnamed: 0,Id,Tag
0,20927,django
1,47207,django
2,64362,google-app-engine
3,75829,sqlalchemy
4,79968,regex


In [19]:
top_tags_df.shape

(23243, 2)

In [20]:
# test with iterrows to step through the tags_df
next(top_tags_df.iterrows())[1]#['Id']

Id      20927
Tag    django
Name: 0, dtype: object

In [21]:
%%time
tags_dict = {}
for row in top_tags_df.iterrows():
    uid, q_tag = row[1]['Id'], row[1]['Tag']
    if uid not in tags_dict.keys():
        tags_dict[uid] = [q_tag]
    else:
        tags_dict[uid].append(q_tag)

CPU times: user 2.28 s, sys: 6.98 ms, total: 2.28 s
Wall time: 2.28 s


In [22]:
tags_pd_ready = {'Id':[], 'Tag':[]}
for Id, Tag in tags_dict.items():
    tags_pd_ready['Id'].append(Id)
    tags_pd_ready['Tag'].append(", ".join(Tag))
question_tags_df = pd.DataFrame(tags_pd_ready)

In [23]:
question_tags_df.head()

Unnamed: 0,Id,Tag
0,20927,django
1,47207,django
2,64362,google-app-engine
3,75829,sqlalchemy
4,79968,regex


In [24]:
# merge the tables into a single set
so_sub_df = so_sub_questions_df.merge(question_tags_df, how='inner', on='Id').rename({'Title': 'Text', 'Id': 'UID', 'Tag': 'Tags'}, axis=1)

In [28]:
so_sub_df.head()

Unnamed: 0,UID,Text,Body,Tags
0,20927,updating an auto_now DateTimeField in a parent...,<p>I've got two models: Message and Attachment...,django
1,47207,Django: Print url of view without hardcoding t...,<p>Can i print out a url <code>/admin/manage/p...,django
2,64362,How can I access App Engine through a Corporat...,<p>I have corporate proxy that supports https ...,google-app-engine
3,75829,Best way to access table instances when using ...,<p>All the docs for SQLAlchemy give <code>INSE...,sqlalchemy
4,79968,Split a string by spaces -- preserving quoted ...,<p>I have a string which is like this:</p>\n\n...,regex


In [25]:
stack_overflow_train_df = so_sub_df.iloc[:10000, :].reset_index(drop=True)
stack_overflow_test_df = so_sub_df.iloc[10000:, :].reset_index(drop=True)

In [26]:
stack_overflow_train_df.to_json('../input/stack_overflow_train_10000_coded.json', orient='records', indent=2)
stack_overflow_test_df.to_json('../input/stack_overflow_test_10000_coded.json', orient='records', indent=2)