# Transform the data


In [133]:
import pickle
import pandas as pd
import numpy as np
from IPython.display import display, HTML, Markdown, clear_output
import ipywidgets as widgets
import time 

In [2]:
# Load the pickled 2017 submissions dump into memory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that you created yourself or otherwise trust.
with open("reddit_askdocs_submissions_2017.pkl", "rb") as f:
    d_2017 = pickle.load(f)

In [134]:
df_2017 = pd.DataFrame(d_2017)

In [45]:
df_2017.head()

Unnamed: 0,author,author_flair_css_class,author_flair_text,brand_safe,can_mod_post,contest_mode,created_utc,domain,full_link,id,...,approved_at_utc,banned_at_utc,view_count,gilded,media_embed,secure_media_embed,author_created_utc,author_fullname,media,secure_media
0,[deleted],,,True,False,False,1514764452,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbw...,7nbwtn,...,,,,,,,,,,
1,XenonCSGO,default,This user has not yet been verified.,True,False,False,1514764122,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvsv,...,,,,,,,,,,
2,[deleted],,,True,False,False,1514764055,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvln,...,,,,,,,,,,
3,DavisTheMagicSheep,default,This user has not yet been verified.,True,False,False,1514763799,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbu...,7nburb,...,,,,,,,,,,
4,Dontgetscooped,default,This user has not yet been verified.,True,False,False,1514763188,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbs...,7nbsw2,...,,,,,,,,,,


In [7]:
df_2017['author_flair_text'].value_counts(dropna=False, normalize=True)

This user has not yet been verified.    0.694449
NaN                                     0.305007
Medical Student                         0.000144
Nursing Student                         0.000112
Registered Nurse                        0.000048
Pharmacist                              0.000032
EMT                                     0.000016
Physician Assistant                     0.000016
Pharm.D. Student                        0.000016
Moderator                               0.000016
Physician                               0.000016
Biomedical Student                      0.000016
Web Developer                           0.000016
B.S., Medical Lab Sciences              0.000016
Nursing Graduate, RPN                   0.000016
Physician | Moderator                   0.000016
Psychologist                            0.000016
WEB DEVELOPER                           0.000016
Lead Moderator                          0.000016
Name: author_flair_text, dtype: float64

## Filter rows  

Filtering out submissions not relevant to this analysis, like mod posts and banned posts. We want to analyze submissions that are medical questions from users. Mod posts are typically announcements, and banned posts were deemed unrelated or inappropriate by subreddit moderators, so both types of posts can be filtered out as not relevant to the analysis.

In [9]:
fname = 'reddit_submissions_selected_fields.csv'
selected_fields_df = pd.read_csv(fname)

In [15]:
display(Markdown(selected_fields_df[selected_fields_df['analysis_role'] == 'f'].to_markdown()))

|    | field_name    | type   | reddit_role   | analysis_role   | notes                                                                                                                     | type_long   | reddit_role_long                                                | analysis_role_long   |
|---:|:--------------|:-------|:--------------|:----------------|:--------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------------------------------------------------|:---------------------|
| 14 | stickied      | b      | aa            | f               | Mods can pin up to 2 of their own posts to the top of the subreddit. This tag used to be called announcements.            | binary flag | author actions on the post (other than commenting)              | filtering            |
| 17 | banned_by     | c      | ma            | f               | The only values are NaN and moderators.                                                                                   | categorical | mod reactions to either post content or comments activity on it | filtering            |
| 21 | distinguished | c      | aa            | f               | Mods can tag posts as distinguished, usually used for subreddit management. Use this field for filtering out these posts. | categorical | author actions on the post (other than commenting)              | filtering            |

In [18]:
filter_fields = selected_fields_df[selected_fields_df['analysis_role'] == 'f']['field_name'].to_list()
filter_fields

['stickied', 'banned_by', 'distinguished']

In [19]:
df_2017['stickied'].value_counts(dropna=False)

False    62433
True         5
Name: stickied, dtype: int64

In [24]:
df_2017.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'brand_safe',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'created', 'post_hint', 'preview', 'banned_by', 'edited',
       'crosspost_parent', 'crosspost_parent_list', 'thumbnail_height',
       'thumbnail_width', 'author_cakeday', 'distinguished', 'removal_reason',
       'suggested_sort', 'approved_at_utc', 'banned_at_utc', 'view_count',
       'gilded', 'media_embed', 'secure_media_embed', 'author_created_utc',
       'author_fullname', 'media', 'secure_media'],
      dtype='object')

In [28]:
# Print the key fields of every stickied submission for manual review.
stickied_mask = df_2017['stickied'] == True
for _, post in df_2017[stickied_mask].iterrows():
    print(
        '====',
        post['author_flair_text'],
        pd.to_datetime(post['created'], unit='s'),
        post['full_link'],
        post['title'],
        post['selftext'],
        sep='\n',
    )

====
WEB DEVELOPER
2017-12-29 10:30:05
https://www.reddit.com/r/AskDocs/comments/7mream/please_do_not_downvote_submissions_if_a_question/
Please do not downvote submissions! If a question is not appropriate, please 'report' it to bring it to our attention.

====
Physician | Moderator
2017-11-22 21:17:32
https://www.reddit.com/r/AskDocs/comments/7equza/help_save_net_neutrality_so_we_can_keep_doing_our/
Help Save Net Neutrality so we can keep doing our work at /r/AskDocs!

====
This user has not yet been verified.
2017-10-11 07:30:37
https://www.reddit.com/r/AskDocs/comments/75l2qc/thank_you/
Thank You
I just wanted to post a quick thank you.

A few days ago I asked for advice for my 4 year old and ended up going in and finding an infection.  She’s on the mend now and I needed that advice for my 2 year old and took her in this morning after hearing her gasp for breath after a coughing fit last night.  She had a very minor fever and no other obvious signs to me but an X-ray later we found

^ In this data sample, of the 5 stickied submissions, 4 were announcements and 1 was a thank-you.

In [31]:
df_2017['distinguished'].value_counts(dropna=False)

NaN          62434
moderator        4
Name: distinguished, dtype: int64

In [32]:
# Print the key fields of every moderator-distinguished submission for manual review.
distinguished_mask = df_2017['distinguished'] == 'moderator'
for _, post in df_2017[distinguished_mask].iterrows():
    print(
        '====',
        post['author_flair_text'],
        pd.to_datetime(post['created'], unit='s'),
        post['full_link'],
        post['title'],
        post['selftext'],
        sep='\n',
    )

====
WEB DEVELOPER
2017-12-29 10:30:05
https://www.reddit.com/r/AskDocs/comments/7mream/please_do_not_downvote_submissions_if_a_question/
Please do not downvote submissions! If a question is not appropriate, please 'report' it to bring it to our attention.

====
Physician | Moderator
2017-11-22 21:17:32
https://www.reddit.com/r/AskDocs/comments/7equza/help_save_net_neutrality_so_we_can_keep_doing_our/
Help Save Net Neutrality so we can keep doing our work at /r/AskDocs!

====
Web Developer
2017-07-15 19:10:47
https://www.reddit.com/r/AskDocs/comments/6nfekv/in_case_it_wasnt_obvious_abusive_language_will/
In case it wasn't obvious, abusive language will not be tolerated in this sub.
In light recent events, please be advised that any form of bullying or abusive language towards anybody is unacceptable, and will result in a permanent ban from the sub.

If you notice anything, please hit the report button.

We appreciate your cooperation in keeping this sub a safe place for everyone!
====


In [33]:
df_2017['banned_by'].value_counts(dropna=False)

NaN           59411
moderators     3027
Name: banned_by, dtype: int64

In [34]:
# Print the key fields of the first few moderator-banned submissions for manual review.
banned_mask = df_2017['banned_by'] == 'moderators'
for _, post in df_2017[banned_mask].head().iterrows():
    print(
        '====',
        post['author_flair_text'],
        pd.to_datetime(post['created'], unit='s'),
        post['full_link'],
        post['title'],
        post['selftext'],
        sep='\n',
    )

====
None
2018-01-01 05:33:25
https://www.reddit.com/r/AskDocs/comments/7nb5nw/redness_but_not_jockitch/
Redness but not JockItch?
nan
====
None
2018-01-01 04:39:41
https://www.reddit.com/r/AskDocs/comments/7nauj8/epinephrine_and_hydroxyzine_eli5/
Epinephrine and Hydroxyzine ELI5
nan
====
None
2018-01-01 02:39:59
https://www.reddit.com/r/AskDocs/comments/7na56p/what_is_the_maximum_healthy_range_for_an_lppla2/
What is the maximum healthy range for an Lp-PLA2 test?
nan
====
None
2018-01-01 02:36:19
https://www.reddit.com/r/AskDocs/comments/7na4ei/nsfw_possible_std/
(Nsfw) possible STD
nan
====
None
2018-01-01 02:25:16
https://www.reddit.com/r/AskDocs/comments/7na221/clubbed_nails_update_do_i_have_it/
Clubbed nails update? Do I have it?
nan


In [55]:
def filter_submissions(df):
    """Drop submissions that are flagged as mod posts or banned posts.

    A row is removed when at least one of these holds:

    * ``stickied`` is True
    * ``distinguished`` equals ``'moderator'``
    * ``banned_by`` equals ``'moderators'``

    Missing values in those columns never match, so such rows are kept.
    The row counts before/after and the percentage removed are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``stickied``, ``distinguished`` and ``banned_by``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame without the matching rows. ``df`` itself is
        left unmodified.
    """
    original_rows_count = len(df)
    print(f'Original rows count: {original_rows_count}')

    excluded = (
        df['stickied'].eq(True)
        | df['distinguished'].eq('moderator')
        | df['banned_by'].eq('moderators')
    )
    # Select positionally with a boolean mask instead of dropping by index
    # label: a label-based drop would also remove unrelated rows that happen
    # to share a duplicated index label with a filtered row.
    # .copy() returns an independent frame so later column assignments on the
    # result do not trigger chained-assignment warnings.
    kept = df.loc[(~excluded).to_numpy()].copy()

    print(f'Filtered rows count: {len(kept)}')
    # An empty input would otherwise divide by zero.
    if original_rows_count:
        pct_removed = (1 - len(kept) / original_rows_count) * 100
    else:
        pct_removed = 0.0
    print(f'% of rows filtered out: {pct_removed:.2f}')

    return kept

In [135]:
df_2017 = filter_submissions(df_2017)

Original rows count: 62438
Filtered rows count: 59406
% of rows filtered out: 4.86


In [48]:
display(Markdown(selected_fields_df[selected_fields_df['analysis_role'] != 'f'].to_markdown()))

|    | field_name            | type   | reddit_role   | analysis_role   | notes                                                                                                                                                                                                                   | type_long   | reddit_role_long                                                | analysis_role_long   |
|---:|:----------------------|:-------|:--------------|:----------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------------------------------------------------|:---------------------|
|  0 | author                | s      | a             | i               | Note the [deleted] and [removed] entries.                                                                                                                                                                               | short text  | author info                                                     | id                   |
|  1 | author_flair_text     | c      | a             | a               | 31% NaN values.                                                                                                                                                                                                         | categorical | author info                                                     | analysis             |
|  2 | created_utc           | t      | p             | a               | Dups present.                                                                                                                                                                                                           | timestamp   | post details                                                    | analysis             |
|  3 | domain                | s      | p             | a               | Domain where the post originated from.                                                                                                                                                                                  | short text  | post details                                                    | analysis             |
|  4 | full_link             | u      | p             | r               | A link to the post on Reddit.                                                                                                                                                                                           | url         | post details                                                    | reference            |
|  5 | id                    | s      | p             | i               | Post id                                                                                                                                                                                                                 | short text  | post details                                                    | id                   |
|  6 | locked                | b      | ma            | a               | Only 6 values are true, everything else false.                                                                                                                                                                          | binary flag | mod reactions to either post content or comments activity on it | analysis             |
|  7 | num_comments          | n      | ga            | a               | nan                                                                                                                                                                                                                     | numeric     | general subreddit users reactions to the post                   | analysis             |
|  8 | num_crossposts        | n      | aa            | a               | Both NaN and zeros present. Few values >0.                                                                                                                                                                              | numeric     | author actions on the post (other than commenting)              | analysis             |
|  9 | over_18               | b      | p             | a               | 98% false. Looks like a NSFW-type label on the post content.                                                                                                                                                            | binary flag | post details                                                    | analysis             |
| 10 | pinned                | b      | aa            | a               | Users can pin up to 4 posts to their profile.                                                                                                                                                                           | binary flag | author actions on the post (other than commenting)              | analysis             |
| 11 | score                 | n      | ga            | a               | The score is based on up and down votes.                                                                                                                                                                                | numeric     | general subreddit users reactions to the post                   | analysis             |
| 12 | selftext              | l      | p             | a               | Can have [deleted] as values.                                                                                                                                                                                           | long text   | post details                                                    | analysis             |
| 13 | spoiler               | b      | aa            | a               | Spoiler tags are used to mark spoiler content, and they can blur the preview or thumbnails. Both mods and post authors can add a spoiler tag on a post. There were 30 true values in the sample, so decided to keep it. | binary flag | author actions on the post (other than commenting)              | analysis             |
| 15 | title                 | l      | p             | a               | Title of the post, can be very long.                                                                                                                                                                                    | long text   | post details                                                    | analysis             |
| 16 | url                   | u      | p             | r               | Url to the original post if crossposted or from other source.                                                                                                                                                           | url         | post details                                                    | reference            |
| 18 | edited                | t      | aa            | a               | 86% NaNs.                                                                                                                                                                                                               | timestamp   | author actions on the post (other than commenting)              | analysis             |
| 19 | crosspost_parent      | s      | p             | t               | Cross-post parent post id.                                                                                                                                                                                              | short text  | post details                                                    | transform            |
| 20 | crosspost_parent_list | l      | p             | t               | This ultimately contains the body text of a crossposted post. Just have to pull if out of the list of dicts.                                                                                                            | long text   | post details                                                    | transform            |
| 22 | author_fullname       | s      | a             | i               | Unclear what this is, and lots of NaNs, but decided to keep for now.                                                                                                                                                    | short text  | author info                                                     | id                   |

## Next step: transform to get the post body  
using these fields: crosspost_parent, crosspost_parent_list

In [61]:
df_2017['crosspost_parent'].value_counts(dropna=False).head()

NaN          59386
t3_7j3ns3        1
t3_7dj18d        1
t3_7eea0a        1
t3_7g8asa        1
Name: crosspost_parent, dtype: int64

In [59]:
df_2017['crosspost_parent'].isna().value_counts(dropna=False)

True     59386
False       20
Name: crosspost_parent, dtype: int64

In [64]:
df_2017['crosspost_parent_list'].isna().value_counts(dropna=False).head()

True     59386
False       20
Name: crosspost_parent_list, dtype: int64

In [69]:
df_2017.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'brand_safe',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'created', 'post_hint', 'preview', 'banned_by', 'edited',
       'crosspost_parent', 'crosspost_parent_list', 'thumbnail_height',
       'thumbnail_width', 'author_cakeday', 'distinguished', 'removal_reason',
       'suggested_sort', 'approved_at_utc', 'banned_at_utc', 'view_count',
       'gilded', 'media_embed', 'secure_media_embed', 'author_created_utc',
       'author_fullname', 'media', 'secure_media'],
      dtype='object')

In [71]:
df_2017[df_2017['crosspost_parent_list'].notna()][[
    'crosspost_parent', 
    'crosspost_parent_list',
    'selftext'
]]

Unnamed: 0,crosspost_parent,crosspost_parent_list,selftext
46,t3_7n9uyb,"[{'approved_at_utc': None, 'approved_by': None...",
247,t3_7n24f7,"[{'approved_at_utc': None, 'approved_by': None...",
553,t3_7mhq3z,"[{'approved_at_utc': None, 'approved_by': None...",
1221,t3_7lxy1e,"[{'approved_at_utc': None, 'approved_by': None...",
2227,t3_7kofhb,"[{'approved_at_utc': None, 'approved_by': None...",
2403,t3_7kfcm4,"[{'approved_at_utc': None, 'approved_by': None...",
2758,t3_7jvfqf,"[{'approved_at_utc': None, 'approved_by': None...",
2849,t3_7jus0u,"[{'approved_at_utc': None, 'approved_by': None...",
3084,t3_7jjypl,"[{'approved_at_utc': None, 'approved_by': None...",
3156,t3_7j68ls,"[{'approved_at_utc': None, 'approved_by': None...",


If it's a crosspost, then the submission body is either blank or contains something like '[deleted]'. If it's not a crosspost, then the `crosspost_parent_list` field is NaN.

In [75]:
# Columns available on the crossposted rows. Built directly from df_2017
# because the `test` frame is only assigned in a later cell, so referencing
# `test` here fails with NameError on a fresh Restart & Run All.
df_2017[df_2017['crosspost_parent_list'].notna()].columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'brand_safe',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'created', 'post_hint', 'preview', 'banned_by', 'edited',
       'crosspost_parent', 'crosspost_parent_list', 'thumbnail_height',
       'thumbnail_width', 'author_cakeday', 'distinguished', 'removal_reason',
       'suggested_sort', 'approved_at_utc', 'banned_at_utc', 'view_count',
       'gilded', 'media_embed', 'secure_media_embed', 'author_created_utc',
       'author_fullname', 'media', 'secure_media'],
      dtype='object')

In [86]:
# Look at how `crosspost_parent_list` is structured on the first two crossposts.
test = df_2017[df_2017['crosspost_parent_list'].notna()].head(2)

for _, row in test.iterrows():
    print('=====')
    parents = row['crosspost_parent_list']
    print(len(parents))  # lists of length 1 in the data sample
    print(parents[0].keys())

=====
1
dict_keys(['approved_at_utc', 'approved_by', 'archived', 'author', 'author_flair_css_class', 'author_flair_text', 'banned_at_utc', 'banned_by', 'brand_safe', 'can_gild', 'can_mod_post', 'clicked', 'contest_mode', 'created', 'created_utc', 'distinguished', 'domain', 'downs', 'edited', 'gilded', 'hidden', 'hide_score', 'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self', 'is_video', 'likes', 'link_flair_css_class', 'link_flair_text', 'locked', 'media', 'media_embed', 'mod_note', 'mod_reason_by', 'mod_reason_title', 'mod_reports', 'name', 'num_comments', 'num_crossposts', 'num_reports', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'quarantine', 'removal_reason', 'report_reasons', 'saved', 'score', 'secure_media', 'secure_media_embed', 'selftext', 'selftext_html', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_name_prefixed', 'subreddit_type', 'suggested_sort', 'thumbnail', 'thumbnail_height', 'thumbnail_width', 'title', 'ups', 'url', 'user

In [104]:
# Show where each of the first few crossposts came from and the start of its body.
test = df_2017[df_2017['crosspost_parent_list'].notna()].head()

for _, row in test.iterrows():
    print('\n=====')
    print(row['num_crossposts'])
    print(row['is_self'])
    parents = row['crosspost_parent_list']
    print(len(parents))  # lists of length 1 in the data sample
    first_parent = parents[0]
    print(f"Subreddit: {first_parent['subreddit']}")
    print(f"{first_parent['selftext'][:400]}...")


=====
0.0
False
1
Subreddit: medical
**If you have any ideas on how to fix this or have similar issues, *please* take the time to read and let me know. I'm suffering and I honestly feel like dying.**
----------

I'm a teen male. 

**Conditions that I already have that might affect this** : I have dust and fur and plant allergies, and had food allergies as a kid but they have cleared, it seems. I have asthma as well. In 2015, I broke ...

=====
0.0
False
1
Subreddit: DiagnoseMe
Hey guys.

Yesterday and today I had a massive headache at the point of orgasm that goes from the back of my head to the top, mainly on the left side. The severity fades quickly after about a minute, but the headache still lingers on the top left side of my head for hours. I've never experienced this before since yesterday.

What could this be? Is this something I should be worried about?...

=====
0.0
False
1
Subreddit: STD
Hello! I've always thought I've had pimples on my butt for a few years (I'm 19 now, livi

We get the submission body and the crosspost subreddit from the `crosspost_parent_list` field; all other relevant fields are duplicates of the submission-level fields. 

In [107]:
test = df_2017[df_2017['crosspost_parent_list'].notna()].copy()
test['selftext']

46               
247              
553              
1221             
2227             
2403             
2758             
2849             
3084             
3156             
3359    [deleted]
3567             
3618             
3901    [deleted]
5126             
5307             
5431             
6736             
7297    [deleted]
7543             
Name: selftext, dtype: object

In [128]:
# Scratch check: confirm that the `type(...) == list` comparison is true for a
# list value (swap in the commented-out float('nan') to try a non-list value).
f = [1, 2]#float('nan')
print(type(f))
if type(f) == list:
    print('test')

<class 'list'>
test


In [136]:
np.nan

nan

In [139]:
def get_crosspost_body(lst):
    """Return the body text of the first parent post in a crosspost list.

    Parameters
    ----------
    lst : list of dict, or any non-list value
        A ``crosspost_parent_list`` cell. Non-crossposts carry NaN here.

    Returns
    -------
    object
        The ``'selftext'`` value of the first entry, or ``np.nan`` when ``lst``
        is not a list, is an empty list, or the first entry has no
        ``'selftext'`` key.
    """
    # Only the first parent is read; this assumes all listed parents share the
    # same body (they could have been edited later on, though).
    if isinstance(lst, list) and lst:
        return lst[0].get('selftext', np.nan)
    return np.nan

In [140]:
# Use the crosspost parent's body where a parent list exists; otherwise keep
# the row's own `selftext` (a plain .apply would overwrite it with NaN).
has_parent = test['crosspost_parent_list'].notna()
parent_body = test['crosspost_parent_list'].apply(get_crosspost_body)
test['selftext'] = np.where(has_parent, parent_body, test['selftext'])

In [141]:
test['selftext']

46      **If you have any ideas on how to fix this or ...
247     Hey guys.\n\nYesterday and today I had a massi...
553     Hello! I've always thought I've had pimples on...
1221    My mother is around 50 years old. So we've bee...
2227    Sorry for the low quality, it was hard to take...
2403    Last month, I had a visit to the doctor where ...
2758    (EDIT: Replace "amblyopia" with "strabismus," ...
2849    Hello, first--sorry for the length.\n\nHere's ...
3084    Last night I fell while snowboarding, like an ...
3156    I have been struggling for so long. A few mont...
3359                                            [deleted]
3567    Throw away account. I am posting this on other...
3618    I realize how rare this is but bear with me. L...
3901    Is it possible to have sleep apnea with out sn...
5126    I have read that certain malignancies or blood...
5307    **My short bio:** I'm an associate professor o...
5431      Not sure where to turn for this, my entire l...
6736    Where 

In [142]:
# Use the crosspost parent's body where a parent list exists; otherwise keep
# the submission's own `selftext` (a plain .apply would overwrite it with NaN).
is_crosspost = df_2017['crosspost_parent_list'].notna()
crosspost_body = df_2017['crosspost_parent_list'].apply(get_crosspost_body)
df_2017['selftext'] = np.where(is_crosspost, crosspost_body, df_2017['selftext'])

In [144]:
df_2017[df_2017['crosspost_parent_list'].notna()]['selftext']

46      **If you have any ideas on how to fix this or ...
247     Hey guys.\n\nYesterday and today I had a massi...
553     Hello! I've always thought I've had pimples on...
1221    My mother is around 50 years old. So we've bee...
2227    Sorry for the low quality, it was hard to take...
2403    Last month, I had a visit to the doctor where ...
2758    (EDIT: Replace "amblyopia" with "strabismus," ...
2849    Hello, first--sorry for the length.\n\nHere's ...
3084    Last night I fell while snowboarding, like an ...
3156    I have been struggling for so long. A few mont...
3359                                            [deleted]
3567    Throw away account. I am posting this on other...
3618    I realize how rare this is but bear with me. L...
3901    Is it possible to have sleep apnea with out sn...
5126    I have read that certain malignancies or blood...
5307    **My short bio:** I'm an associate professor o...
5431      Not sure where to turn for this, my entire l...
6736    Where 

Note: will need to filter out submissions with '[deleted]' body.

In [145]:
def get_crosspost_subreddits(lst):
    """Return the subreddit of every entry in a crosspost parent list.

    Parameters
    ----------
    lst : list or any other value
        One cell of the ``crosspost_parent_list`` column. When it is a list,
        every element must support ``element['subreddit']`` lookup.

    Returns
    -------
    list or float
        The ``'subreddit'`` values in input order when ``lst`` is a list
        (an empty list gives ``[]``); ``np.nan`` for any non-list value,
        such as a missing (NaN) cell.

    Raises
    ------
    KeyError
        If a dict element of the list has no ``'subreddit'`` key.
    """
    # isinstance, unlike an exact type comparison, also accepts list subclasses.
    if not isinstance(lst, list):
        return np.nan

    return [parent['subreddit'] for parent in lst]

In [146]:
# Trial run on `test` (defined earlier in the notebook): derive the subreddit list
# from each crosspost_parent_list cell via get_crosspost_subreddits.
test['crosspost_subreddits'] = test['crosspost_parent_list'].apply(get_crosspost_subreddits)

In [147]:
# Inspect the derived column on `test`.
test['crosspost_subreddits']

46               [medical]
247           [DiagnoseMe]
553                  [STD]
1221             [medical]
2227         [Dermatology]
2403             [medical]
2758           [optometry]
2849         [Dermatology]
3084           [AskDoctor]
3156               [MTHFR]
3359            [RedditMD]
3567                 [sex]
3618           [optometry]
3901          [SleepApnea]
5126    [Medical_Students]
5307                [IAmA]
5431          [needadvice]
6736           [Allergies]
7297               [Drugs]
7543     [SkincareAddicts]
Name: crosspost_subreddits, dtype: object

In [148]:
# Add the subreddit(s) of each row's crosspost parents to the full dataset;
# get_crosspost_subreddits returns NaN for cells that are not lists.
crosspost_parents = df_2017['crosspost_parent_list']
df_2017['crosspost_subreddits'] = crosspost_parents.apply(get_crosspost_subreddits)

In [150]:
# Check the new column on the rows that have a crosspost_parent_list.
df_2017.loc[df_2017['crosspost_parent_list'].notna(), 'crosspost_subreddits']

46               [medical]
247           [DiagnoseMe]
553                  [STD]
1221             [medical]
2227         [Dermatology]
2403             [medical]
2758           [optometry]
2849         [Dermatology]
3084           [AskDoctor]
3156               [MTHFR]
3359            [RedditMD]
3567                 [sex]
3618           [optometry]
3901          [SleepApnea]
5126    [Medical_Students]
5307                [IAmA]
5431          [needadvice]
6736           [Allergies]
7297               [Drugs]
7543     [SkincareAddicts]
Name: crosspost_subreddits, dtype: object

## Next step: filter out submissions whose body is `[deleted]`, and drop columns that are no longer relevant (those used only for filtering and transformation probably won't be needed beyond this point)

In [159]:
# List every column currently in df_2017 (including the derived crosspost_subreddits).
df_2017.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'brand_safe',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'created', 'post_hint', 'preview', 'banned_by', 'edited',
       'crosspost_parent', 'crosspost_parent_list', 'thumbnail_height',
       'thumbnail_width', 'author_cakeday', 'distinguished', 'removal_reason',
       'suggested_sort', 'approved_at_utc', 'banned_at_utc', 'view_count',
       'gilded', 'media_embed', 'secure_media_embed', 'author_created_utc',
       'author_fullname', 'media', 'secure_media', 'crosspost_subreddits'],
      dtype='object')

In [190]:
# Distribution of `pinned`; dropna=False keeps missing values as their own row.
df_2017['pinned'].value_counts(dropna=False)

NaN      43562
False    15844
Name: pinned, dtype: int64

In [183]:
# Distribution of `is_crosspostable`; dropna=False keeps missing values as their own row.
df_2017['is_crosspostable'].value_counts(dropna=False)

NaN      39730
True     14372
False     5304
Name: is_crosspostable, dtype: int64

In [182]:
# For every row with num_crossposts > 0, print its link and its `is_crosspostable` value.
crossposted = df_2017.loc[df_2017['num_crossposts'] > 0, ['full_link', 'is_crosspostable']]
for full_link, is_crosspostable in zip(crossposted['full_link'], crossposted['is_crosspostable']):
    print('====')
    print(full_link)
    print(is_crosspostable)

====
https://www.reddit.com/r/AskDocs/comments/7nasux/i_have_a_myofibroma_now_what_do_i_do/
True
====
https://www.reddit.com/r/AskDocs/comments/7m3ovt/infectious_diseasemicrobiology_my_dads_culture/
True
====
https://www.reddit.com/r/AskDocs/comments/7lsyme/got_wisdom_teeth_out_monday_morning_still_crying/
True
====
https://www.reddit.com/r/AskDocs/comments/7lpwt4/which_do_you_think_is_the_more_likely_reason_for/
True
====
https://www.reddit.com/r/AskDocs/comments/7hksyp/fracture_at_base_of_proximal_phalanx_big_toe/
True
====
https://www.reddit.com/r/AskDocs/comments/7fgfxf/neurosurgery_is_cranioplasty_necessary_73m/
True
====
https://www.reddit.com/r/AskDocs/comments/7f3ixe/26m_blackgrey_spot_on_gum_behind_back_lower_molar/
True
====
https://www.reddit.com/r/AskDocs/comments/7er16w/is_this_a_bunion/
True
====
https://www.reddit.com/r/AskDocs/comments/7cofno/bulging_disc_need_help_with_lower_lumbar_spine/
True
====
https://www.reddit.com/r/AskDocs/comments/7co46z/ibuprofen_drowziness/


In [176]:
# Distribution of `num_crossposts` among rows that have no crosspost_parent_list.
no_crosspost_parent = df_2017['crosspost_parent_list'].isna()
df_2017.loc[no_crosspost_parent, 'num_crossposts'].value_counts(dropna=False)

NaN    39730
0.0    19644
1.0       11
2.0        1
Name: num_crossposts, dtype: int64

In [174]:
# Distribution of `domain` among rows that have a crosspost_parent_list.
has_crosspost_parent = df_2017['crosspost_parent_list'].notna()
df_2017.loc[has_crosspost_parent, 'domain'].value_counts(dropna=False)

self.medical             3
self.Dermatology         2
self.optometry           2
self.DiagnoseMe          1
self.STD                 1
self.AskDoctor           1
self.MTHFR               1
self.RedditMD            1
self.sex                 1
self.SleepApnea          1
self.Medical_Students    1
self.IAmA                1
self.needadvice          1
self.Allergies           1
self.Drugs               1
self.SkincareAddicts     1
Name: domain, dtype: int64

In [168]:
# Interpret `author_created_utc` as seconds since the epoch and count each
# resulting timestamp, keeping missing values (NaT) in the tally.
author_created_at = pd.to_datetime(df_2017['author_created_utc'], unit='s')
author_created_at.value_counts(dropna=False)

NaT                    36027
2017-02-07 23:14:23       50
2016-12-08 15:55:02       41
2016-06-25 20:07:05       27
2016-12-14 00:03:03       24
                       ...  
2017-02-07 23:43:33        1
2017-05-08 18:34:29        1
2017-02-27 15:51:00        1
2015-05-23 16:50:26        1
2016-06-23 23:58:10        1
Name: author_created_utc, Length: 18243, dtype: int64

In [189]:
# Render, as a markdown table, the fields whose analysis_role is neither 'f' nor 't'
# (presumably the filter/transform-only fields -- confirm against selected_fields_df).
kept_role_mask = ~selected_fields_df['analysis_role'].isin(['f', 't'])
kept_fields_table = selected_fields_df[kept_role_mask].to_markdown()
display(Markdown(kept_fields_table))

|    | field_name        | type   | reddit_role   | analysis_role   | notes                                                                                                                                                                                                                   | type_long   | reddit_role_long                                                | analysis_role_long   |
|---:|:------------------|:-------|:--------------|:----------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------|:----------------------------------------------------------------|:---------------------|
|  0 | author            | s      | a             | i               | Note the [deleted] and [removed] entries.                                                                                                                                                                               | short text  | author info                                                     | id                   |
|  1 | author_flair_text | c      | a             | a               | 31% NaN values.                                                                                                                                                                                                         | categorical | author info                                                     | analysis             |
|  2 | created_utc       | t      | p             | a               | Dups present.                                                                                                                                                                                                           | timestamp   | post details                                                    | analysis             |
|  3 | domain            | s      | p             | a               | Domain where the post originated from.                                                                                                                                                                                  | short text  | post details                                                    | analysis             |
|  4 | full_link         | u      | p             | r               | A link to the post on Reddit.                                                                                                                                                                                           | url         | post details                                                    | reference            |
|  5 | id                | s      | p             | i               | Post id                                                                                                                                                                                                                 | short text  | post details                                                    | id                   |
|  6 | locked            | b      | ma            | a               | Only 6 values are true, everything else false.                                                                                                                                                                          | binary flag | mod reactions to either post content or comments activity on it | analysis             |
|  7 | num_comments      | n      | ga            | a               | nan                                                                                                                                                                                                                     | numeric     | general subreddit users reactions to the post                   | analysis             |
|  8 | num_crossposts    | n      | aa            | a               | Both NaN and zeros present. Few values >0.                                                                                                                                                                              | numeric     | author actions on the post (other than commenting)              | analysis             |
|  9 | over_18           | b      | p             | a               | 98% false. Looks like a NSFW-type label on the post content.                                                                                                                                                            | binary flag | post details                                                    | analysis             |
| 10 | pinned            | b      | aa            | a               | Users can pin up to 4 posts to their profile.                                                                                                                                                                           | binary flag | author actions on the post (other than commenting)              | analysis             |
| 11 | score             | n      | ga            | a               | The score is based on up and down votes.                                                                                                                                                                                | numeric     | general subreddit users reactions to the post                   | analysis             |
| 12 | selftext          | l      | p             | a               | Can have [deleted] as values.                                                                                                                                                                                           | long text   | post details                                                    | analysis             |
| 13 | spoiler           | b      | aa            | a               | Spoiler tags are used to mark spoiler content, and they can blur the preview or thumbnails. Both mods and post authors can add a spoiler tag on a post. There were 30 true values in the sample, so decided to keep it. | binary flag | author actions on the post (other than commenting)              | analysis             |
| 15 | title             | l      | p             | a               | Title of the post, can be very long.                                                                                                                                                                                    | long text   | post details                                                    | analysis             |
| 16 | url               | u      | p             | r               | Url to the original post if crossposted or from other source.                                                                                                                                                           | url         | post details                                                    | reference            |
| 18 | edited            | t      | aa            | a               | 86% NaNs.                                                                                                                                                                                                               | timestamp   | author actions on the post (other than commenting)              | analysis             |
| 22 | author_fullname   | s      | a             | i               | Unclear what this is, and lots of NaNs, but decided to keep for now.                                                                                                                                                    | short text  | author info                                                     | id                   |

In [197]:
# Names of the fields whose analysis_role is neither 'f' nor 't', as a plain list.
kept_role_mask = ~selected_fields_df['analysis_role'].isin(['f', 't'])
selected_fields_df.loc[kept_role_mask, 'field_name'].tolist()

['author',
 'author_flair_text',
 'created_utc',
 'domain',
 'full_link',
 'id',
 'locked',
 'num_comments',
 'num_crossposts',
 'over_18',
 'pinned',
 'score',
 'selftext',
 'spoiler',
 'title',
 'url',
 'edited',
 'author_fullname']

In [198]:
# Final fields selection.
# The list order determines the column order wherever df_2017[selected_fields] is used.
# Candidates left out of the final selection: 'pinned', 'spoiler', 'author_fullname'.

selected_fields = [
    'author', 'author_flair_text',
    'created_utc', 'domain', 'full_link', 'id',
    'locked', 'num_comments', 'num_crossposts', 'over_18',
    'score',
    'selftext',  # edited to include the body of crossposted posts
    'title', 'url', 'edited',
    'crosspost_subreddits',  # created field
]

In [200]:
# Preview the first rows restricted to the selected columns.
df_2017.loc[:, selected_fields].head()

Unnamed: 0,author,author_flair_text,created_utc,domain,full_link,id,locked,num_comments,num_crossposts,over_18,score,selftext,title,url,edited,crosspost_subreddits
0,[deleted],,1514764452,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbw...,7nbwtn,False,0,0.0,False,2,[deleted],Appendicitis removed 1 month ago but feel a pa...,https://www.reddit.com/r/AskDocs/comments/7nbw...,,
1,XenonCSGO,This user has not yet been verified.,1514764122,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvsv,False,2,0.0,False,1,"So I've taken Pepcid AC, 10mg at morning and n...","1 Stopped taking Pepcid AC, now suffering symp...",https://www.reddit.com/r/AskDocs/comments/7nbv...,,
2,[deleted],,1514764055,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbv...,7nbvln,False,1,0.0,False,1,[deleted],My grandma has neck/back pain and little to no...,https://www.reddit.com/r/AskDocs/comments/7nbv...,,
3,DavisTheMagicSheep,This user has not yet been verified.,1514763799,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbu...,7nburb,False,2,0.0,False,1,"I've had a cold for the last couple days now, ...",My ears feel like there is pressure inside of ...,https://www.reddit.com/r/AskDocs/comments/7nbu...,,
4,Dontgetscooped,This user has not yet been verified.,1514763188,self.AskDocs,https://www.reddit.com/r/AskDocs/comments/7nbs...,7nbsw2,False,1,0.0,False,1,(first about me : 32 white male 5 foot 5 225l...,IBS maybe?,https://www.reddit.com/r/AskDocs/comments/7nbs...,,


In [201]:
# List all columns of df_2017 itself (the full frame, not only `selected_fields`).
df_2017.columns

Index(['author', 'author_flair_css_class', 'author_flair_text', 'brand_safe',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'id', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'locked', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'retrieved_on',
       'score', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'thumbnail', 'title', 'url', 'whitelist_status',
       'created', 'post_hint', 'preview', 'banned_by', 'edited',
       'crosspost_parent', 'crosspost_parent_list', 'thumbnail_height',
       'thumbnail_width', 'author_cakeday', 'distinguished', 'removal_reason',
       'suggested_sort', 'approved_at_utc', 'banned_at_utc', 'view_count',
       'gilded', 'media_embed', 'secure_media_embed', 'author_created_utc',
       'author_fullname', 'media', 'secure_media', 'crosspost_subreddits'],
      dtype='object')