In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from cleantext import clean

## Switching to a Different Tool
<b>new scraper</b>: timesearch, https://github.com/voussoir/timesearch <br>
quickly access in a <b>database view</b>: https://sqlitebrowser.org/dl/#macos
- the old scraper (DownloaderForReddit) cannot bypass the 1000 post limit

#### Submission (Post) Columns
- idstr: (string) identifiers for content
- created: (string) unix timestamp
- title: (string) post title
- selftext: (string) post body text
- score: (int) post score, calculated as upvotes minus downvotes

#### Comment Columns
- ...
- parent: (string) 'idstr' of the parent comment/reply
- submission: (string) 'idstr' of the submission that this comment belongs to
- body: (string) comment body text

In [2]:
# Load only the columns described above. 'created' is read as raw text
# (dtype object) here and converted to datetimes in a later cell.
post_columns = ['idstr', 'created', 'title', 'selftext', 'score']
comment_columns = ['idstr', 'created', 'parent', 'submission', 'body', 'score']
created_as_text = {'created': 'object'}

post_df = pd.read_csv('submissions.csv', usecols=post_columns, dtype=created_as_text)
comment_df = pd.read_csv('comments.csv', usecols=comment_columns, dtype=created_as_text)

In [3]:
# Preview a few random rows from each table. The sample is seeded so the
# rows shown stay the same across "Restart Kernel -> Run All".
PREVIEW_SEED = 42
display(post_df.sample(5, random_state=PREVIEW_SEED))
display(comment_df.sample(5, random_state=PREVIEW_SEED))

Unnamed: 0,idstr,created,title,selftext,score
7705,t3_f4h76j,1581806996,Online Student Looking for which maker spaces ...,Hello! I recently joined the university as an ...,3
15461,t3_mjvoyn,1617540770,I wonder if UMich will do the same thing,,11
6298,t3_e55fni,1575320435,LS&amp;A Course Guide Extension Not Working,I tried adding this Google Chrome extension th...,1
1022,t3_a5f0p8,1544591016,Ling 209,"Anyone here in ling 209. If so, how are you st...",1
11422,t3_hwf440,1595508797,PSA Regarding Switching out of North Campus Dorms,Spare yourself the trouble of asking. No one i...,53


Unnamed: 0,idstr,created,parent,submission,body,score
136525,t1_h96esps,1629132956,t1_h968qrp,t3_p5ign6,A refresher is always worth it. \nYou'll find ...,38.0
147037,t1_hfct49n,1633358559,t3_q0v7zc,t3_q0v7zc,My fav product is back in stock at Trader Joe’s,1.0
239357,t1_jaomqyb,1677797767,t3_11gb91p,t3_11gb91p,have you tried talking to someone working ther...,2.0
156495,t1_hnhbj4p,1638811208,t3_racbna,t3_racbna,"Which 4, specifically? That is going to be tou...",10.0
23271,t1_em4ne69,1556622041,t1_em40hi2,t3_bitb63,"Hmm, I should definitely look into this.",1.0


## Modifications

#### Comment Table: contains an anomalous row

In [4]:
# 'created' is still text at this point (loaded with dtype object), so this
# is a string sort; the five rows that sort last in ascending order are shown.
comment_df.sort_values('created', ascending=False).iloc[:5]

Unnamed: 0,idstr,created,parent,submission,body,score
185762,1,uofm,448,,,
244177,t1_jdt465f,1679874767,t1_jds8hiw,t3_121lwen,I think violence and retaliatory ethnic cleani...,1.0
244176,t1_jdt43x2,1679874739,t3_122x93t,t3_122x93t,Little to none,1.0
244175,t1_jdt3306,1679874256,t1_jdszdv3,t3_123162e,The timing seems bad is all. Maybe if they cou...,1.0
244174,t1_jdt2wfo,1679874169,t1_jdsqyxu,t3_123162e,They bargained again today.,1.0


In [5]:
# Drop the malformed row by content instead of by a hard-coded index label.
# The anomalous row shown above has the non-numeric value 'uofm' in 'created',
# so keeping only rows whose 'created' parses as a number removes it.
# Unlike `drop([185762], inplace=True)`, this is safe to re-run and does not
# depend on the row's position in a particular scrape.
has_numeric_created = pd.to_numeric(comment_df['created'], errors='coerce').notna()
comment_df = comment_df[has_numeric_created].copy()

#### Comment and Submission Tables: convert unix time to UTC

In [6]:
# Interpret 'created' as seconds elapsed since the Unix epoch, in both tables.
for frame in (post_df, comment_df):
    frame['created'] = pd.to_datetime(frame['created'], unit='s')

first_post = post_df['created'].min()
last_post = post_df['created'].max()
print('Earliest post created at %s. Latest post created at %s.' % (first_post, last_post))

Earliest post created at 2018-09-01 07:17:52. Latest post created at 2023-03-26 23:10:20.


#### Comment Table: some early comments (circa 2010) got mixed in when testing the scraper

In [7]:
# Report the time span covered by the comment table.
comment_created_range = (comment_df['created'].min(), comment_df['created'].max())
print('Earliest comment created at %s. Latest comment created at %s.' % comment_created_range)

Earliest comment created at 2010-08-13 22:37:19. Latest comment created at 2023-03-26 23:52:47.


In [8]:
# Keep only comments created at or after the earliest scraped post.
# The cutoff is derived from post_df rather than pasted in as a literal
# timestamp, so it stays correct if the submissions file is re-scraped.
earliest_post_created = post_df['created'].min()

# .copy() makes the filtered frame independent of the original, so later
# column assignments on comment_df do not trigger SettingWithCopyWarning.
comment_df = comment_df[comment_df['created'] >= earliest_post_created].copy()
print('Earliest comment created at %s. Latest comment created at %s.' % (comment_df['created'].min(), comment_df['created'].max()))

Earliest comment created at 2018-09-01 09:40:47. Latest comment created at 2023-03-26 23:52:47.


#### Text content in general: emoji, line breaks, '[deleted]', and '[removed]' are not helpful
- clean-text, https://github.com/jfilter/clean-text/blob/main/README.md

In [9]:
# Regex -> replacement map applied to free-text columns:
#   - '[deleted]' / '[removed]' placeholders are blanked out
#   - line breaks become '. '
# Patterns are raw strings: '\[' in a normal string literal is an invalid
# escape sequence, which Python warns about.
text_replacements = {r'\[deleted\]': '', r'\[removed\]': '', r'\n': '. '}

post_df['selftext'] = post_df['selftext'].replace(text_replacements, regex=True)
comment_df['body'] = comment_df['body'].replace(text_replacements, regex=True)

In [10]:
def _clean_keep_case(text):
    """Run clean-text's `clean` with emoji removal on and lowercasing off."""
    return clean(text, no_emoji=True, lower=False)


# Apply the same cleaning to every free-text column, in the original order.
for frame, column in ((post_df, 'title'), (post_df, 'selftext'), (comment_df, 'body')):
    frame[column] = frame[column].apply(_clean_keep_case)

#### Submission Table: combine 'title' and 'selftext' to serve as a single input for models

In [11]:
# Join title and body with a period in between ("<title>. <selftext>").
# Vectorised concatenation replaces the row-wise apply, whose x[0] / x[1]
# positional access on a label-indexed row is deprecated in recent pandas.
# astype(str) mirrors what '{}'.format(...) did for any non-string value.
post_df['title_text_combined'] = (
    post_df['title'].astype(str) + '. ' + post_df['selftext'].astype(str)
)

In [12]:
# First five cleaned submissions, including the combined text column.
post_df.head(5)

Unnamed: 0,idstr,created,title,selftext,score,title_text_combined
0,t3_9c188n,2018-09-01 07:17:52,Prelaw Frats,"Hey everyone, I'm hoping to rush a prelaw frat...",0,"Prelaw Frats. Hey everyone, I'm hoping to rush..."
1,t3_9c2dxp,2018-09-01 11:30:10,Can you leave lecture early?,,8,Can you leave lecture early?.
2,t3_9c2ui7,2018-09-01 12:57:28,Thanksgiving Recess,I was wondering how strict it was to be in cla...,1,Thanksgiving Recess. I was wondering how stric...
3,t3_9c2ywe,2018-09-01 13:16:49,Dual degree question,,0,Dual degree question.
4,t3_9c35x9,2018-09-01 13:47:46,LSA,,0,LSA.


In [13]:
# First five cleaned comments.
comment_df.head(5)

Unnamed: 0,idstr,created,parent,submission,body,score
4755,t1_e57ddqc,2018-09-01 09:40:47,t1_e55e71k,t3_9bmgmx,this is maybe true but I'll still take a smell...,1.0
4756,t1_e57gijd,2018-09-01 11:34:08,t3_9c2dxp,t3_9c2dxp,"It's not a big deal, just be respectful about ...",56.0
4757,t1_e57ih7p,2018-09-01 12:28:47,t3_9c2dxp,t3_9c2dxp,They also record 280 lectures online so you ca...,17.0
4758,t1_e57khfz,2018-09-01 13:13:55,t3_9c2dxp,t3_9c2dxp,"No, the principal will find out and put it on ...",90.0
4759,t1_e57kio1,2018-09-01 13:14:36,t3_9c2dxp,t3_9c2dxp,May also want to let your prof know that you w...,12.0


## Final Checks

In [14]:
# Summarise column dtypes and non-null counts of the submission table.
post_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21327 entries, 0 to 21326
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   idstr                21327 non-null  object        
 1   created              21327 non-null  datetime64[ns]
 2   title                21327 non-null  object        
 3   selftext             21327 non-null  object        
 4   score                21327 non-null  int64         
 5   title_text_combined  21327 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 999.8+ KB


In [15]:
# Summarise column dtypes and non-null counts of the comment table.
comment_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239422 entries, 4755 to 244177
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   idstr       239422 non-null  object        
 1   created     239422 non-null  datetime64[ns]
 2   parent      239422 non-null  object        
 3   submission  239422 non-null  object        
 4   body        239422 non-null  object        
 5   score       239421 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 12.8+ MB


#### Comments with no matching submission in the submission table

In [16]:
# Count comments whose 'submission' id is (True) / is not (False) present
# among the scraped submissions.
comment_df['submission'].isin(post_df['idstr']).value_counts()

True     130277
False    109145
Name: submission, dtype: int64

In [17]:
# "Stray" comments: their submission id is absent from the submission table.
# Rows 244136 and 244133 are real comments under
# "How to ask for a (small amount) scholarship..."; rows 244162 and 244161
# are simply not found on reddit.
has_scraped_submission = comment_df['submission'].isin(post_df['idstr'])
stray_comment_df = comment_df[~has_scraped_submission]
stray_comment_df.sort_values('created', ascending=False).head(5)

Unnamed: 0,idstr,created,parent,submission,body,score
244162,t1_jdsu9d5,2023-03-26 22:37:00,t1_jdofwyk,t3_121hldu,what's that lol?,1.0
244161,t1_jdsu6fq,2023-03-26 22:36:24,t1_jds1cwy,t3_121hldu,what's that? Tried looking it up but couldn't ...,1.0
244136,t1_jds6dsq,2023-03-26 19:45:20,t1_jdq13h3,t3_122ca64,You'd be surprised how much extra funding you ...,1.0
244133,t1_jds34un,2023-03-26 19:22:34,t3_122ca64,t3_122ca64,Your odds may be better applying for small sch...,1.0
244082,t1_jdrb6v8,2023-03-26 16:04:51,t3_x0rlho,t3_x0rlho,I am looking for 2 tickets for the CoE graduat...,1.0


In [18]:
# The remaining comments under that same submission are unmatched as well.
same_submission = stray_comment_df['submission'].eq('t3_122ca64')
stray_comment_df[same_submission]

Unnamed: 0,idstr,created,parent,submission,body,score
244055,t1_jdq13h3,2023-03-26 08:04:51,t3_122ca64,t3_122ca64,Lol what. Everyone wants a scholarship dude. T...,22.0
244059,t1_jdqg1u9,2023-03-26 11:38:13,t3_122ca64,t3_122ca64,I wouldn't say this has 0% chance of working b...,20.0
244061,t1_jdqjc5l,2023-03-26 12:15:41,t3_122ca64,t3_122ca64,You can ask but I wouldn't use this reason as ...,3.0
244066,t1_jdqsfek,2023-03-26 13:42:22,t3_122ca64,t3_122ca64,"I tried something similar, I asked if any of m...",3.0
244069,t1_jdqtxnh,2023-03-26 13:55:01,t3_122ca64,t3_122ca64,Sadly not gonna happen. Depending on the lengt...,2.0
244133,t1_jds34un,2023-03-26 19:22:34,t3_122ca64,t3_122ca64,Your odds may be better applying for small sch...,1.0
244136,t1_jds6dsq,2023-03-26 19:45:20,t1_jdq13h3,t3_122ca64,You'd be surprised how much extra funding you ...,1.0


In [19]:
# The original post has not been scraped by the scraper.
# Look it up by submission id (the id its comments point to) rather than by
# a title substring: a substring that does not exactly occur in the title
# would also return an empty frame, so an empty result would prove nothing.
post_df[post_df['idstr'] == 't3_122ca64']

Unnamed: 0,idstr,created,title,selftext,score,title_text_combined


#### Option 1: Proceed only with comments that can be matched to their submission
#### **Option 2**: Use all comments as usual and only take advantage of the structure when it is available

In [20]:
# Independent copy of the comments whose submission was scraped.
matched_to_post = comment_df['submission'].isin(post_df['idstr'])
not_stray_comment_df = comment_df[matched_to_post].copy()

In [21]:
# True when the number of distinct comment ids equals the number of rows.
len(not_stray_comment_df) == not_stray_comment_df['idstr'].nunique()

True

In [22]:
# True when the number of distinct submission ids equals the number of rows.
len(post_df) == post_df['idstr'].nunique()

True

#### Export

In [26]:
# Write the cleaned tables to CSV without the index column.
# NOTE(review): 'comments.csv' is also the raw file read by the load cell, so
# this export overwrites the input and a second "Run All" would read the
# already-cleaned data. Consider a separate output name once downstream
# readers of 'comments.csv' have been checked.
for frame, out_path in ((post_df, 'posts.csv'), (comment_df, 'comments.csv')):
    frame.to_csv(out_path, index=False)