In [1]:
import os
import json
import pandas as pd
import numpy as np
import datetime

1. The title of the thread CHECK
2. The subreddit that the thread corresponds to CHECK
3. The length of time it has been up on Reddit
4. The number of comments on the thread CHECK

# Features
----
### Required:
- `title`
- `subreddit`
- `num_comments`
- `created_utc`
- `pulled_time`
- `time_on_reddit` (created time minus pulled time)

### Additional Options
- `selftext`
- `post_hint` (needed to determine if the thread contains text or only title, image, link or video)

### Possibly Interesting
- `author`
- `subreddit_subscribers`
- `num_crossposts`
- `over_18`
- `ups`
- `edited`
- `parent_whitelist_status`
- `locked`

### Useless Features
----
Some features were particularly uneccessary

| feature | reason |
| --------| ------ |
| media_only | all False |
| pinned | all False |
| downs | all False |
| is_created_from_ads_ui | all False |
| discussion_type | all None |

### Skewed Features
----
These features could be useful but their skew needs to be considered

| feature | reason |
| --------| ------ |
| stickied | highly skewed to False |
| locked | highly skewed to False |
| author | unlikely to be many duplicates |
| edited | only ~7.5% edited |
| author_cakeday | only ~0.49% cakedays |

In [2]:
FINAL_FEATURES = ['subreddit', 'title', 'author', 'num_comments', 'created_utc', 'pulled_time',
                  'selftext', 'post_hint', 'subreddit_subscribers', 'num_crossposts', 'over_18',
                  'id', 'is_video', 'parent_whitelist_status']

# Functions
----

### Data Import Functions

In [3]:
def get_threads(folder, file):
    """Returns all threads from a specific file.
    Function also adds a time-stamp to the pulled data in the format YYYY-MM-DD-HH"""
    
    #Delcare a blank threads list which will be appended and returned
    threads = []
    
    #Open file
    with open(f"{folder}/{file}", "r") as f:
        pages = json.load(f)
        
        #Each file has 80 pages
        for page in pages:
            #Pull threads from each page, ass a pull_time equal to the file name
            for thread in page["data"]["children"]:
                thread["data"]["pulled_time"] = file
                threads.append(thread)
    return threads

In [4]:
def get_all_threads(data_folder="./data/json/"):
    """Returns all threads from a specifed folder. By default it is ./data/json/
    Function assumes files will be held in a folder for each date"""
    #Declare a blank lists of threads to append to and return
    all_threads = []
    
    #For each folder in ./data/json
    for folder in os.listdir(data_folder):
        
        #read each file and append collected threads to all threads.
        for file in os.listdir(f"{data_folder}/{folder}"):
            all_threads += get_threads(f"{data_folder}/{folder}", file)
    return all_threads

In [5]:
def create_df(threads, features):
    """This function will go through listed threads and check if a desired feature has
    a value. If it does not have a value it will assign np.NaN. This ensures our data
    all has the same shape when creating a DF."""
    
    #Declare a blank list of all rows to be appended and used to create a DF
    all_rows = []
    
    for thread in threads:
        row = []
        for feature in features:
            #Try to append a feature to a row
            try:
                row.append(thread["data"][feature])
            #If it can't append it is because the thread doesn't have the feature.
            #Append np.NaN
            except:
                row.append(np.NaN)
        all_rows.append(row)
        
    #create and return a DF with rows all_rows and features as columns
    return pd.DataFrame(all_rows, columns=features)

### Cleaning Functions

In [6]:
def pulled_time_to_datetime(pulled_time):
    """Converts the file names (pulled_time) to a DateTime Object.
    This function is just for clarity. A lambda would work fine.
    Side Note: I should have used timestamps when pulling. I now know for next time."""
    try:
        components = pulled_time.split("-")
        year = int(components[0])
        month = int(components[1])
        day = int(components[2])
        hour = int(components[3])
        return datetime.datetime(year, month, day, hour)
    
    #If it fails to create a Datetime Object
    except:
        #Check if it already is a dateimte object
        if isinstance(pulled_time, datetime.datetime):
            return pulled_time
        else:
            return np.NaN

In [7]:
def alert(feature, dataframe, check_type=False):
    """Simple Exception Handler.
    Prints the common exceptions expected when cleaning DF"""
    
    if feature not in dataframe.columns:
        print(f'Error: "{feature}" not in feature set!')
    elif check_type != False:
        if isinstance(dataframe[feature].iloc[1], check_type):
            print(f'Error: "{feature}" already a {check_type} object!')
    else:
        print(f'Unkown Error on "{feature}": Debug!')

In [8]:
def clean_df(dataframe):
    """Function cleans the DF and warns if features are not in the DF.
    If run multiple times it will not overwrite existing changes and will instead
    print a notification that something unexpected occured.
    
    NOTE: I plan on changing this. One of my peeves dealing with data is when
    I inadvertantly do additional tranformations that break the data. I wanted
    to basically detect if it was already cleaned before touching it but this
    is way too messy and should just be a series of if conditions."""
    
    #created_utc
    try:
        #created_utc is a timescape.Convert to Datetime
        dataframe["created_utc"] = dataframe["created_utc"].map(
                                   datetime.datetime.utcfromtimestamp)
        #assign the Timezone UTC
        dataframe["created_utc"] = dataframe["created_utc"].dt.tz_localize('UTC')
        #Convert froM UTC to Eastern Time
        dataframe["created_utc"] = dataframe["created_utc"].dt.tz_convert('US/Eastern')
        #Remove Time Zone Info
        dataframe["created_utc"] = dataframe["created_utc"].dt.tz_localize(None)
    except:
        alert("created_utc", dataframe, check_type=datetime.datetime)

    #pulled_time
    try:
        #pulled_time is a string. Add minutes and seconds. Convert to Datetime
        dataframe["pulled_time"] = dataframe["pulled_time"].map(pulled_time_to_datetime)
    except:
        alert("pulled_time", dataframe)
        
    try:
        #This is the only feature engineering in this section
        dataframe["time_on_reddit"] = (dataframe["pulled_time"] - dataframe["created_utc"])
        #convert to seconds
        dataframe["time_on_reddit"] = dataframe["time_on_reddit"].dt.seconds
    except:
        #Feature won't be in DF until after at least one clean.
        #If this is reported to not be in DF then created_utc or pulled_time is not
        alert("time_on_reddit", dataframe)
    
    #post hint
    try:
        #if post_hint is null, text, else post_hint
        dataframe["post_hint"] = np.where(dataframe["post_hint"].isnull(),
                                          "text",
                                          dataframe["post_hint"])
    except:
        alert("post_hint", dataframe)
        
    #parent_whitelist_status
    try:
        #if post_hint is null, text, else post_hint
        dataframe["parent_whitelist_status"] = np.where(dataframe["parent_whitelist_status"].isnull(),
                                                        "unknown",
                                                        dataframe["parent_whitelist_status"])
    except:
        alert("parent_whitelist_status", dataframe)
    
    #assign numerics to parent_whitelist_status
    try:
        value_dict = {"all_ads" : 3,
                      "some_ads" : 2,
                      "no_ads" : 1,
                      "unknown" : 0}
        dataframe["parent_whitelist_status"] = dataframe["parent_whitelist_status"].map(value_dict)
    except:
        alert("parent_whitelist_status", dataframe)

    #selftext
    try:
        #If selftext is "", None, else current text
        dataframe["selftext"] = np.where(dataframe["selftext"] == "",
                                         "None",
                                         dataframe["selftext"])     
    except:
        alert("selftext", dataframe)
        
    dataframe["selftext"] = np.where(dataframe["selftext"].isnull(),
                                     "None",
                                     dataframe["selftext"])
        
    return dataframe

### Note:
When possible adjust to improve efficiency based on Sophie's advise

In [9]:
def remove_duplicates(dataframe):
    """This is EXTREMELY inefficient. It takes over an hour to run.
    I believe the solution is to use better pandas slections as well as
    the mx feature. It works but it's definitely a brute force solution for now"""
    
    dups = dataframe["id"].value_counts().index[dataframe["id"].value_counts()>1]
    dataframe["pulled_time"] = pd.to_datetime(dataframe["pulled_time"])
    df = dataframe
    
    print(f"Removing {len(dups)} duplicates from DataFrame.")
    for i, thread_id in enumerate(dups):
        if i % 10000 == 0:
            print(f"Removing {i}/{len(dups)} from DataFrame")
            print(f"Current Time: {datetime.datetime.now()}")
        newest = datetime.datetime.min
        keep = -2
        
        for i in dataframe[dataframe["id"]==thread_id].index:
            if dataframe.iloc[i]["pulled_time"] > newest:
                keep = i
                newest = dataframe.iloc[i]["pulled_time"]
        arr = dataframe[dataframe["id"]==thread_id].index
        remove = np.delete(arr, np.where(arr == keep))
        df = df[~df.index.isin(remove)]
    return df

### Side Use
----
Function not actively used in notebook. Exists to collect all features from imported data.

In [11]:
def determine_all_features(threads):
    """Simple feature to determine all functions in pulled threads.
    This is only used to determine which features exist for EDA.
    Should be used in conjunction with pull all threads."""
    features = []
    for thread in threads:
        features += thread["data"].keys()
    return list(set(features))

## Main Function
----
#### Step 1: Get all information collected in the API pulls
Collect all threads that were pulled by `pull_threads_executable.py`. The files are stored in `"./data/json/"`

The raw data will not be included in full to the GitHub Reposistory as all of the files combined are quite large but a small portion will be included to show proof of concept. The file size is because I opted to pull and store all of the data from the API so I would have access to every possible feature during EDA. This actually worked out quite well as `post_hint` turned out to be incredibly relevant to determining whether the post type and I did not dicover this until late in the data collection process.

#### Step 2: Create a Pandas DataFrame of selected features
Iterate over all of the threads and ensure there is a value for every selected feature. If there is no value for a feature then assign np.NaN.

#### Step 3: Clean the data
There is not much data cleaning due to using the API. Cleaning Breakdown:
- `created_utc` - Changed to `DateTime`. Convert to EST to align with `pulled_time`. Remove TimeZone
- `pulled_time` - Correct the format to `YYYY-MM-DD HH:MM:SS` and change to `DateTime`
- `time_on_reddit` - Feature Engineering.
- `post_hint` - Change `np.NaN` to `text`
- `parent_whitelist_status` - There are a few nulls in here. Change to `"unkown"`
- `edited` - Assign `True` to edited threads instead of `Timestamp`
- `author_cakeday` - change `np.NaN` to `False` if not the author's cakeday
- `selftext` - Replace `np.NaN` with `None`

In [12]:
def main():
    threads = get_all_threads()
    thread_df = create_df(threads, FINAL_FEATURES)
    thread_df = clean_df(thread_df)
    thread_df = remove_duplicates(thread_df)
    
    thread_df.to_csv("../data/thread_df.csv", index=False)

# Create DataFrame
----
***Note:*** *I have disabled the main function as the json files are too large for me to put on github.
Some raw json files were included for testing purposes and a backup file is in /data in case information is overwritten*

In [13]:
#Caution: json files not included on Github. Running will result in data loss.
#if __name__ == "__main__":
#    main()

Removing 39406 duplicates from DataFrame.
Removing 0/39406 from DataFrame
Current Time: 2022-01-08 20:37:42.508604
Removing 10000/39406 from DataFrame
Current Time: 2022-01-08 20:53:49.545806
Removing 20000/39406 from DataFrame
Current Time: 2022-01-08 21:08:28.567072
Removing 30000/39406 from DataFrame
Current Time: 2022-01-08 21:21:53.714634


In [14]:
df = pd.read_csv("../data/thread_df.csv")
df.head()

Unnamed: 0,subreddit,title,author,num_comments,created_utc,pulled_time,selftext,post_hint,subreddit_subscribers,num_crossposts,over_18,id,is_video,parent_whitelist_status,time_on_reddit
0,Cringetopia,I hate this app,Highground69420,6388,2021-12-26 10:25:10,2021-12-26 21:00:00,,image,1630721,6,True,royzpp,False,1,38090
1,OldSchoolCool,Weed bust in 1970s,hugginuu,362,2021-12-26 17:21:40,2021-12-26 21:00:00,,image,15540465,0,False,rp7ci7,False,3,13100
2,Music,Music elitism is getting annoying.,MeldNoFake,3519,2021-12-26 05:37:53,2021-12-26 21:00:00,"Yes, you can listen to Pink Floyd, The Beatles...",text,28558871,0,False,roujf1,False,3,55327
3,news,Covid: Travel misery for tens of thousands wit...,FullAd2253,2620,2021-12-26 07:13:24,2021-12-26 21:00:00,,link,23982213,2,False,rovth5,False,3,49596
4,pics,Hélène Boudreu's actual graduation picture fro...,eliazer1,827,2021-12-26 17:34:00,2021-12-26 21:00:00,,image,28396351,5,True,rp7lry,False,3,12360
