In [1]:
# Imports
import pandas as pd
import os

In [2]:
"""
Desc: Basic preprocessing function
Input: f (string) - file name
Output: Cleaned csv file saved to clean_data folder
        and prints basic statistics to console
"""
def basic_data_preproc(f):
    
    try:
        # Load submissions
        df = pd.read_csv(f, 
                         usecols = ['id', 'author', 'created_utc', 'link_flair_text', 'num_comments',
                                    'score', 'selftext', 'subreddit', 'title', 'total_awards_received'])

        # Remove rows with deleted authors and deleted/removed selftext
        df = df[df['author'] != "[deleted]"]
        df = df[df['selftext'] != "[deleted]"]
        df = df[df['selftext'] != "[removed]"]

        # Keep rows in created_utc if it is int
        df = df.loc[df['created_utc'].apply(type) == int]

        # Replace NaNs in selftext, title, and link_flair with empty string
        df['selftext'] = df['selftext'].fillna("")
        df['title'] = df['title'].fillna("")
        df['link_flair_text'] = df['link_flair_text'].fillna("")

        # Replace NaNs in total_awards_received with 0
        df['total_awards_received'] = df['total_awards_received'].fillna(0)

        # Check how many are 0 in total awards received and the percentage
        print("Number of total awards = 0:", len(df[df['total_awards_received'] == 0]))
        print("Percentage of total awards = 0:",
              len(df[df['total_awards_received'] == 0])/len(df.index))

        # Convert created_utc to date (DD/MM/YYYY)
        df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.strftime('%d/%m/%Y')

        # Concatenate title and selftext column together
        df['text'] = df['title'] + " " + df['selftext']

        # Remove rows where text is empty
        df = df[df['text'] != ""]

        # Reset index
        df = df.reset_index(drop=True)

        # Select columns to keep and reorder them
        df = df[['subreddit', 'date', 'author', 'id', 'num_comments', 
                 'score', 'text', 'link_flair_text']]

        # Print how many flairs are empty and the percentage
        print("Number of empty flairs:", len(df[df['link_flair_text'] == ""]))
        print("Percentage of empty flairs:",
              len(df[df['link_flair_text'] == ""])/len(df.index))

        # Print number of unique authors
        print("Number of unique authors", len(df.author.unique()))

        # Display and print size
        print(df.head())
        print(len(df.index))

        # Save to csv with subreddit name and year
        df.to_csv("./clean_data/clean_{}_{}_submission_data.csv".format(df['subreddit'][0],
                                                                        f[-8:-4]),
                  index=False)
    except:
        print("The file you are trying to read either does not exist or is missing a column.\n" +
              "The columns are: id, author, created_utc, link_flair_text, num_comments," +
              "score, selftext, subreddit, title, total_awards_received")

In [3]:
# If the clean_data folder does not exist, create it
# (exist_ok makes this safe to re-run and avoids a check-then-create race)
os.makedirs("./clean_data/", exist_ok=True)

# Read the CSV files into a list.
# csvs is bound before the scan so that, when raw_data is missing, the next
# cell sees an empty list rather than an undefined name or a stale value
# left over from an earlier run.
csvs = []
try:
    with os.scandir("./raw_data/") as entries:
        # Keep non-hidden csv files and prefix them with their directory;
        # sorted because os.scandir yields entries in arbitrary order
        csvs = sorted('./raw_data/' + entry.name
                      for entry in entries
                      if entry.name.endswith(".csv") and not entry.name.startswith('.'))

    print(csvs)
except (FileNotFoundError, NotADirectoryError):
    print("The raw_data folder does not exist")

['./raw_data/ADHD_post_data_2019.csv', './raw_data/ADHD_post_data_2020.csv', './raw_data/anxiety_post_data_2019.csv', './raw_data/anxiety_post_data_2020.csv', './raw_data/depression_help_post_data_2020.csv', './raw_data/mentalhealth_post_data_L3YR.csv', './raw_data/overcoming_post_data_2020.csv', './raw_data/sad_post_data_2019.csv', './raw_data/sad_post_data_2020.csv']


In [4]:
# Run the basic preprocessing on every raw csv file, echoing each path
# first so the statistics printed after it can be matched to their file
for raw_csv in csvs:
    print(raw_csv)
    basic_data_preproc(raw_csv)

./raw_data/ADHD_post_data_2019.csv


  if (await self.run_code(code, result,  async_=asy)):


Number of total awards = 0: 38184
Percentage of total awards = 0: 1.0
Number of empty flairs: 28313
Percentage of empty flairs: 0.7414885816048606
Number of unique authors 21790
  subreddit        date                author      id  num_comments score  \
0      ADHD  01/01/2019         DirtJunkie133  abd11x          13.0     1   
1      ADHD  01/01/2019  Lin_the_pillow_artis  abd7q9           5.0     1   
2      ADHD  01/01/2019         Fleetfeathers  abda0t          12.0     1   
3      ADHD  01/01/2019         UnleashedDebs  abdd13           4.0     1   
4      ADHD  01/01/2019              liluglee  abdj4w           1.0     1   

                                                text link_flair_text  
0  Recently diagnosed, need to talk to others who...                  
1  Really annoyed at my familys drunk friends So ...                  
2  The medication journey: a current disappointme...                  
3  Wearables, REM sleep detected while gaming not...                  
4  P

  if (await self.run_code(code, result,  async_=asy)):


Number of total awards = 0: 13142
Percentage of total awards = 0: 0.9901303397875386
Number of empty flairs: 12869
Percentage of empty flairs: 0.9695622692684397
Number of unique authors 9940
  subreddit        date          author      id  num_comments  score  \
0       sad  01/01/2020  xXBleachlessXx  eib72q             1      1   
1       sad  01/01/2020  RogueGamerFoxx  eib8nq             2      1   
2       sad  01/01/2020          wocka3  eibgli             0      1   
3       sad  01/01/2020  bobthebillyman  eibu9r             3      1   
4       sad  01/01/2020  TheWeebWalking  eibuvy             5      1   

                                                text link_flair_text  
0  The Last Hug It was the day. The day she had b...                  
1  New Years Blues I so much want to feel optimis...                  
2  It’s been 14 days I made a commitment, unfortu...                  
3  New year🙃 At midnight I text my gf a really he...                  
4    The memes will 