In [1]:
# Imports
import pandas as pd

In [2]:
# Columns to pull from the raw submissions CSV; every other column is skipped
submission_columns = [
    'id', 'author', 'created_utc', 'link_flair_text', 'num_comments',
    'score', 'selftext', 'subreddit', 'title', 'total_awards_received',
]

# Read only the selected columns into the working frame
df = pd.read_csv('mentalhealth_post_data.csv', usecols=submission_columns)

# Preview the first rows
df.head()

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,link_flair_text,total_awards_received
0,[deleted],1514790708,7ndwjs,1,12,[deleted],mentalhealth,Anyone else tired of the Roller coaster?,,
1,neurolonnet,1514798245,7nebqv,0,1,,mentalhealth,2 x Neurolon Brain Supplement,,
2,netches,1514800286,7nef8w,4,0,"I have clinical depression and panic disorder,...",mentalhealth,Am I even sick enough to justify going to hosp...,,
3,adammiln,1514802485,7nejf6,0,0,,mentalhealth,How To Relax Your Mind | Easy Method To Relax ...,,
4,Valkyria6,1514802717,7nejua,1,1,"Since I was a child, I remember having moments...",mentalhealth,Sudden feeling of deep disgust at random times,,


In [3]:
# Clean the raw submissions held in `df` and save them to
# "clean_<subreddit>_submission_data.csv".
# NOTE: this cell rebinds `df` and drops columns it reads, so re-run the load
# cell before re-running this one.

# Drop posts with a deleted author, or with deleted/removed selftext.
# NaN never equals a string, so rows with a missing author or selftext survive
# this filter (missing selftext becomes "" below).
# .copy() yields an independent frame, so the column assignments further down
# do not trigger pandas' SettingWithCopyWarning.
keep = (df['author'] != "[deleted]") & ~df['selftext'].isin(["[deleted]", "[removed]"])
df = df[keep].copy()

# Replace NaNs: empty string for the text columns, 0 for the award count
df = df.fillna({'selftext': "", 'title': "", 'link_flair_text': "",
                'total_awards_received': 0})

# Check how many rows have 0 total awards and their share of all rows.
# The second value is a fraction in [0, 1], not a number out of 100.
no_awards = int((df['total_awards_received'] == 0).sum())
print("Number of total awards = 0:", no_awards)
print("Percentage of total awards = 0:",
      no_awards/len(df.index))

# Convert created_utc (seconds since the Unix epoch) to a date string (DD/MM/YYYY)
df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.strftime('%d/%m/%Y')

# Join title and selftext with a space so the last word of the title does not
# fuse with the first word of the body. strip() removes the lone separator left
# behind when one of the two parts is empty (and any other outer whitespace).
df['text'] = (df['title'] + " " + df['selftext']).str.strip()

# Remove rows where text is empty, keep/reorder the output columns,
# and renumber the rows from 0
df = df.loc[df['text'] != "",
            ['subreddit', 'date', 'author', 'id', 'num_comments',
             'score', 'text', 'link_flair_text']].reset_index(drop=True)

# Print how many flairs are empty and their share of all rows (fraction in [0, 1])
empty_flairs = int((df['link_flair_text'] == "").sum())
print("Number of empty flairs:", empty_flairs)
print("Percentage of empty flairs:",
      empty_flairs/len(df.index))

# Display and print size
print(df.head())
print(len(df.index))

# Save to csv, named after the subreddit of the first row.
# .iloc[0] is positional, so it does not depend on the index labels.
df.to_csv("clean_{}_submission_data.csv".format(df['subreddit'].iloc[0]), index=False)

Number of total awards = 0: 221245
Percentage of total awards = 0: 0.9984520822427208
Number of empty flairs: 142331
Percentage of empty flairs: 0.6423226889542755
221588


In [2]:
# Reload the cleaned submissions written by the cleaning step
df = pd.read_csv('clean_mentalhealth_submission_data.csv')

# Number of distinct authors; dropna=False counts a missing author as one
# distinct value, matching len(unique())
df['author'].nunique(dropna=False)

140576