In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from nltk import pos_tag, word_tokenize
from wordcloud import WordCloud
import subprocess
import zipfile
import os

In [2]:
# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Download for better lemmatization support
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/kostas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kostas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kostas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kostas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/kostas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Step 1: Define the command to download the dataset
command = [
    "kaggle", "datasets", "download",
    "-d", "rishabhkausish/reddit-depression-dataset",
    "-p", "./reddit-depression-dataset"
]

# Step 2: Execute the command using subprocess.run()
try:
    result = subprocess.run(command, check=True, text=True, capture_output=True)
    print(result.stdout)
    print("Dataset downloaded successfully.")
except subprocess.CalledProcessError as e:
    print("Error occurred while downloading the dataset:")
    print(e.stderr)
    exit()

# Step 3: Define the path to the downloaded ZIP file
zip_path = "./reddit-depression-dataset/reddit-depression-dataset.zip"
extract_path = "./reddit-depression-dataset"

# Step 4: Check if the ZIP file exists and extract it
if os.path.exists(zip_path):
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print("Dataset extracted successfully.")
    except zipfile.BadZipFile:
        print("Error: The ZIP file is corrupted.")
        exit()
else:
    print("Error: ZIP file not found.")
    exit()

# Step 5: Locate the CSV file and read it with pandas
csv_file_path = os.path.join(extract_path, "reddit_depression_dataset.csv")

if os.path.exists(csv_file_path):
    depression = pd.read_csv(csv_file_path)
    print("Dataset loaded successfully.")
else:
    print("Error: CSV file not found in the extracted folder.")
    exit()


Dataset URL: https://www.kaggle.com/datasets/rishabhkausish/reddit-depression-dataset
License(s): CC0-1.0
Downloading reddit-depression-dataset.zip to ./reddit-depression-dataset


Dataset downloaded successfully.
Dataset extracted successfully.


  depression = pd.read_csv(csv_file_path)


Dataset loaded successfully.


In [4]:
# Convert the dataset to tsv and compress it
depression.to_csv("reddit_depression_dataset.tsv.gz", sep='\t', index=False, compression='gzip')

In [3]:
# Read the compressed tsv file and display columns
depression = pd.read_csv('reddit_depression_dataset.tsv.gz', sep='\t')
print(depression.head())

  depression = pd.read_csv('reddit_depression_dataset.tsv.gz', sep='\t')


  Unnamed: 0     subreddit                                              title  \
0      47951  DeepThoughts                             Deep thoughts underdog   
1      47952  DeepThoughts  I like this sub, there's only two posts yet I ...   
2      47957  DeepThoughts                                           Rebirth!   
3      47959  DeepThoughts  "I want to be like water. I want to slip throu...   
4      47960  DeepThoughts                                          Who am I?   

                                                body  upvotes   created_utc  \
0  Only when we start considering ourselves, the ...      4.0  1.405309e+09   
1  Anyway: Human Morality is a joke so long as th...      4.0  1.410568e+09   
2  Hello. \nI am the new guy in charge here (Besi...      6.0  1.416458e+09   
3                                                NaN     25.0  1.416512e+09   
4  You could take any one cell in my body and kil...      5.0  1.416516e+09   

   num_comments  label  
0           N

In [6]:
# Display descriptive statistics for numerical columns
# Get an overview of the DataFrame, including counts of non-null entries per column and data types
print(depression.info())
print("-" * 50)  

# Some info about the column data
for column in depression.columns:
    unique_values = depression[column].unique()
    print(f"Column '{column}' has {len(unique_values)} unique values.")
    print(unique_values)
    print("-" * 50)  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4551065 entries, 0 to 4551064
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    object 
 1   subreddit     object 
 2   title         object 
 3   body          object 
 4   upvotes       float64
 5   created_utc   float64
 6   num_comments  float64
 7   label         float64
dtypes: float64(4), object(4)
memory usage: 277.8+ MB
None
--------------------------------------------------
Column 'Unnamed: 0' has 2471259 unique values.
['47951' '47952' '47957' ... 10742232 10742243 10742244]
--------------------------------------------------
Column 'subreddit' has 59 unique values.
['DeepThoughts' 'teenagers' '4.0' nan '32.0' '15' '23.0' '13.0' '33.0'
 '8.0' '26.0' '6' '30' '5' '6.0' '11.0' '12' '16' '7' '4' '61' '10' '7.0'
 '5.0' '15.0' '9' '29.0' '12.0' '17.0' '9.0' '8' '42.0' '107' '11' '10.0'
 '19.0' '18.0' 'happy' '146.0' '25.0' '22.0' 'SuicideWatch' '1402326041'
 '16.0' 'depression'

In [4]:
# Count of NaNs in each column
print(depression.isna().sum())

Unnamed: 0       803448
subreddit       1808267
title           1808270
body            2269302
upvotes         2080353
created_utc     2080535
num_comments    2194402
label           2080535
dtype: int64


### Preprocessing
1) Deal with NA values

In [5]:
# Drop rows where 'body' column is NaN or empty
depression = depression.dropna(subset=['body']).loc[depression['body'].str.strip() != '']

# I believe there is no reason to drop this one, but since they are only 9 rows, whatever
depression = depression.dropna(subset=['Unnamed: 0'])

# Drop rows where all columns except 'subreddit' and 'Unnamed: 0' are NaN
depression = depression.dropna(how='all', subset=[col for col in depression.columns if col not in ['subreddit', 'Unnamed: 0']])
print(depression.info())
print(depression.isna().sum())
print(len(depression['Unnamed: 0'].unique()))

<class 'pandas.core.frame.DataFrame'>
Index: 2280916 entries, 0 to 4551064
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    object 
 1   subreddit     object 
 2   title         object 
 3   body          object 
 4   upvotes       float64
 5   created_utc   float64
 6   num_comments  float64
 7   label         float64
dtypes: float64(4), object(4)
memory usage: 156.6+ MB
None
Unnamed: 0           0
subreddit            0
title                2
body                 0
upvotes         272085
created_utc     272253
num_comments    356458
label           272253
dtype: int64
2009018


In [6]:
# Replace NaN values in 'num_comments' column with 0
depression['num_comments'] = depression['num_comments'].fillna(0)

In [8]:
# Should we do the same with 'upvotes' ?
# Check if there is a 0 in the 'upvotes' column
has_zero = (depression['upvotes'] == 0).any()
print(has_zero)

True


2. Deal with variable type and 'created_utc' variable

In [9]:
# We should convert the columns 'upvotes', 'num_comments' and 'label' to type int32
for column_name in ['upvotes', 'num_comments', 'label']:
    depression[column_name] = depression[column_name].astype('Int64')
print(depression.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2280916 entries, 0 to 4551064
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Unnamed: 0    object 
 1   subreddit     object 
 2   title         object 
 3   body          object 
 4   upvotes       Int64  
 5   created_utc   float64
 6   num_comments  Int64  
 7   label         Int64  
dtypes: Int64(3), float64(1), object(4)
memory usage: 163.1+ MB
None


In [10]:
# Convert UTC epoch time to human-readable time (as another column)
depression['created_datetime'] = pd.to_datetime(depression['created_utc'], unit='s')
print(depression[['created_datetime']].head())

# Drop 'created_utc' column
depression= depression.drop(columns=['created_utc']) 

     created_datetime
0 2014-07-14 03:35:09
1 2014-09-13 00:31:19
2 2014-11-20 04:31:58
4 2014-11-20 20:36:52
5 2014-11-22 19:17:39


### Language processing part

1. Make use of Computerome to apply MapReduce

The lemmatizer uses only the 'body' column which contains the actual text. We need to save our preprocessed data set on a tsv.gz file, upload it on Computerome and perform lemmatization in chunks there (use of many nodes and cores to parallelize). 

This is essential since the lemmatization of 10.000 entries takes ~2m to execute, so ~2.2m entries would take ~8h. 

To make the process even faster and smoother, instead of working with the whole data set, we create a subset containing only the 'body' column plus an identifier column in order to be possible to merge it back later.

Before that, we find that there are duplicated text values in the 'body' column. Some shorter phrases can be genuine post by different people (phrases like 'I need help') while longer ones might be actual duplicates that we want to discard.

In [11]:
# Define a word count threshold (e.g., 5 words)
word_count_threshold = 5

# Identify duplicates in the dataset
print("Identifying duplicates...")
duplicates_mask = depression['body'].duplicated(keep=False)

# Filter the original dataframe to get only duplicates
print("Extracting duplicate entries...")
duplicates = depression[duplicates_mask].copy()
print(f"Number of duplicate entries: {len(duplicates)}")

# Count words only in the duplicate entries
print("Counting words in duplicate entries...")
duplicates['word_count'] = duplicates['body'].str.split().str.len()

# Separate the duplicates into short and long ones based on word count
print("Separating short and long duplicates...")
short_duplicates = duplicates[duplicates['word_count'] <= word_count_threshold]
long_duplicates = duplicates[duplicates['word_count'] > word_count_threshold]
print(f"Number of short duplicates: {len(short_duplicates)}")
print(f"Number of long duplicates: {len(long_duplicates)}")

# Keep all short posts, including short duplicates
print("Filtering the dataset to keep all short posts...")

# Create a mask to keep non-duplicate rows or short duplicates
short_duplicate_indices = short_duplicates.index
keep_mask = (~duplicates_mask) | (depression.index.isin(short_duplicate_indices))

# Apply the mask to get the filtered DataFrame
depression_filtered = depression[keep_mask]

# For long posts, drop duplicates but keep the first occurrence
print("Handling long duplicates...")
depression_filtered_long = long_duplicates.drop_duplicates(subset='body', keep='first')

# Concatenate filtered data: keep all short posts and filtered long posts without duplicates
print("Combining short and filtered long posts...")
final_filtered_depression = pd.concat([depression_filtered, depression_filtered_long], ignore_index=True).drop_duplicates().reset_index(drop=True)

# Drop the 'word_count' column from final_filtered_depression if it exists
if 'word_count' in final_filtered_depression.columns:
    final_filtered_depression = final_filtered_depression.drop(columns=['word_count'])

# Output the final shape
print(f"Final filtered dataset shape: {final_filtered_depression.shape}")


Identifying duplicates...
Extracting duplicate entries...
Number of duplicate entries: 351117
Counting words in duplicate entries...
Separating short and long duplicates...
Number of short duplicates: 67229
Number of long duplicates: 283888
Filtering the dataset to keep all short posts...
Handling long duplicates...
Combining short and filtered long posts...
Final filtered dataset shape: (2000382, 8)


In [18]:
final_filtered_depression

Unnamed: 0.1,Unnamed: 0,subreddit,title,body,upvotes,num_comments,label,created_datetime
0,47951,DeepThoughts,Deep thoughts underdog,"Only when we start considering ourselves, the ...",4,0,0,2014-07-14 03:35:09
1,47952,DeepThoughts,"I like this sub, there's only two posts yet I ...",Anyway: Human Morality is a joke so long as th...,4,1,0,2014-09-13 00:31:19
2,47957,DeepThoughts,Rebirth!,Hello. \nI am the new guy in charge here (Besi...,6,1,0,2014-11-20 04:31:58
3,47960,DeepThoughts,Who am I?,You could take any one cell in my body and kil...,5,4,0,2014-11-20 20:36:52
4,47969,DeepThoughts,What is the limit of the knowledge and power a...,"Personally, I think it's infinite. We will alw...",8,23,0,2014-11-22 19:17:39
...,...,...,...,...,...,...,...,...
2000377,10370336,depression,[M/25] Is it even worth it anymore?,Hello and thanks for reading this.\n\nI'm sorr...,5,0,1,2020-12-15 21:45:22
2000378,10438776,depression,"Working with depression, the never ending sick...",Having to go to work everyday while depressed ...,7,8,1,2021-04-06 04:56:33
2000379,10449200,depression,"Hi, bye.",I just wanna fuckin die\n\nI just wanna feel a...,9,7,1,2021-04-24 18:29:30
2000380,10468508,depression,Writing a letter to depression,Hey! I wrote a post like this before and recei...,5,0,1,2021-05-30 15:54:57


Prepare the file to be submitted to Computerome for MapReduce

In [None]:
# Reset the index of the DataFrame
final_filtered_depression.reset_index(drop=True, inplace=True)
# Add a unique identifier
final_filtered_depression['unique_id'] = final_filtered_depression.index

# Define a function to perform all the replacements (tab and multiple spaces) in one go
# Their existence could potentially create problems during the process on Computerome
def clean_text(text):
    return re.sub(r'\s+', ' ', re.sub(r'[\n\t]', ' ', text))

# Apply the function to the 'body' column
final_filtered_depression['body'] = final_filtered_depression['body'].apply(lambda x: clean_text(x) if isinstance(x, str) else x)

# Verify the count of tab characters in the 'body' column before saving
tab_count = final_filtered_depression['body'].apply(lambda x: x.count('\t')).sum()
print(f"Number of tab characters in 'body' column: {tab_count}")

# Verify the count of multiple continuous spaces in the 'body' column before saving
multiple_spaces_count = final_filtered_depression['body'].apply(lambda x: len(re.findall(r'\s{2,}', x))).sum()
print(f"Number of occurrences of multiple continuous spaces in 'body' column: {multiple_spaces_count}")

# Select the necessery columns
depression_computerome = final_filtered_depression[['body', 'unique_id']]

# Write the dataset that "waits" to be merged with the Computerome output
final_filtered_depression.to_csv("waiting_for_merge.tsv.gz", sep='\t', index=False, compression='gzip', header=True)

# Write the dataset to be processed on computerome to a file
depression_computerome.to_csv("preprocessed_depression_dataset_full.tsv.gz", sep='\t', index=False, compression='gzip', header=False)

Number of tab characters in 'body' column: 0
Number of occurrences of multiple continuous spaces in 'body' column: 0


: 

In [26]:
final_filtered_depression

Unnamed: 0.1,Unnamed: 0,subreddit,title,body,upvotes,num_comments,label,created_datetime,unique_id
0,47951,DeepThoughts,Deep thoughts underdog,"Only when we start considering ourselves, the ...",4,0,0,2014-07-14 03:35:09,0
1,47952,DeepThoughts,"I like this sub, there's only two posts yet I ...",Anyway: Human Morality is a joke so long as th...,4,1,0,2014-09-13 00:31:19,1
2,47957,DeepThoughts,Rebirth!,Hello. I am the new guy in charge here (Beside...,6,1,0,2014-11-20 04:31:58,2
3,47960,DeepThoughts,Who am I?,You could take any one cell in my body and kil...,5,4,0,2014-11-20 20:36:52,3
4,47969,DeepThoughts,What is the limit of the knowledge and power a...,"Personally, I think it's infinite. We will alw...",8,23,0,2014-11-22 19:17:39,4
...,...,...,...,...,...,...,...,...,...
2000377,10370336,depression,[M/25] Is it even worth it anymore?,Hello and thanks for reading this. I'm sorry f...,5,0,1,2020-12-15 21:45:22,2000377
2000378,10438776,depression,"Working with depression, the never ending sick...",Having to go to work everyday while depressed ...,7,8,1,2021-04-06 04:56:33,2000378
2000379,10449200,depression,"Hi, bye.",I just wanna fuckin die I just wanna feel alri...,9,7,1,2021-04-24 18:29:30,2000379
2000380,10468508,depression,Writing a letter to depression,Hey! I wrote a post like this before and recei...,5,0,1,2021-05-30 15:54:57,2000380


In [27]:
depression_computerome

Unnamed: 0,body,unique_id
0,"Only when we start considering ourselves, the ...",0
1,Anyway: Human Morality is a joke so long as th...,1
2,Hello. I am the new guy in charge here (Beside...,2
3,You could take any one cell in my body and kil...,3
4,"Personally, I think it's infinite. We will alw...",4
...,...,...
2000377,Hello and thanks for reading this. I'm sorry f...,2000377
2000378,Having to go to work everyday while depressed ...,2000378
2000379,I just wanna fuckin die I just wanna feel alri...,2000379
2000380,Hey! I wrote a post like this before and recei...,2000380
