In [25]:
import pandas as pd
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Read the comments and posts exports (in that order) from the working directory
comments_df, posts_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'the-reddit-dataset-dataset-comments.csv',
        'the-reddit-dataset-dataset-posts.csv',
    )
)

In [5]:
# Preview the comments table: print a label, then show its first five rows
print("Comments Dataset:")
comments_df.head(n=5)

Comments Dataset:


Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
0,comment,hyyz6g8,2r97t,datasets,False,1646173750,https://old.reddit.com/r/datasets/comments/t45...,Spatial problem: Suitability of new locations ...,0.0772,1
1,comment,hyyid7v,2r97t,datasets,False,1646167280,https://old.reddit.com/r/datasets/comments/sg9...,Have you tried toying around with GDELT or Ali...,0.0,2
2,comment,hyxp1qp,2r97t,datasets,False,1646156018,https://old.reddit.com/r/datasets/comments/t44...,Damn random internet person of whom I know not...,-0.3851,3
3,comment,hyxgnyu,2r97t,datasets,False,1646152782,https://old.reddit.com/r/datasets/comments/t44...,Ah nice one. Best of luck with the baby. If yo...,0.9136,3
4,comment,hyxfjw6,2r97t,datasets,False,1646152345,https://old.reddit.com/r/datasets/comments/t49...,I was about to write and say this shouldn't be...,0.0762,2


In [6]:
# Preview the posts table: print a label, then show its first five rows
print("\nPosts Dataset:")
posts_df.head(n=5)


Posts Dataset:


Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,t4f9bf,2r97t,datasets,False,1646160815,https://old.reddit.com/r/datasets/comments/t4f...,pravda.com.ua,,[removed],[ Removed by Reddit ],7
1,post,t4euxw,2r97t,datasets,False,1646159793,https://old.reddit.com/r/datasets/comments/t4e...,self.datasets,,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio...",1
2,post,t4e0bb,2r97t,datasets,False,1646157650,https://old.reddit.com/r/datasets/comments/t4e...,self.datasets,,[removed],Help scraping a store locator from a website?,1
3,post,t49fq0,2r97t,datasets,False,1646145753,https://old.reddit.com/r/datasets/comments/t49...,self.datasets,,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,1
4,post,t47wiw,2r97t,datasets,False,1646141275,https://old.reddit.com/r/datasets/comments/t47...,self.datasets,,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,2


In [7]:
# Report how many records each raw table holds
print("Number of rows in Comments Dataset:", len(comments_df))
print("Number of rows in Posts Dataset:", len(posts_df))

Number of rows in Comments Dataset: 54848
Number of rows in Posts Dataset: 20292


In [13]:
# Work on the posts table under the shorter name `df`.
# NOTE: this binds a second name to the same DataFrame object; it does not copy it.
df = posts_df

In [14]:
# Remove the metadata columns that the analysis does not use.
# The result is derived from posts_df (the frame `df` was bound to) rather than from
# `df` itself, so re-running this cell does not fail on already-dropped columns.
columns_to_remove = ['type', 'subreddit.id', 'subreddit.nsfw', 'permalink', 'domain', 'url', 'score', 'subreddit.name']
df = posts_df.drop(columns=columns_to_remove)
df.head()

Unnamed: 0,id,created_utc,selftext,title
0,t4f9bf,1646160815,[removed],[ Removed by Reddit ]
1,t4euxw,1646159793,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio..."
2,t4e0bb,1646157650,[removed],Help scraping a store locator from a website?
3,t49fq0,1646145753,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name
4,t47wiw,1646141275,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...


In [15]:
# Remove rows whose 'selftext' is the literal '[removed]' placeholder or an empty string.
# Missing (NaN) bodies are not matched by isin() and therefore stay in the frame;
# the sentiment step later falls back to the title for those rows.
# .copy() makes the filtered result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df = df[~df['selftext'].isin(['[removed]', ''])].copy()

# Print the number of remaining rows
print("Number of rows after removal:", df.shape[0])

Number of rows after removal: 18361


In [16]:
# Inspect the first rows after filtering (the original row labels are kept, hence the gaps)
df.head()

Unnamed: 0,id,created_utc,selftext,title
1,t4euxw,1646159793,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio..."
3,t49fq0,1646145753,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name
4,t47wiw,1646141275,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...
7,t45uk7,1646134122,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...
8,t45muj,1646133318,,"Personal data of 120,000 Russian servicemen fi..."


In [None]:
# Convert 'created_utc' from epoch seconds to datetime and derive the calendar year.
# assign() returns a new frame instead of writing into `df`, which at this point is
# a boolean-mask slice of the posts table; this avoids SettingWithCopyWarning.
# The lambda is evaluated after 'created_utc' has been replaced, so it sees datetimes.
df = df.assign(
    created_utc=pd.to_datetime(df['created_utc'], unit='s'),
    year=lambda frame: frame['created_utc'].dt.year,
)

In [18]:
# Check the converted timestamps and the new 'year' column
df.head()

Unnamed: 0,id,created_utc,selftext,title,year
1,t4euxw,2022-03-01 18:36:33,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio...",2022
3,t49fq0,2022-03-01 14:42:33,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,2022
4,t47wiw,2022-03-01 13:27:55,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,2022
7,t45uk7,2022-03-01 11:28:42,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...,2022
8,t45muj,2022-03-01 11:15:18,,"Personal data of 120,000 Russian servicemen fi...",2022


In [19]:
# Count posts per year (value_counts orders by descending frequency) and print the table
year_counts = df.loc[:, 'year'].value_counts()
print(year_counts)

year
2020    4171
2019    3049
2021    3021
2018    1787
2017    1730
2016    1365
2015    1138
2014     812
2022     409
2013     404
2012     181
2011     180
2010     114
Name: count, dtype: int64


In [20]:
# The timestamp is no longer needed once 'year' exists, so discard it
df = df.drop('created_utc', axis=1)
df.head()

Unnamed: 0,id,selftext,title,year
1,t4euxw,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio...",2022
3,t49fq0,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,2022
4,t47wiw,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,2022
7,t45uk7,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...,2022
8,t45muj,,"Personal data of 120,000 Russian servicemen fi...",2022


In [23]:
# Load the spaCy model.
# `nlp` is the shared pipeline used by extract_important_words() below.
nlp = spacy.load('en_core_web_sm')

In [24]:
def extract_important_words(text):
    """Return the alphabetic, non-stop-word tokens of ``text`` joined by ';'.

    Args:
        text: Text to tokenize with the module-level spaCy pipeline ``nlp``.
            Non-string values (e.g. NaN for a missing title) and blank
            strings are treated as containing no words.

    Returns:
        str: Token texts in document order separated by ';', or '' when
        there is nothing to extract.
    """
    # Missing titles come through as NaN (a float); passing that to nlp() would
    # raise, so treat anything that is not a non-blank string as "no words".
    if not isinstance(text, str) or not text.strip():
        return ''
    doc = nlp(text)
    # Keep purely alphabetic tokens that are not stop words.
    return ';'.join(token.text for token in doc if token.is_alpha and not token.is_stop)

# Derive 'important_words' by running every title through extract_important_words
df['important_words'] = df['title'].map(extract_important_words)
df.head()

Unnamed: 0,id,selftext,title,year,important_words
1,t4euxw,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Cool;Cloud;Data;Lake;conference;AWS;Dremio;Mic...
3,t49fq0,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,2022,letter;country;code;country
4,t47wiw,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,2022,Looking;datasets;contain;images;contracts;pape...
7,t45uk7,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...,2022,request;looking;dataset;use;analyse;uni;project
8,t45muj,,"Personal data of 120,000 Russian servicemen fi...",2022,Personal;data;Russian;servicemen;fighting;Ukra...


In [26]:
# Initialize the VADER sentiment analyzer.
# `analyzer` is the shared instance used by calculate_sentiment() below.
analyzer = SentimentIntensityAnalyzer()

In [30]:
def calculate_sentiment(row):
    """Score one post's sentiment on a 0-1 scale using VADER.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'selftext' and
            'title' entries.

    Returns:
        float: The VADER compound score rescaled with ``(compound + 1) / 2``.
        The post body is scored when present; otherwise the title is used.
        When both are missing, 0.5 is returned, which is what a compound
        score of 0 maps to.
    """
    text = row['selftext']
    if pd.isna(text):
        # No body text: fall back to the title.
        text = row['title']
    if pd.isna(text):
        # Nothing to score at all; passing NaN to the analyzer would raise.
        return 0.5
    sentiment_score = analyzer.polarity_scores(text)['compound']
    # Normalize the sentiment score to range from 0 to 1
    return (sentiment_score + 1) / 2

def classify_sentiment(score):
    """Map a 0-1 sentiment score to a label.

    Scores below 0.4 are 'Negative', scores from 0.4 to 0.6 inclusive are
    'Neutral', and everything else is 'Positive'.
    """
    if score < 0.4:
        return 'Negative'
    # Past the guard the score is not below 0.4, so only the upper bound is left to test.
    return 'Neutral' if score <= 0.6 else 'Positive'

# Score every post row-wise, then turn each score into a label
df['sentiment_score'] = df.apply(calculate_sentiment, axis='columns')
df['sentiment'] = df['sentiment_score'].map(classify_sentiment)
df.head()

Unnamed: 0,id,selftext,title,year,important_words,sentiment_score,sentiment
1,t4euxw,Wanted to share this conference on Cloud Data ...,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Cool;Cloud;Data;Lake;conference;AWS;Dremio;Mic...,0.6868,Positive
3,t49fq0,"Hi, I have a dataset with countries by 3 lette...",3 letter country code to full country name,2022,letter;country;code;country,0.8238,Positive
4,t47wiw,I want to classify if an image contains a cont...,Looking for datasets that contain images of co...,2022,Looking;datasets;contain;images;contracts;pape...,0.918,Positive
7,t45uk7,I’m looking for a dataset that I can use to id...,[request] looking for a dataset that i can use...,2022,request;looking;dataset;use;analyse;uni;project,0.77485,Positive
8,t45muj,,"Personal data of 120,000 Russian servicemen fi...",2022,Personal;data;Russian;servicemen;fighting;Ukra...,0.3194,Negative


In [31]:
# Snapshot the scored frame so later cells can start again from it.
# .copy() makes this an independent frame; a plain assignment would only alias `df`,
# and any in-place column assignment on `df` would then also change the snapshot.
df_temp = df.copy()

In [None]:
###

In [None]:
###

In [None]:
###

In [41]:
# Disabled: restore the scored frame saved in df_temp (kept for reference).
#df = df_temp

In [42]:
# Remove the 'sentiment', 'title', 'selftext' and 'id' columns, keeping only
# 'year', 'important_words' and 'sentiment_score' for the per-word aggregation.
# The source is the scored frame saved in df_temp rather than `df`, so this cell
# can be re-run after `df` has been reshaped by the cells below.
df = df_temp.drop(columns=['sentiment', 'title', 'selftext', 'id'])
df.head()

Unnamed: 0,year,important_words,sentiment_score
1,2022,Cool;Cloud;Data;Lake;conference;AWS;Dremio;Mic...,0.6868
3,2022,letter;country;code;country,0.8238
4,2022,Looking;datasets;contain;images;contracts;pape...,0.918
7,2022,request;looking;dataset;use;analyse;uni;project,0.77485
8,2022,Personal;data;Russian;servicemen;fighting;Ukra...,0.3194


In [43]:
# Spread the ';'-separated words over one column per position, labelled
# 'word1', 'word2', ..., 'wordn' (shorter lists are padded with missing values)
important_words_split = (
    df['important_words']
    .str.split(';', expand=True)
    .rename(columns=lambda position: f'word{position + 1}')
)

# Attach the word columns next to the existing ones and discard the joined string
df = pd.concat([df, important_words_split], axis=1).drop(columns=['important_words'])
df.head()

Unnamed: 0,year,sentiment_score,word1,word2,word3,word4,word5,word6,word7,word8,...,word21,word22,word23,word24,word25,word26,word27,word28,word29,word30
1,2022,0.6868,Cool,Cloud,Data,Lake,conference,AWS,Dremio,Microsoft,...,,,,,,,,,,
3,2022,0.8238,letter,country,code,country,,,,,...,,,,,,,,,,
4,2022,0.918,Looking,datasets,contain,images,contracts,paper,printed,text,...,,,,,,,,,,
7,2022,0.77485,request,looking,dataset,use,analyse,uni,project,,...,,,,,,,,,,
8,2022,0.3194,Personal,data,Russian,servicemen,fighting,Ukraine,public,,...,,,,,,,,,,


In [46]:
# Reshape wide -> long: every 'word*' column becomes a row in 'word', the other
# columns are repeated as identifiers, and padding cells (NaN) are discarded
melted_df = (
    df.melt(
        id_vars=df.columns[~df.columns.str.startswith('word')].tolist(),
        value_vars=df.columns[df.columns.str.startswith('word')].tolist(),
        var_name='word_num',
        value_name='word',
    )
    .dropna(subset=['word'])
)
melted_df.head()

Unnamed: 0,year,sentiment_score,word_num,word
0,2022,0.6868,word1,Cool
1,2022,0.8238,word1,letter
2,2022,0.918,word1,Looking
3,2022,0.77485,word1,request
4,2022,0.3194,word1,Personal


In [47]:
# Discard the positional label 'word_num'; the 'word' column itself is kept
melted_df = melted_df.drop('word_num', axis=1)
melted_df.head()

Unnamed: 0,year,sentiment_score,word
0,2022,0.6868,Cool
1,2022,0.8238,letter
2,2022,0.918,Looking
3,2022,0.77485,request
4,2022,0.3194,Personal


In [48]:
# For every (year, word) pair: count the occurrences and average the sentiment score
aggregated_df = (
    melted_df
    .groupby(['year', 'word'])
    .agg(
        frequency=pd.NamedAgg(column='word', aggfunc='size'),
        sentiment=pd.NamedAgg(column='sentiment_score', aggfunc='mean'),
    )
    .reset_index()
)
aggregated_df.head()

Unnamed: 0,year,word,frequency,sentiment
0,2010,,1,0.5
1,2010,AOL,2,0.5
2,2010,API,5,0.50792
3,2010,ARFF,1,0.5
4,2010,AWS,1,0.2798


In [49]:
# Keep only the rows whose 'word' is not the empty string
aggregated_df = aggregated_df.loc[aggregated_df['word'].ne('')]
aggregated_df.head()

Unnamed: 0,year,word,frequency,sentiment
1,2010,AOL,2,0.5
2,2010,API,5,0.50792
3,2010,ARFF,1,0.5
4,2010,AWS,1,0.2798
5,2010,Access,1,0.5


In [50]:
# Expose the word under the column name 'text'
aggregated_df = aggregated_df.rename({'word': 'text'}, axis='columns')
aggregated_df.head()

Unnamed: 0,year,text,frequency,sentiment
1,2010,AOL,2,0.5
2,2010,API,5,0.50792
3,2010,ARFF,1,0.5
4,2010,AWS,1,0.2798
5,2010,Access,1,0.5


In [51]:
# Write the per-year word table as tab-separated text, without the index column
aggregated_df.to_csv(path_or_buf='Reddit_sentiment.tsv', index=False, sep='\t')

In [None]:
###

In [None]:
###

In [None]:
###

In [32]:
# Remove the 'selftext' and 'sentiment_score' columns for the per-post export.
# The source is the scored frame saved in df_temp: by this point in the notebook
# `df` has already had 'selftext' and 'important_words' dropped by the aggregation
# cells above, so dropping from `df` raises a KeyError when run top to bottom.
df = df_temp.drop(columns=['selftext', 'sentiment_score'])
df.head()

Unnamed: 0,id,title,year,important_words,sentiment
1,t4euxw,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Cool;Cloud;Data;Lake;conference;AWS;Dremio;Mic...,Positive
3,t49fq0,3 letter country code to full country name,2022,letter;country;code;country,Positive
4,t47wiw,Looking for datasets that contain images of co...,2022,Looking;datasets;contain;images;contracts;pape...,Positive
7,t45uk7,[request] looking for a dataset that i can use...,2022,request;looking;dataset;use;analyse;uni;project,Positive
8,t45muj,"Personal data of 120,000 Russian servicemen fi...",2022,Personal;data;Russian;servicemen;fighting;Ukra...,Negative


In [33]:
# Break each ';'-joined word list into positional columns 'word1' ... 'wordn'
# (rows with fewer words are padded with missing values)
important_words_split = (
    df['important_words']
    .str.split(';', expand=True)
    .rename(columns=lambda position: f'word{position + 1}')
)

# Put the positional columns beside the post columns, then remove the joined string
df = pd.concat([df, important_words_split], axis=1).drop(columns=['important_words'])
df.head()

Unnamed: 0,id,title,year,sentiment,word1,word2,word3,word4,word5,word6,...,word21,word22,word23,word24,word25,word26,word27,word28,word29,word30
1,t4euxw,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Positive,Cool,Cloud,Data,Lake,conference,AWS,...,,,,,,,,,,
3,t49fq0,3 letter country code to full country name,2022,Positive,letter,country,code,country,,,...,,,,,,,,,,
4,t47wiw,Looking for datasets that contain images of co...,2022,Positive,Looking,datasets,contain,images,contracts,paper,...,,,,,,,,,,
7,t45uk7,[request] looking for a dataset that i can use...,2022,Positive,request,looking,dataset,use,analyse,uni,...,,,,,,,,,,
8,t45muj,"Personal data of 120,000 Russian servicemen fi...",2022,Negative,Personal,data,Russian,servicemen,fighting,Ukraine,...,,,,,,,,,,


In [37]:
# Melt the word columns into one row per (post, word); the non-word columns are
# repeated as identifiers
melted_df = df.melt(id_vars=[col for col in df.columns if not col.startswith('word')],
                    value_vars=[col for col in df.columns if col.startswith('word')],
                    var_name='word_num', value_name='word')

# Drop rows with NaN values in the 'word' column (padding from the split)
melted_df = melted_df.dropna(subset=['word'])
# Also drop empty-string words: a title with no important words yields '' rather
# than NaN, which would otherwise be exported as a blank entry
melted_df = melted_df[melted_df['word'] != '']
melted_df.head()

Unnamed: 0,id,title,year,sentiment,word_num,word
0,t4euxw,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Positive,word1,Cool
1,t49fq0,3 letter country code to full country name,2022,Positive,word1,letter
2,t47wiw,Looking for datasets that contain images of co...,2022,Positive,word1,Looking
3,t45uk7,[request] looking for a dataset that i can use...,2022,Positive,word1,request
4,t45muj,"Personal data of 120,000 Russian servicemen fi...",2022,Negative,word1,Personal


In [38]:
# The positional label is not part of the export, so discard 'word_num'
melted_df = melted_df.drop('word_num', axis=1)
melted_df.head()

Unnamed: 0,id,title,year,sentiment,word
0,t4euxw,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Positive,Cool
1,t49fq0,3 letter country code to full country name,2022,Positive,letter
2,t47wiw,Looking for datasets that contain images of co...,2022,Positive,Looking
3,t45uk7,[request] looking for a dataset that i can use...,2022,Positive,request
4,t45muj,"Personal data of 120,000 Russian servicemen fi...",2022,Negative,Personal


In [39]:
# Relabel the columns for the export.
# NOTE(review): 'sentiment' -> 'Conference' and 'word' -> 'Author Names' look like
# the headers expected by whatever consumes the file; confirm against that consumer.
melted_df = melted_df.rename(
    {
        'title': 'Title',
        'sentiment': 'Conference',
        'year': 'Year',
        'word': 'Author Names',
    },
    axis='columns',
)
melted_df.head()

Unnamed: 0,id,Title,Year,Conference,Author Names
0,t4euxw,"Cool Cloud Data Lake conference by AWS, Dremio...",2022,Positive,Cool
1,t49fq0,3 letter country code to full country name,2022,Positive,letter
2,t47wiw,Looking for datasets that contain images of co...,2022,Positive,Looking
3,t45uk7,[request] looking for a dataset that i can use...,2022,Positive,request
4,t45muj,"Personal data of 120,000 Russian servicemen fi...",2022,Negative,Personal


In [40]:
# Write the per-post word table as tab-separated text, without the index column
melted_df.to_csv(path_or_buf='Reddit_Sentiments.tsv', index=False, sep='\t')