# Reddit Analysis using Pushshift API

## Part 2: Data Cleaning and Preliminary Analysis
- [Data Cleaning](#Data-Cleaning)
- [Preliminary Analysis](#Preliminary-Analysis)

In [93]:
# Import libaries
import pandas as pd
import numpy as np
import re
import string

In [94]:
# Import csv files
dfN = pd.read_csv('../data/narcissists.csv', index_col = 0, )
dfR = pd.read_csv('../data/retail.csv', index_col = 0)

# Data Cleaning

In [95]:
# Reduce data into relevant data
df_cols = ['id', 'title', 'selftext', 'subreddit', 'num_comments']

# Drop posts with removed or null text
dfN = dfN[df_cols]
dfN = dfN[dfN['selftext'] != '[removed]']
dfN.dropna(inplace=True)

dfR = dfR[df_cols]
dfR = dfR[dfR['selftext'] != '[removed]']
dfR.dropna(inplace=True);

In [96]:
# Check for nulls and dtypes
dfN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 927 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            927 non-null    object
 1   title         927 non-null    object
 2   selftext      927 non-null    object
 3   subreddit     927 non-null    object
 4   num_comments  927 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 43.5+ KB


In [97]:
# Check for nulls and dtypes
dfR.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 987 entries, 1 to 4495
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            987 non-null    object
 1   title         987 non-null    object
 2   selftext      987 non-null    object
 3   subreddit     987 non-null    object
 4   num_comments  987 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 46.3+ KB


In [98]:
# Get combined 
df = pd.concat([dfR, dfN]).reset_index(drop = True)

# Binarize the subreddit column to function as the target
df['subreddit'] = df['subreddit'].map(lambda x: 1 if x == 'TalesFromRetail' else 0)

df['subreddit'].value_counts(normalize = True) # 51% is baseline

1    0.515674
0    0.484326
Name: subreddit, dtype: float64

In [99]:
# Get column with combined full text
df['text'] = df['title'] + [' ' for i in range(len(df['title']))] + df['selftext']
df = df.drop('title', axis = 1)
df = df.drop('selftext', axis = 1)

# Get additional columns
df['text_len'] = [len(text) for text in df['text']]
df['word_count'] = [len(text.split(' ')) for text in df['text']]

In [100]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [101]:
# Clean text punctuation
df['text'] = [text.replace('\n', '') for text in df.text]
df['text'] = [text.replace("'", '') for text in df.text]
for char in string.punctuation:
    df['text'] = [text.replace(char, ' ') for text in df.text]
    
# Remove all numbers
df['text'] = [re.sub('[0-9]+', '', text).strip() for text in df.text]

# Drop any duplicates
df = df.drop_duplicates('text')

In [102]:
# Reorganize subreddit dataframes for easier access
dfR = df[df['subreddit'] == 1].reset_index(drop = True)
dfN = df[df['subreddit'] != 1].reset_index(drop = True)

# Preliminary Analysis

In [103]:
dfN.describe()

Unnamed: 0,subreddit,num_comments,text_len,word_count
count,923.0,923.0,923.0,923.0
mean,0.0,10.570964,1676.583965,318.728061
std,0.0,34.191151,1818.098432,346.976258
min,0.0,0.0,11.0,2.0
25%,0.0,2.0,606.5,117.5
50%,0.0,4.0,1190.0,225.0
75%,0.0,8.0,2067.0,389.0
max,0.0,764.0,18690.0,3687.0


In [104]:
dfR.describe()

Unnamed: 0,subreddit,num_comments,text_len,word_count
count,959.0,959.0,959.0,959.0
mean,1.0,44.284672,1942.089677,369.955162
std,0.0,62.2454,1124.518814,214.612694
min,1.0,0.0,20.0,2.0
25%,1.0,7.0,1139.5,216.5
50%,1.0,22.0,1650.0,313.0
75%,1.0,54.0,2467.0,472.0
max,1.0,602.0,10149.0,2006.0


In [107]:
df.shape, dfN.shape, dfR.shape

((1882, 6), (923, 6), (959, 6))

In [105]:
# Example text from r/raisedbynarcissists
text = dfN['text'][4]
text.strip()

'In a couple hours Im going to talk to someone outside of my family about my Ndad for the first time  In pretty nervous and kind of scared  My dad is very nice to everyone except for my family  Everyone likes him and people always tell me how lucky I am to have such a great dad  This is one of the many reasons why Ive never told anyone  but Im going to do it now One of the things that Im scared of is that they wont believe me  My dad always tell me that no one cares about me and they just pretend to tolerate me to be nice  Plus  wow  a teenager complaining about one of their parents  What a surprise  In my head Im pretty sure theyll believe me because theyre a good person  but my whole life of getting threatened so I dont tell anyone and  well  all the other stuff hes done has me pretty scared Im also scared that I wont be able to explain things properly  Ive only talked about this to my siblings before and they know already so theres no need to explain the abuse  Im always really bad 

In [106]:
# Example text from r/TalesFromRetail
dfR['text'][0]

'From The Lights to The Darkness There was a man named John who was the Minister of Defense and he had six children and his wife was pregnant with the seventh and the time of her birth came and on the day she gave birth to the seventh her baby died while she was giving birth to him  so she broke down and she was depressed and severely sad for many months One day  she saw people adopting a child in the orphanage  so she ran to his wife and called him Elizabeth  John  I want to adopt a child from the orphanage  and I want you to come right away John  Really   Isnt the death of our baby enough for you and dont forget that we have  children and we can take care of them Elizabeth  But I want to make an orphan happy and fulfill the promise of his mother and father  and I want them to rest these poor orphans John  Its okay  Ill come to you He reached the orphanage and saw the child who wanted him  and his skin color was tan  so he was disgusted with him  John said   You only searched for this

In [108]:
# Export cleaned data
df.to_csv('../data/cleaned.csv')
dfR.to_csv('../data/clean_retail.csv')
dfN.to_csv('../data/clean_narc.csv')