# Perform EDA on data captured in DB

In [5]:
import pandas as pd

In [6]:
# Load the exported comments; the first CSV column is used as the row index.
comments_df = pd.read_csv(
    "data/comments.csv",
    index_col=0,
)

In [7]:
# (rows, columns) of the comments frame as loaded from the CSV.
comments_df.shape

(4528, 8)

In [8]:
# Summary statistics for every column, numeric and non-numeric alike.
comments_df.describe(include="all")

Unnamed: 0,insert_update_ts,id,link_id,author,body,score,permalink,retrieved
count,4528,4528,4528,4528.0,4528,4528.0,4528,4528
unique,416,4528,241,3873.0,4403,,4528,416
top,2020-05-17 15:23:06,fqv8v36,t3_gle82k,,b'[deleted]',,/r/worldnews/comments/gle82k/coal_industry_wil...,2020-05-17 19:30:21
freq,20,1,768,90.0,47,,1,20
mean,,,,,,35.912102,,
std,,,,,,474.892367,,
min,,,,,,-159.0,,
25%,,,,,,1.0,,
50%,,,,,,1.0,,
75%,,,,,,4.0,,


## There are no NaN or NULL values; deleted content is instead marked with body = "b'[deleted]'"

In [9]:
# Number of missing (NaN) entries in each column.
comments_df.isna().sum()

insert_update_ts    0
id                  0
link_id             0
author              0
body                0
score               0
permalink           0
retrieved           0
dtype: int64

In [10]:
# Row labels of the comments whose body is the deleted-content placeholder.
is_deleted_body = comments_df["body"].eq("b'[deleted]'")
deleted_body_index = comments_df.loc[is_deleted_body].index

In [11]:
# Row labels of the comments that have no usable author.
# The CSV stores a missing author as the text "None". Older pandas keeps that
# as a string, while newer releases parse the token as NaN when reading the
# file, so both representations are matched here.
author_col = comments_df["author"]
is_missing_author = author_col.isna() | author_col.eq("None")
no_author_index = comments_df.loc[is_missing_author].index

> ### Also, accounts that have been deleted or (possibly) suspended in some way show up with "author" set to "None"

In [12]:
# Count how many deleted-body rows also appear among the author-less rows.
deleted_without_author = deleted_body_index.isin(no_author_index)
deleted_without_author.sum()

47

> ### In fact, the current records with deleted body content are all a subset of the records with no author. To cover both possible scenarios, we'll filter out any record whose body content has been deleted or whose author has been set to "None".

In [13]:
# Keep only the comments that still have both an author and body text.
# A missing author may arrive either as the text "None" or, on newer pandas
# releases that parse that token as a missing value, as NaN; drop both.
author_missing = comments_df["author"].isna() | comments_df["author"].eq("None")
body_deleted = comments_df["body"].eq("b'[deleted]'")
# Take an explicit copy: later cells add feature columns to this frame, and
# assigning into a boolean-indexed slice raises SettingWithCopyWarning.
comments_df = comments_df.loc[~(author_missing | body_deleted)].copy()

In [14]:
# (rows, columns) remaining once deleted / author-less comments are removed.
comments_df.shape

(4438, 8)

## Add linguistic features

In [15]:
import textstat

In [16]:
# Character count of each comment body, computed with the vectorized string
# accessor instead of a row-by-row DataFrame.apply.
# NOTE(review): the sampled rows show `body` holding the text of a bytes
# literal (b'...'), so this length includes that wrapper and any escape
# sequences -- confirm whether the body should be decoded before measuring.
comments_df["body_len"] = comments_df["body"].str.len()

In [17]:
# Readability / complexity features derived from each comment body via textstat.
# Each entry maps the new column name to the metric that produces it; mapping
# over the "body" column calls the metric once per value without building a
# Series for every row the way DataFrame.apply(axis=1) does.
# NOTE(review): the sampled rows show `body` holding the text of a bytes
# literal (b'...'), so the metrics also see that wrapper and any escape
# sequences -- confirm whether the body should be decoded first.
text_metrics = {
    "sent_count": textstat.sentence_count,
    "flesch_read": textstat.flesch_reading_ease,
    "difficult_words": textstat.difficult_words,
    "read_index": textstat.automated_readability_index,
    "syllable_count": textstat.syllable_count,
    # float_output=True returns the estimated grade level as a number.
    "text_standard": lambda text: textstat.text_standard(text, float_output=True),
}
for feature_name, metric in text_metrics.items():
    comments_df[feature_name] = comments_df["body"].map(metric)

In [18]:
# Spot-check a few rows with the new features. The fixed seed makes the same
# rows appear on every Restart & Run All.
comments_df.sample(3, random_state=42)

Unnamed: 0,insert_update_ts,id,link_id,author,body,score,permalink,retrieved,body_len,sent_count,flesch_read,difficult_words,read_index,syllable_count,text_standard
2909,2020-05-17 15:54:43,fqx1zoy,t3_glfmzd,Right-Comfort,"b""But how can we blame this on fossil fuels to...",-45,/r/worldnews/comments/glfmzd/the_magnetic_nort...,2020-05-17 19:54:43,118,1,75.54,6,11.1,28,9.0
2529,2020-05-17 15:45:11,fqwemup,t3_gl4zny,big-boy-matt,b'thank god we can continue murdering babies',-17,/r/news/comments/gl4zny/group_buys_alabama_abo...,2020-05-17 19:45:11,45,1,55.91,3,8.3,12,11.0
1246,2020-05-17 15:30:12,fqug98j,t3_gku0f4,nice2yz,b'Lower down a platform on the side.',1,/r/news/comments/gku0f4/jeff_bezos_ceo_of_amaz...,2020-05-17 19:30:12,37,1,89.75,1,2.9,9,3.0
