# 02B EDA - Reference

### Purpose of Notebook
- Convert raw JSON data into dataframe
- Remove duplicate posts
- Convert target variable from string to integer
- Light feature engineering (Get length of title and self text)
- Train Test Split data
- Export X and y data for use later in workflow

## Imports & Functions

In [2]:
import pandas as pd
import json
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import regex as re
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from string import punctuation

In [3]:
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
def posts_to_df(posts, features = ['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments']):
    """Build a DataFrame with one row per post and one column per requested field.

    Parameters
    ----------
    posts : iterable of dict
        Each item must expose its fields under a ``'data'`` key.
    features : list of str
        Field names to copy out of ``post['data']``. A name listed more than
        once yields a single column.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a post has no ``'data'`` key or lacks one of the requested fields.
    """
    records = []
    for post in posts:
        record = {}
        for name in features:
            record[name] = post['data'][name]
        records.append(record)
    return pd.DataFrame(records)

## Pull in raw data file

In [5]:
# Load the scraped posts from disk. The path is relative to the notebook's
# working directory; the last recorded run of this cell failed with
# FileNotFoundError, so check that ./Data/raw.json exists before running.
# The encoding is pinned so decoding does not depend on the platform default.
with open('./Data/raw.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: './Data/raw.json'

In [None]:
# Fields copied out of each post's 'data' payload ('score' was previously
# listed twice; each field only needs to appear once).
feature_list = ['subreddit', 'author', 'title', 'selftext', 'created_utc',
                'num_comments', 'score', 'over_18']
df = posts_to_df(raw, features=feature_list)

In [None]:
# Drop rows that are identical across every column, keeping the first occurrence.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
# Character counts of the title and of the post body.
df['title_len'] = df['title'].str.len()
df['text_len'] = df['selftext'].str.len()

In [None]:
# Peek at the last five rows to sanity-check the new length columns.
df.tail(5)

In [None]:
# Per-subreddit mean of each numeric/boolean feature.
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple of names is deprecated and raises on pandas 2.x.
eda_columns = ['score', 'num_comments', 'over_18', 'title_len', 'text_len']
eda = df.groupby('subreddit')[eda_columns].mean()
eda

In [None]:
# Expose the subreddit name (currently the index) as a regular column too.
eda = eda.assign(sub=eda.index)

In [None]:
# Grouped bar chart of the per-subreddit means.
eda.plot.bar()

In [None]:
# One bar plot per metric, laid out on a 2x3 grid in row-major order.
# NOTE(review): 'title_only' is not created anywhere in the cells shown above,
# so its panel is rendered as a placeholder instead of raising; define the
# column upstream (or drop it from this list) once its intent is confirmed.
panel_metrics = ['text_len', 'over_18', 'score',
                 'num_comments', 'title_len', 'title_only']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(14, 6))
for ax, metric in zip(axes.flat, panel_metrics):
    if metric not in df.columns:
        ax.text(0.5, 0.5, "column '{}' not found".format(metric),
                ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')
        continue
    sns.barplot(x='subreddit', y=metric, data=df, ax=ax)

# tight_layout must actually be called; the bare attribute reference was a no-op.
fig.tight_layout()

## Extract Text Data

In [None]:
# One string per post: title and body joined by a single space.
text = (df['title'] + ' ' + df['selftext']).tolist()

## Set up additional stopwords

In [None]:
# scikit-learn's English stop words plus every ASCII punctuation character.
_stopwords = set(ENGLISH_STOP_WORDS).union(punctuation)

## Look at Word Frequency for each subreddit

## Split data into each subreddit

In [None]:
# Rows from the Jokes subreddit. The label must match the value used when
# building jokes_text below ('Jokes'); 'joke' matched nothing.
df[df.subreddit == 'Jokes']

In [None]:
# Build the "title + body" strings separately for each subreddit.
is_confession = df['subreddit'] == 'confessions'
is_joke = df['subreddit'] == 'Jokes'

confessions_text = (df.loc[is_confession, 'title'] + ' ' + df.loc[is_confession, 'selftext']).tolist()
jokes_text = (df.loc[is_joke, 'title'] + ' ' + df.loc[is_joke, 'selftext']).tolist()

In [None]:
# Term counts for the confessions posts.
# - stop_words is passed as a list: current scikit-learn rejects a set.
# - get_feature_names_out() replaces the removed get_feature_names().
# - DataFrame.sparse.from_spmatrix replaces the removed pd.SparseDataFrame.
cvec_confessions = CountVectorizer(stop_words=sorted(_stopwords), min_df=2, max_df=0.5)
confessions_counts = cvec_confessions.fit_transform(confessions_text)
confessions_df_vec = pd.DataFrame.sparse.from_spmatrix(
    confessions_counts,
    columns=cvec_confessions.get_feature_names_out(),
)
# Total occurrences of each term, least frequent first.
pd.DataFrame(confessions_df_vec.sum()).sort_values(0)

In [None]:
# Term counts for the Jokes posts.
# - stop_words is passed as a list: current scikit-learn rejects a set.
# - get_feature_names_out() replaces the removed get_feature_names().
# - DataFrame.sparse.from_spmatrix replaces the removed pd.SparseDataFrame.
cvec_jokes = CountVectorizer(stop_words=sorted(_stopwords), min_df=2, max_df=0.5)
jokes_counts = cvec_jokes.fit_transform(jokes_text)
jokes_df_vec = pd.DataFrame.sparse.from_spmatrix(
    jokes_counts,
    columns=cvec_jokes.get_feature_names_out(),
)
# Total occurrences of each term, least frequent first.
pd.DataFrame(jokes_df_vec.sum()).sort_values(0)

In [None]:
# Term counts across all posts from both subreddits.
# - stop_words is passed as a list: current scikit-learn rejects a set.
# - get_feature_names_out() replaces the removed get_feature_names().
# - DataFrame.sparse.from_spmatrix replaces the removed pd.SparseDataFrame.
cvec = CountVectorizer(stop_words=sorted(_stopwords), min_df=2, max_df=0.5)
text_counts = cvec.fit_transform(text)
text_vec = pd.DataFrame.sparse.from_spmatrix(
    text_counts,
    columns=cvec.get_feature_names_out(),
)

In [None]:
# Total occurrences of each term across all posts, least frequent first.
term_totals = pd.DataFrame(text_vec.sum())
term_totals.sort_values(by=0)