# 02 Initial Data Processing

### Purpose of Notebook
- Convert raw JSON data into dataframe
- Remove duplicate posts
- Convert target variable from string to integer
- Light feature engineering (Get length of title and self text)
- Split data into train and test sets
- Export X and y data for use later in workflow

## Imports & Functions

In [45]:
import pandas as pd
import json
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import regex as re
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup      

In [46]:
def posts_to_df(posts, features = ['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments']):
    """Build a DataFrame with one row per post and one column per requested feature.

    Parameters
    ----------
    posts : iterable of dict
        Each item must expose the feature values under its ``'data'`` key.
    features : list of str
        Keys to pull out of every ``post['data']``; they become the columns.

    Returns
    -------
    pandas.DataFrame
        One row per post. A feature listed more than once yields a single column.

    Raises
    ------
    KeyError
        If a post lacks ``'data'`` or one of the requested features.
    """
    rows = []
    for post in posts:
        row = {}
        for name in features:
            row[name] = post['data'][name]
        rows.append(row)
    return pd.DataFrame(rows)

## Pull in raw data file

In [47]:
# Load the scraped posts. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's locale codec (open() falls back to it when no
# encoding is given).
with open('../Data/raw.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)

In [48]:
# Fields to pull out of every post. 'score' used to be listed twice; the
# duplicate was redundant (posts_to_df keys its rows by feature name, so it
# collapsed to a single column anyway) and has been removed.
feature_list = ['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments',
                'score', 'over_18']
df = posts_to_df(raw, features=feature_list)

## Remove duplicates from data

In [49]:
# Keep the first occurrence of each fully identical row. The surviving rows keep
# their original index labels (the index is not reset).
df = df.drop_duplicates()

## Remove numbers from text data

In [50]:
# Strip every run of digits from the free-text columns.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text untouched. The raw string avoids
# the invalid '\d' escape sequence in a normal string literal.
df['selftext'] = df['selftext'].str.replace(r'\d+', '', regex=True)
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)

## Text Cleaning Function

In [51]:
# The Porter stemmer is instantiated here but clean_text below does not use it;
# only the WordNet lemmatizer is applied.
p_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def clean_text(raw_text):
    """Return a lowercased, letters-only, lemmatized version of ``raw_text``.

    Steps, in order:
      1. Parse the input with BeautifulSoup (lxml parser) and keep only its text.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lowercase and split on whitespace.
      4. Lemmatize each token with the module-level ``lemmatizer``.
      5. Join the tokens back together with single spaces.

    NOTE(review): ``lemmatize`` is called without a part-of-speech tag, so tokens
    are treated as nouns; this appears to be why the sample output in this
    notebook shows "wa" / "ha" for "was" / "has". Confirm whether that is intended.
    """
    plain = BeautifulSoup(raw_text, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain)
    tokens = []
    for token in letters_only.lower().split():
        tokens.append(lemmatizer.lemmatize(token))
    return " ".join(tokens)
    

In [61]:
# Run both free-text columns through clean_text, overwriting them in the frame.
for text_col in ('title', 'selftext'):
    df[text_col] = df[text_col].map(clean_text)

In [53]:
# Spot-check the cleaned text of the row whose index label is 4.
df.loc[4, 'selftext']

'my brother death took a major toll on my father and it took him year to go back to my brother room i believe part of him ha decided to let go of the grief and is trying to renovate my late brother old bedroom i wa helping him move out thing and stumbled upon drawing and poem my brother used to write i looked at the date and it said sept which happens to be about month before the tragic event no one in my family know i found his journal i don t want to bring it up in the case i find his suicide note i took every bit of paper i found and am thinking of going through it today i am unsure if i have the power to look through them i don t know if i am strong enough my mind ha lingered with thought on why he would do that for year i have mixed emotion i want to know what wa going through his head but i do not want the heartbreak of knowing that i couldn t help him sooner almost six year later and i still think of him every single day i miss you so much brother update i want to thank everyone

## Convert target variable (subreddit) to integer

In [54]:
# Encode the target: 'confessions' -> 0, 'Jokes' -> 1 (keys are case-sensitive).
subreddit_map = {'confessions': 0, 'Jokes':1}
df['subreddit_int'] = df['subreddit'].map(subreddit_map)

# Series.map yields NaN for any label missing from the mapping, which would
# silently turn the target into a float column containing missing values.
# Fail loudly here instead of passing bad labels downstream.
unmapped = df.loc[df['subreddit_int'].isna(), 'subreddit'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped subreddit values: {list(unmapped)}")
df['subreddit_int'] = df['subreddit_int'].astype(int)

## Setup target variable y

In [55]:
# Target labels as a plain Python list, in the frame's row order.
y = df['subreddit_int'].tolist()

## Setup feature variables X

In [56]:
# Feature frame: everything except the raw and the encoded target columns.
# The copy keeps later edits to X from touching df.
X = df.drop(columns=['subreddit', 'subreddit_int']).copy()

## Light feature engineering 
- Author and creation time are not very useful for predicting the subreddit, so no features are engineered from them
- Get number of characters for title and text

In [57]:
# Character counts of the title and selftext columns as they currently stand in X.
X = X.assign(
    title_len=X['title'].str.len(),
    text_len=X['selftext'].str.len(),
)

## Train Test Split

In [58]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Export y_train and y_test objects

In [59]:
# Pickle the train and test label lists next to the raw data.
for target_path, labels in (('../Data/y_train.pkl', y_train),
                            ('../Data/y_test.pkl', y_test)):
    with open(target_path, 'wb') as f:
        pickle.dump(labels, f)

## Export X_train and X_test objects

In [60]:
# Pickle the train and test feature frames next to the raw data.
for target_path, frame in (('../Data/X_train.pkl', X_train),
                           ('../Data/X_test.pkl', X_test)):
    with open(target_path, 'wb') as f:
        pickle.dump(frame, f)