# 04 Prepare Data For Modeling

### Purpose of Notebook
- Split features from text data
- Scale feature data
- Tokenize text data
- Combine feature and text data
- Export X data for use later in workflow

## Imports

In [14]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from string import punctuation

## Pull in X_train and X_test data

In [15]:
# Load the train/test feature frames pickled earlier in the workflow.
# NOTE: pickle.load executes arbitrary code from the file; only load project-generated files.
with open('../Data/X_train.pkl', 'rb') as train_file, \
        open('../Data/X_test.pkl', 'rb') as test_file:
    X_train = pickle.load(train_file)
    X_test = pickle.load(test_file)

## Create X DataFrame without Text Data 

In [16]:
# Everything except the two free-text columns is treated as a numeric feature.
_text_cols = ['selftext', 'title']

X_train_features, X_test_features = (
    split.drop(columns=_text_cols).copy() for split in (X_train, X_test)
)

## Fit Standard Scaler and Transform Feature Data

In [17]:
# Fit on the training split only so no test-set statistics leak into the scaler.
# (fit() returns the estimator itself, so the chained form binds the same object.)
ss = StandardScaler().fit(X_train_features)

In [18]:
# Apply the train-fitted scaler to both splits, then re-attach the column names.
# No index is passed to pd.DataFrame, so both frames get a fresh 0..n-1 index
# rather than the index carried by X_train_features / X_test_features.
train_scaled = ss.transform(X_train_features)
test_scaled = ss.transform(X_test_features)

X_train_features_sc = pd.DataFrame(train_scaled, columns=X_train_features.columns)
X_test_features_sc = pd.DataFrame(test_scaled, columns=X_test_features.columns)

## Save Standard Scaler for later use

In [19]:
# Persist the fitted scaler so later notebooks can apply the identical scaling.
with open('../Objects/ss.pkl', 'wb') as scaler_file:
    pickle.dump(ss, scaler_file)

## Extract Text Data

In [20]:
# Build one document per post: "<title> <selftext>".
# Missing text is replaced with '' first; otherwise a NaN in either column
# propagates through `+` and puts a float nan into the document list, which the
# vectorizer cannot tokenize. On data with no missing text the result is unchanged.
X_train_text = (X_train['title'].fillna('') + ' ' + X_train['selftext'].fillna('')).tolist()
X_test_text = (X_test['title'].fillna('') + ' ' + X_test['selftext'].fillna('')).tolist()

## Setup additional stopwords

In [21]:
# English stop words plus every ASCII punctuation character, de-duplicated.
# Kept as a *list*: scikit-learn documents `stop_words` as 'english', a list, or
# None, and releases with parameter validation reject a set. Sorting makes the
# value (and therefore the pickled vectorizer) deterministic across runs.
_stopwords = sorted(set(ENGLISH_STOP_WORDS) | set(punctuation))

## Fit Vectorizer and Transform Text Data

In [22]:
# TF-IDF over unigrams and bigrams, learned from the training documents only.
#   min_df=2   -> drop terms that occur in fewer than 2 documents
#   max_df=0.5 -> drop terms that occur in more than half of the documents
tvec = TfidfVectorizer(
    stop_words=_stopwords,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
).fit(X_train_text)

In [23]:
def _tfidf_feature_names(vectorizer):
    """Return the fitted vectorizer's terms, in column order, as a list.

    scikit-learn 1.0 added ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; call whichever one this installation provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


def _to_sparse_frame(matrix, columns):
    """Wrap a scipy sparse matrix in a DataFrame without densifying it.

    ``pd.SparseDataFrame`` was deprecated in pandas 0.25 and removed in 1.0.
    Where the ``DataFrame.sparse`` accessor exists, build a regular DataFrame
    with sparse columns (fill value 0); otherwise fall back to the legacy class.
    """
    if hasattr(pd.DataFrame, 'sparse'):
        return pd.DataFrame.sparse.from_spmatrix(matrix, columns=columns)
    return pd.SparseDataFrame(matrix, columns=columns, default_fill_value=0)


# Look the column names up once and reuse them for both splits.
_tfidf_columns = _tfidf_feature_names(tvec)

X_train_text_sp = _to_sparse_frame(tvec.transform(X_train_text), _tfidf_columns)
X_test_text_sp = _to_sparse_frame(tvec.transform(X_test_text), _tfidf_columns)

## Save Vectorizer for later use

In [24]:
# Persist the fitted vectorizer so later notebooks reuse the same vocabulary and IDF weights.
with open('../Objects/tvec.pkl', 'wb') as vectorizer_file:
    pickle.dump(tvec, vectorizer_file)

## Combine Features with Text Data
### TODO: try leaving out the additional features to see whether it improves the model

In [25]:
# Append the scaled numeric features as extra columns on each TF-IDF frame.
# The frames are modified in place, and rows are matched on the index.
# NOTE(review): a feature column whose name equals a TF-IDF term would overwrite
# that term's column here -- confirm the two sets of names cannot collide.
feature_cols = X_train_features_sc.columns

for text_frame, scaled_frame in ((X_train_text_sp, X_train_features_sc),
                                 (X_test_text_sp, X_test_features_sc)):
    text_frame[feature_cols] = scaled_frame[feature_cols]

## Export Processed Feature Data

In [26]:
# Write the combined text + feature frames out for the modeling notebooks.
for out_path, frame in (('../Data/X_train_clean.pkl', X_train_text_sp),
                        ('../Data/X_test_clean.pkl', X_test_text_sp)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(frame, out_file)