# 03 Prepare X Feature Data 

### Purpose of Notebook
- Split features from text data
- Scale feature data
- Tokenize text data
- Combine feature and text data
- Export X data for use later in workflow

## Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from string import punctuation

## Pull in X_train and X_test data

In [2]:
with open('../Data/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)

with open('../Data/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)

## Create X DataFrame without Text Data 

In [3]:
X_train_features = X_train.drop(labels=['selftext','title','author','created_utc'], axis=1).copy()
X_test_features = X_test.drop(labels=['selftext','title','author','created_utc'], axis=1).copy()

## Fit Standard Scalar and Transform Feature Data

In [4]:
ss = StandardScaler()
ss.fit(X_train_features);

In [5]:
X_train_features_sc = pd.DataFrame(ss.transform(X_train_features), columns=X_train_features.columns)
X_test_features_sc = pd.DataFrame(ss.transform(X_test_features), columns=X_test_features.columns)

## Save Standard Scalar for later use

In [6]:
with open('../Objects/ss.pkl', 'wb') as f:
    pickle.dump(ss, f)

## Extract Text Data

In [7]:
X_train_text = list(X_train.title + ' ' + X_train.selftext)
X_test_text = list(X_test.title + ' ' + X_test.selftext)

## Setup additional stopwords

In [8]:
_stopwords = set(list(ENGLISH_STOP_WORDS)+list(punctuation))

## Fit Vectorizer and Transform Text Data

In [9]:
tvec = TfidfVectorizer(stop_words=_stopwords, min_df=2, max_df=0.5, ngram_range=(1,2))
tvec.fit(X_train_text);

In [10]:
X_train_text_sp = pd.SparseDataFrame(tvec.transform(X_train_text), 
                                      columns = tvec.get_feature_names(), default_fill_value=0)

X_test_text_sp = pd.SparseDataFrame(tvec.transform(X_test_text), 
                                      columns = tvec.get_feature_names(), default_fill_value=0)

## Try count vectorizser

In [None]:
cvec = CountVectorizer(stop_words=_stopwords)


## Save Vectorizer for later use

In [280]:
with open('../Objects/tvec.pkl','wb') as f:
    pickle.dump(tvec, f)

## Combine Features with Text Data
### Try to leave out additional features to see if it will improve the model

In [281]:
feature_cols = X_train_features_sc.columns

X_train_text_sp[feature_cols] = X_train_features_sc[feature_cols]
X_test_text_sp[feature_cols] = X_test_features_sc[feature_cols]

## Export Processed X Data

In [282]:
with open('../Data/X_train_clean.pkl', 'wb') as f:
    pickle.dump(X_train_text_sp, f)
    
with open('../Data/X_test_clean.pkl', 'wb') as f:
    pickle.dump(X_test_text_sp, f)

In [283]:
#X_train_text_sp.head()

----------------  Try some new stuff -------------------

In [284]:
X_train_text_sp.iloc[:,10:30].head()

Unnamed: 0,able ride,able share,able swimming,aboard,aboard boat,abruptly,absent,absolute,absolute shit,absolutely,absolutely love,abuse,abused,abusive,accept,acceptable,accepted,access,accessible,accident
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [285]:
(indices)

array([9345, 4059, 4040, ..., 4394, 3935, 8498])

In [286]:
indices = np.argsort(tvec.idf_)[::-1]

In [287]:
len(indices)

8703

In [288]:
features = tvec.get_feature_names()
top_n = 20
top_features = [features[i] for i in indices[:top_n]]
print(top_features)

['zoo', 'katie', 'just tiny', 'just tired', 'just told', 'just trying', 'just wait', 'just went', 'just wish', 'just work', 'karen', 'keeping secret', 'just stupid', 'kept distance', 'kept eating', 'kept enduring', 'kept going', 'kept secret', 'key representative', 'kicking']


In [289]:
list(zip(features, indices))

[('abandoned', 8702),
 ('ability', 3783),
 ('ability able', 3765),
 ('ability noticed', 3766),
 ('able', 3767),
 ('able achieve', 3769),
 ('able house', 3771),
 ('able income', 3776),
 ('able learn', 3777),
 ('able make', 3779),
 ('able ride', 3781),
 ('able share', 3785),
 ('able swimming', 3758),
 ('aboard', 3787),
 ('aboard boat', 3788),
 ('abruptly', 3789),
 ('absent', 3790),
 ('absolute', 3791),
 ('absolute shit', 3793),
 ('absolutely', 3796),
 ('absolutely love', 3798),
 ('abuse', 3800),
 ('abused', 3759),
 ('abusive', 3757),
 ('accept', 3802),
 ('acceptable', 3740),
 ('accepted', 3723),
 ('access', 3726),
 ('accessible', 3727),
 ('accident', 3730),
 ('accident today', 3733),
 ('accident wa', 3736),
 ('accidentally', 3737),
 ('accommodate', 3738),
 ('accommodate yuri', 3739),
 ('accompanied', 3741),
 ('accompanied picture', 3755),
 ('according', 3742),
 ('account', 3743),
 ('account post', 3744),
 ('account started', 3746),
 ('accounting', 3747),
 ('accusation', 3750),
 ('accused

In [290]:
pd.Series(indices, index=features)

abandoned          8702
ability            3783
ability able       3765
ability noticed    3766
able               3767
able achieve       3769
able house         3771
able income        3776
able learn         3777
able make          3779
able ride          3781
able share         3785
able swimming      3758
aboard             3787
aboard boat        3788
abruptly           3789
absent             3790
absolute           3791
absolute shit      3793
absolutely         3796
absolutely love    3798
abuse              3800
abused             3759
abusive            3757
accept             3802
acceptable         3740
accepted           3723
access             3726
accessible         3727
accident           3730
                   ... 
yuri landed        7179
yuri life          8471
yuri little        2800
yuri lived         4435
yuri looked        1606
yuri making        6174
yuri managed       8275
yuri month         2765
yuri natural       1580
yuri noticed       2831
yuri overcame   

In [291]:
tvec.vocabulary_

{'mother': 4840,
 'cooking': 1219,
 'dinner': 1660,
 'son': 6711,
 'come': 1082,
 'running': 6147,
 'say': 6236,
 'mom': 4762,
 'grandma': 2889,
 'drunk': 1889,
 'living': 4327,
 'room': 6107,
 'ha': 2993,
 'heaven': 3189,
 'talking': 7132,
 'walking': 8131,
 'lying': 4490,
 'completely': 1130,
 'naked': 4891,
 'look': 4377,
 'point': 5476,
 'grandmother': 2890,
 'vagina': 7779,
 'oh': 5073,
 'tasted': 7151,
 'like': 4136,
 'come running': 1090,
 'running say': 6148,
 'mom grandma': 4764,
 'living room': 4328,
 'room ha': 6108,
 'talking say': 7138,
 'completely naked': 1133,
 'grandmother vagina': 2891,
 'son say': 6713,
 'say oh': 6263,
 'tasted like': 7152,
 'just': 3674,
 'waste': 8247,
 'space': 6744,
 'want': 8140,
 'watch': 8252,
 'movie': 4871,
 'spider': 6781,
 'man': 4554,
 'hit': 3253,
 'easy': 1927,
 'dark': 1374,
 'web': 8312,
 'want watch': 8199,
 'watch movie': 8253,
 'easy just': 1929,
 'dark web': 1376,
 'couple': 1261,
 'playing': 5466,
 'poker': 5486,
 'evening': 207