# Start Up: 
Before Running the cell below, you must ensure that these have been run in Terminal **IN ORDER** : 
- conda update -n base -c defaults conda 

    - cd SageMaker
    
      - cd yelp-dataset-challenge-1-ds
      
         - conda env create -f environment.yml
          
            - source activate ydc1 
                
                - pip install python-decouple
                  
                  - pip install pprintpp
                  
# Spacy Installs: 

   - python -m spacy download en_core_web_lg

        - python -m spacy link en_core_web_lg en
         


In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib
nlp = spacy.load('en_core_web_lg')

# Load in Bucket
bucket = s3.Bucket('yelpchallenge1')
# Look inside the bucket.
bucket.contents

['datasets/',
 'datasets/df.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [2]:
### DO NOT RUN #### 
### ALREADY INSTALLED ###

# Only have to run this once.
# Installs the .csv 'Locally' on SageMaker Instance

#bucket.get('datasets/df.csv', 'df.csv')

# Getting Started

In [2]:
# Read-in df.csv
df = pd.read_csv('df.csv')
# Dropping Column
df = df.drop(columns=['Unnamed: 0'])
# Dropping all Missing / Na Values from Entire Dataframe
df = df.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Checking Null Values and Shape
print(df.isna().sum())
print(df.shape)

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)


# Tokenizer

In [52]:
# tokenizer
def tokenize(document):
    doc = nlp(document)
    return [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]

# Vectorizer

In [53]:
#####TEST######
# create mini-dataframe for testing
# want to make sure works locally on small dataset before scaling to entire dataset/AWS
# create variable to feed into TFIDF Vectorizer fit_transform
# to be updated to 'text' column of main dataframe (df['text']) for vectorization in AWS

#mini_df = df.head(10)
#data = mini_df['text']

In [56]:
# create variable to feed into TFIDF Vectorizer fit_transform
# to be updated to 'text' column of main dataframe (df['text']) for vectorization in AWS
data = df['text']

In [58]:
# vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(tokenizer=tokenize, min_df=0.025, max_df=.95, ngram_range=(1,2), stop_words= 'english')

# Create a vocabulary and get word counts per document
    # Learn vocab and transform data into form we want
 # Passing 100,000 Rows of df['text'] through Vectorizer.   
vect = tfidf.fit_transform(data[0:100000])

In [55]:
# View Feature Matrix as DataFrame
# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(vect.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,Unnamed: 1,-PRON-,ask,awesome,big,clearly,come,dental,dr,end,...,wrong make,year,year -PRON-,year add,year admit,year ago,year come,year food,zucchini,zucchini appetizer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.364854,0.128401,0.0,0.048546,0.0,0.0,0.048546,0.0,0.0,0.048546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.187372,0.0,0.0,0.0,0.0,0.0,0.0,0.174518,0.087259,0.0,...,0.0,0.057698,0.087259,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032801,0.021689,0.0,0.0,0.032801,0.0,0.0,0.0,0.0,0.0


In [60]:
# Pickling the Model
from sklearn.externals import joblib
joblib.dump(vect, 'vect_1.sav')

['vect_1.sav']

In [61]:
# import model
vect = joblib.load('vect_1.sav')

In [None]:
dtm.to_csv(index=False)

In [68]:
dtm.to_csv(r'dtm.csv')