# Start Up: 
Before running the cell below, make sure the following commands have been run in a terminal, **in order**:

1. `conda update -n base -c defaults conda`
2. `cd SageMaker`
3. `cd yelp-dataset-challenge-1-ds`
4. `conda env create -f environment.yml`
5. `source activate ydc1`
6. `pip install python-decouple`
7. `pip install pprintpp`
                  
# spaCy Installs:

Run these in the same terminal, in order:

1. `python -m spacy download en_core_web_lg`
2. `python -m spacy link en_core_web_lg en`

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib
# Load the spaCy pipeline registered under the 'en' name.
nlp = spacy.load('en')

# Open the 'yelpchallenge1' bucket through the s3 module.
bucket = s3.Bucket('yelpchallenge1')
# Display the bucket's contents as the cell output.
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [None]:
### DO NOT RUN ###
### ALREADY INSTALLED ###

# This only has to be run once.
# It copies the .csv from the bucket to local storage on the SageMaker instance.

#bucket.get('datasets/df.csv', 'df.csv')

# Getting Started

In [2]:
# Build the working frame in one pass: read df.csv, discard the
# 'Unnamed: 0' column, then remove every row that contains a missing value.
df = (
    pd.read_csv('df.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Report the per-column count of missing values, followed by the frame's
# shape, each on its own line.
print(df.isna().sum(), df.shape, sep='\n')

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)


In [6]:
# Read dtm.csv and discard its 'Unnamed: 0' column in a single expression,
# then preview the first rows as the cell output.
dtm = pd.read_csv('dtm.csv').drop(columns=['Unnamed: 0'])
dtm.head()

Unnamed: 0,Unnamed: 1,-PRON-,ask,awesome,big,clearly,come,dental,dr,end,...,year,year -PRON-,year add,year admit,year ago,year come,year food,zucchini,zucchini appetizer,stars
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.364854,0.128401,0.0,0.048546,0.0,0.0,0.048546,0.0,0.0,0.048546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.187372,0.0,0.0,0.0,0.0,0.0,0.0,0.174518,0.087259,0.0,...,0.057698,0.087259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021689,0.0,0.0,0.032801,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Load the saved vectorizer model from vect_1.sav.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load artifacts that come from a trusted source.
vect = joblib.load('vect_1.sav')

# Data Merging

In [8]:
# Preview the first rows of df (displayed as the cell output).
df.head()

Unnamed: 0,stars,text,date,total_votes,tokens
0,1,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36,7.0,"['total', 'horrible', 'service', 'crooks', 'ac..."
1,5,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33,0.0,"['adore', 'travis', 'hard', 'rock', 'kelly', '..."
2,5,I have to say that this office really has it t...,2016-11-09 20:09:03,3.0,"['office', 'organized', 'friendly', 'phillipp'..."
3,5,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38,0.0,"['went', 'lunch', 'steak', 'sandwich', 'delici..."
4,1,Today was my second out of three sessions I ha...,2018-01-30 23:07:38,7.0,"['today', 'second', 'sessions', 'paid', 'sessi..."


In [58]:
# ***** The new DTM DataFrame has already been created. DO NOT RUN THIS CELL. ***** #

# Taking Stars Column
# stars = df['stars']

# Adding stars column to dtm
# dtm['stars']=df['stars']
# dtm = dtm.drop(columns=['newcol'])

In [9]:
# Preview the first rows of dtm (displayed as the cell output).
dtm.head()

Unnamed: 0,Unnamed: 1,-PRON-,ask,awesome,big,clearly,come,dental,dr,end,...,year,year -PRON-,year add,year admit,year ago,year come,year food,zucchini,zucchini appetizer,stars
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.364854,0.128401,0.0,0.048546,0.0,0.0,0.048546,0.0,0.0,0.048546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.187372,0.0,0.0,0.0,0.0,0.0,0.0,0.174518,0.087259,0.0,...,0.057698,0.087259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021689,0.0,0.0,0.032801,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
# Display the 'stars' column of dtm.
# NOTE(review): the recorded output reports dtype object for this column;
# presumably it holds mixed or string values -- confirm before modelling.
dtm['stars']

0    1
1    5
2    5
3    5
4    1
5    4
6    3
7    1
8    2
9    3
Name: stars, dtype: object

In [61]:
# Save the current dtm frame to dtm.csv.
# The row index is written on purpose: the cell that reads dtm.csv back
# drops the resulting 'Unnamed: 0' column, so that column must be present.
# The earlier bare `dtm.to_csv(index=False)` call was removed: without a path
# it only builds the entire CSV as an in-memory string and throws it away,
# which costs time and memory and writes nothing.
dtm.to_csv(r'dtm.csv')

# Training Prep: 