# Start Up: 
Before running the cell below, make sure the following commands have been run in a terminal, **IN ORDER**:

1. `conda update -n base -c defaults conda`
2. `cd SageMaker`
3. `cd yelp-dataset-challenge-1-ds`
4. `conda env create -f environment.yml`
5. `source activate ydc1`
6. `pip install python-decouple`
7. `pip install pprintpp`
                  
# Spacy Installs: 

   - python -m spacy download en_core_web_lg

        - python -m spacy link en_core_web_lg en

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib
# Load the spaCy pipeline registered under the 'en' shortcut link
# (the setup notes link en_core_web_lg to 'en').
nlp = spacy.load('en')

# Handle to the project's S3 bucket.
bucket = s3.Bucket('yelpchallenge1')
# Display the object keys currently stored in the bucket.
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [None]:
### DO NOT RUN #### 
### ALREADY INSTALLED ###

# Only have to run this once.
# Installs the .csv 'Locally' on SageMaker Instance

#bucket.get('datasets/df.csv', 'df.csv')

    # Load in Bucket
# bucket = s3.Bucket('yelpchallenge1')
    # Look inside the bucket.
# bucket.contents

# Getting Started

In [5]:
# Load the reviews table, discard the 'Unnamed: 0' column
# (presumably a saved index -- confirm), and keep only rows that have
# no missing value in any column.
df = (
    pd.read_csv('df.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

In [6]:
# Sanity check: per-column count of missing values, then (rows, columns).
print(df.isna().sum(), df.shape, sep='\n')

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)


In [41]:
# Original document-term matrix; 'Unnamed: 0' is dropped right after loading.
dtm = pd.read_csv('dtm.csv').drop(columns=['Unnamed: 0'])

# New document-term matrix, loaded the same way.
dtm2 = pd.read_csv('dtm2.csv').drop(columns=['Unnamed: 0'])

# Preview the first rows of the new matrix.
dtm2.head()

Unnamed: 0,stars,Unnamed: 2,Unnamed: 3,-PRON-,i,the,this,$,'s,-PRON-.1,...,work,worth,would,would be,would have,would not,wrong,year,yelp,yet
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397408,0.0,0.069829,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.0,0.400392,0.0,0.158778,0.0,0.134316,0.0,0.0,0.082813,0.343331,...,0.0,0.052035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.218785,0.0,0.0,0.165937,0.085626,0.0,0.081237,0.0,0.314034,...,0.084567,0.0,0.0,0.0,0.0,0.0,0.0,0.093163,0.0,0.0
3,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080641,...,0.0,0.0,0.087372,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018564,0.515313,...,0.0,0.0,0.029004,0.0,0.0,0.02601,0.027762,0.021839,0.0,0.028812


In [37]:
# Load the two saved vectorizer models from disk.
# NOTE(review): joblib files are pickle-based -- only load artifacts that
# come from a trusted source.
vect, vect2 = (joblib.load(path) for path in ('vect_1.sav', 'vect_2.sav'))

# Data Merging

In [44]:
            # ***** New DTM DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #

# Taking Stars Column
#stars = df['stars']

# Adding stars column to dtm
#dtm['stars']=df['stars']

# Shifting 'Stars' Column to front of Df,
#cols = list(dtm.columns)
#cols = [cols[-1]] + cols[:-1]
#dtm = dtm[cols]

# Dropping "-PRON-", 'year -PRON-', and ' ' Columns
#dtm = dtm.drop(columns=[' ', '  -PRON-', 'year -PRON-'])


In [43]:
             # ***** New DTM2 DF HAS BEEN CREATED. DO NOT RE-RUN THE COMMENTED-OUT STEPS BELOW; ONLY THE FINAL DROP IS LIVE. **** #
# Taking Stars Column
#stars = df['stars']

# Adding stars column to dtm
#dtm2['stars']=df['stars']

# Shifting 'Stars' Column to front of Df,
#cols = list(dtm2.columns)
#cols = [cols[-1]] + cols[:-1]
#dtm2 = dtm2[cols]

# Remove the 'stars' column from dtm2.
# errors='ignore' keeps this cell re-runnable: dtm2 is later saved back to
# dtm2.csv without 'stars', so on the next load a plain drop would raise
# KeyError because the column is already gone.
dtm2 = dtm2.drop(columns=['stars'], errors='ignore')

In [57]:
# Show every column label of dtm2 as a plain Python list.
list(dtm2.columns)

['  -PRON-',
 '  i',
 '  the',
 '  this',
 '$',
 "'s",
 '-PRON-',
 '-PRON-  ',
 '-PRON- -PRON-',
 '-PRON- a',
 '-PRON- all',
 '-PRON- also',
 '-PRON- and',
 '-PRON- be',
 '-PRON- but',
 '-PRON- can',
 '-PRON- come',
 '-PRON- could',
 '-PRON- do',
 '-PRON- favorite',
 '-PRON- feel',
 '-PRON- first',
 '-PRON- food',
 '-PRON- for',
 '-PRON- friend',
 '-PRON- get',
 '-PRON- go',
 '-PRON- have',
 '-PRON- husband',
 '-PRON- i',
 '-PRON- in',
 '-PRON- just',
 '-PRON- know',
 '-PRON- look',
 '-PRON- make',
 '-PRON- need',
 '-PRON- order',
 '-PRON- out',
 '-PRON- own',
 '-PRON- say',
 '-PRON- should',
 '-PRON- take',
 '-PRON- that',
 '-PRON- the',
 '-PRON- to',
 '-PRON- want',
 '-PRON- will',
 '-PRON- with',
 '-PRON- would',
 '1',
 '10',
 '15',
 '2',
 '20',
 '3',
 '30',
 '4',
 '5',
 '6',
 'a',
 'a bit',
 'a couple',
 'a few',
 'a good',
 'a great',
 'a little',
 'a lot',
 'a nice',
 'a very',
 'able',
 'able to',
 'about',
 'about -PRON-',
 'about the',
 'absolutely',
 'actually',
 'add',
 'aft

In [55]:
# Drop the columns whose names consist only of one or two spaces.
# errors='ignore' makes the cell idempotent: after the first run (or once
# the cleaned frame has been saved and reloaded) these columns no longer
# exist, and a plain drop would raise KeyError.
dtm2 = dtm2.drop(columns=[' ', '  '], errors='ignore')

In [56]:
# Preview the first five rows of dtm2.
dtm2.head(5)

Unnamed: 0,-PRON-,i,the,this,$,'s,-PRON-.1,-PRON-.2,-PRON- -PRON-,-PRON- a,...,work,worth,would,would be,would have,would not,wrong,year,yelp,yet
0,0.0,0.0,0.0,0.0,0.397408,0.0,0.069829,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.158778,0.0,0.134316,0.0,0.0,0.082813,0.343331,0.0,0.0,0.0,...,0.0,0.052035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.165937,0.085626,0.0,0.081237,0.0,0.314034,0.0,0.0,0.0,...,0.084567,0.0,0.0,0.0,0.0,0.0,0.0,0.093163,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.080641,0.0,0.0,0.0,...,0.0,0.0,0.087372,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.018564,0.515313,0.0,0.020392,0.0,...,0.0,0.0,0.029004,0.0,0.0,0.02601,0.027762,0.021839,0.0,0.028812


In [59]:
# Drop the whitespace-prefixed token columns plus the '$' and "'s" columns.
# errors='ignore' makes the cell idempotent: re-running it, or running it on
# a dtm2.csv that was already saved without these columns, would otherwise
# raise KeyError.
noise_columns = ['  -PRON-', '  i', '  the', '  this', '$', "'s"]
dtm2 = dtm2.drop(columns=noise_columns, errors='ignore')

In [60]:
# Preview the first five rows of dtm2.
dtm2.head(5)

Unnamed: 0,-PRON-,-PRON-.1,-PRON- -PRON-,-PRON- a,-PRON- all,-PRON- also,-PRON- and,-PRON- be,-PRON- but,-PRON- can,...,work,worth,would,would be,would have,would not,wrong,year,yelp,yet
0,0.069829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.343331,0.0,0.0,0.0,0.0,0.0,0.047998,0.122973,0.0,0.0,...,0.0,0.052035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.314034,0.0,0.0,0.0,0.0,0.0,0.0,0.039198,0.0,0.086683,...,0.084567,0.0,0.0,0.0,0.0,0.0,0.0,0.093163,0.0,0.0
3,0.080641,0.0,0.0,0.0,0.0,0.0,0.0,0.110721,0.0,0.0,...,0.0,0.0,0.087372,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.515313,0.0,0.020392,0.0,0.0,0.0,0.043038,0.027566,0.028836,0.02032,...,0.0,0.0,0.029004,0.0,0.0,0.02601,0.027762,0.021839,0.0,0.028812


In [61]:
# Saving dtm2.csv
# A single write is enough: calling to_csv() without a path only builds the
# whole CSV as an in-memory string and returns it, so it saves nothing.
# The index is still written (pandas default) because the loading cell
# drops the resulting 'Unnamed: 0' column right after read_csv.
dtm2.to_csv('dtm2.csv')

# Concatenating