# Start Up: 
Before running the cells below, make sure the following commands have been run in a terminal, **in order**:

1. `conda update -n base -c defaults conda`
2. `cd SageMaker`
3. `cd yelp-dataset-challenge-1-ds`
4. `conda env create -f environment.yml`
5. `source activate ydc1`
6. `pip install python-decouple`
7. `pip install pprintpp`
                  
# Spacy Installs: 

1. `python -m spacy download en_core_web_lg`
2. `python -m spacy link en_core_web_lg en`
         


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib
# Load the spaCy pipeline that tokenize() reads as the global `nlp`.
# 'en' is the shortcut link created during setup
# (python -m spacy link en_core_web_lg en). It has to be defined here so the
# vectorizer cell works on a fresh kernel (Restart & Run All) instead of
# failing with NameError inside tokenize().
nlp = spacy.load('en')

# Open the project's S3 bucket.
bucket = s3.Bucket('yelpchallenge1')
# Show the keys stored in the bucket (displayed as the cell output).
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'datasets/dtm_final.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [2]:
### DO NOT RUN #### 
### ALREADY INSTALLED ###

# This only has to be run once.
# Downloads the .csv from the bucket to local storage on the SageMaker instance.

#bucket.get('datasets/df.csv', 'df.csv')

# Getting Started

In [3]:
# Load df.csv, discard the 'Unnamed: 0' column (presumably a saved index —
# TODO confirm), then remove every row that contains a missing value.
df = (
    pd.read_csv('df.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Sanity check after cleaning: per-column count of missing values,
# followed by the frame's (rows, columns) shape on its own line.
print(df.isna().sum(), df.shape, sep='\n')

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)


# Tokenizer

In [5]:
# tokenizer
def tokenize(document):
    """Lemmatize a document, dropping stop words and punctuation.

    Parameters
    ----------
    document
        Text to process; it is passed unchanged to the global spaCy
        pipeline ``nlp``.

    Returns
    -------
    list
        ``token.lemma_`` for each token that is neither a stop word nor
        punctuation, in the order the pipeline yields them.

    Notes
    -----
    Relies on a module-level ``nlp`` callable being defined before the
    first call; a NameError is raised otherwise.
    """
    return [
        token.lemma_
        for token in nlp(document)
        # Keep a token only when both flags are false.
        if not (token.is_stop or token.is_punct)
    ]

# Vectorizer

In [53]:
##### TEST #####
# Create a mini-dataframe for testing: make sure everything works locally on a
# small dataset before scaling to the entire dataset on AWS.
# `data` is the variable fed into the TF-IDF vectorizer's fit_transform;
# for vectorization in AWS it is set to the main dataframe's 'text' column (df['text']).

#mini_df = df.head(10)
#data = mini_df['text']

In [6]:
# Series holding the 'text' column of the main dataframe; this is the input
# handed to the TF-IDF vectorizer's fit_transform.
data = df.loc[:, 'text']

In [8]:
# TF-IDF vectorizer driven by the custom tokenize() function. It builds
# unigrams and bigrams and keeps terms whose document frequency lies
# between 2.5% and 95% of the documents.
tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    min_df=0.025,
    max_df=.95,
)

# Learn the vocabulary and build the TF-IDF matrix from the first
# 135,000 rows of `data` (positional slice).
vect2 = tfidf.fit_transform(data.iloc[:135000])


In [17]:
# Convert the TF-IDF matrix to a dense DataFrame, labelling each column
# with the corresponding feature name from the fitted vectorizer.
dtm2 = pd.DataFrame(
    data=vect2.todense(),
    columns=tfidf.get_feature_names(),
)

# Display the first 1,000 rows of the document-term matrix.
dtm2.head(n=1000)

Unnamed: 0,Unnamed: 1,Unnamed: 2,-PRON-,i,the,this,$,'s,-PRON-.1,-PRON-.2,...,work,worth,would,would be,would have,would not,wrong,year,yelp,yet
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.397408,0.000000,0.069829,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.400392,0.000000,0.158778,0.000000,0.134316,0.000000,0.000000,0.082813,0.343331,0.000000,...,0.000000,0.052035,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.218785,0.000000,0.000000,0.165937,0.085626,0.000000,0.081237,0.000000,0.314034,0.000000,...,0.084567,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.093163,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.080641,0.000000,...,0.000000,0.000000,0.087372,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018564,0.515313,0.000000,...,0.000000,0.000000,0.029004,0.000000,0.000000,0.026010,0.027762,0.021839,0.000000,0.028812
5,0.052566,0.000000,0.036480,0.000000,0.000000,0.000000,0.000000,0.000000,0.342962,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.208309,0.000000,0.175691,0.000000,...,0.043369,0.000000,0.031726,0.000000,0.000000,0.000000,0.000000,0.143333,0.000000,0.000000
7,0.148427,0.000000,0.103004,0.000000,0.000000,0.000000,0.110225,0.000000,0.154942,0.000000,...,0.000000,0.135027,0.000000,0.000000,0.000000,0.000000,0.000000,0.126406,0.000000,0.000000
8,0.029545,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.262156,0.000000,...,0.000000,0.000000,0.033416,0.057698,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.283827,0.000000,0.000000,0.215268,0.000000,0.000000,0.000000,0.000000,0.148143,0.273708,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [10]:
# Pickling Vect_1
#from sklearn.externals import joblib
#joblib.dump(vect, 'vect_1.sav')

In [None]:
# Saving dtm.csv (Original)
#dtm.to_csv(index=False)
#dtm.to_csv(r'dtm.csv')

In [12]:
# Serialize vect2 (the TF-IDF matrix built from 135,000 rows) to 'vect_2.sav'.
# NOTE(review): this stores the transformed matrix, not the fitted `tfidf`
# vectorizer — confirm that is what consumers of the file need.
from sklearn.externals import joblib
joblib.dump(value=vect2, filename='vect_2.sav')

['vect_2.sav']

In [18]:
# Write the document-term matrix to dtm2.csv.
# A path-less `to_csv(index=False)` call only returns the CSV as an in-memory
# string and has no effect on the file, so it is not used here: for a frame
# this large it costs time and memory and the result is thrown away.
# NOTE(review): the file is written WITH the index column, exactly as before.
# If index=False was the intent, pass it to this call — confirm first that no
# reader of dtm2.csv relies on the extra column.
dtm2.to_csv(r'dtm2.csv')

# DTM_Final

In [2]:
# Read dtm_final.csv from the working directory into a DataFrame.
dtm_final = pd.read_csv(filepath_or_buffer='dtm_final.csv')