# Start Up: 
Before running the cell below, make sure the following commands have been run in a terminal, **in order**:

1. `conda update -n base -c defaults conda`
2. `cd SageMaker`
3. `cd yelp-dataset-challenge-1-ds`
4. `conda env create -f environment.yml`
5. `source activate ydc1`
6. `pip install python-decouple`
7. `pip install pprintpp`
                  
# spaCy Installs:

1. `python -m spacy download en_core_web_lg`
2. `python -m spacy link en_core_web_lg en`

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the vendored copy only
# on older environments where standalone joblib is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Load the spaCy model registered under the 'en' shortcut link.
nlp = spacy.load('en')

# Load in Bucket
bucket = s3.Bucket('yelpchallenge1')
# Look inside the bucket (last expression, so the listing is displayed).
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [None]:
### DO NOT RUN #### 
### ALREADY INSTALLED ###

# Only have to run this once.
# Installs the .csv 'Locally' on SageMaker Instance

#bucket.get('datasets/df.csv', 'df.csv')

    # Load in Bucket
# bucket = s3.Bucket('yelpchallenge1')
    # Look inside the bucket.
# bucket.contents

# Getting Started

In [5]:
# Load the review data, discard the leftover 'Unnamed: 0' column, and keep
# only the rows that have no missing values.
df = (
    pd.read_csv('df.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

In [6]:
# Sanity check: per-column null counts first, then the (rows, columns) shape.
print(df.isnull().sum(), df.shape, sep='\n')

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)


In [36]:
# Original document-term matrix, minus the leftover 'Unnamed: 0' column.
dtm = pd.read_csv('dtm.csv').drop(columns=['Unnamed: 0'])

# New document-term matrix, cleaned the same way.
dtm2 = pd.read_csv('dtm2.csv').drop(columns=['Unnamed: 0'])

# Preview the first rows of the new matrix.
dtm2.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,-PRON-,i,the,this,$,'s,-PRON-.1,-PRON-.2,...,work,worth,would,would be,would have,would not,wrong,year,yelp,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.397408,0.0,0.069829,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.400392,0.0,0.158778,0.0,0.134316,0.0,0.0,0.082813,0.343331,0.0,...,0.0,0.052035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.218785,0.0,0.0,0.165937,0.085626,0.0,0.081237,0.0,0.314034,0.0,...,0.084567,0.0,0.0,0.0,0.0,0.0,0.0,0.093163,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080641,0.0,...,0.0,0.0,0.087372,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018564,0.515313,0.0,...,0.0,0.0,0.029004,0.0,0.0,0.02601,0.027762,0.021839,0.0,0.028812


In [37]:
# Restore both saved vectorizer models, in file order.
# NOTE(review): joblib files are pickles; only load files from a trusted source.
vect, vect2 = (
    joblib.load(model_path) for model_path in ('vect_1.sav', 'vect_2.sav')
)

# Data Merging

In [44]:
            # ***** New DTM DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #

# Taking Stars Column
#stars = df['stars']

# Adding stars column to dtm
#dtm['stars']=df['stars']

# Shifting 'Stars' Column to front of Df,
#cols = list(dtm.columns)
#cols = [cols[-1]] + cols[:-1]
#dtm = dtm[cols]

# Dropping "-PRON-", 'year -PRON-', and ' ' Columns
#dtm = dtm.drop(columns=[' ', '  -PRON-', 'year -PRON-'])


In [38]:
             # ***** New DTM2 DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #
# Attach the star ratings to dtm2 and make 'stars' the first column.
# The column order is built by name, so re-running this cell is harmless.

# Taking Stars Column
stars = df['stars']

# Adding stars column to dtm2.
# NOTE(review): this assignment aligns on index labels, and `df` keeps its
# original labels after dropna() — confirm dtm2's rows share those labels.
dtm2['stars'] = stars

# Shifting 'stars' to the front of the frame by name. A positional
# "move the last column to the front" moves a feature column instead whenever
# 'stars' already exists (e.g. dtm2.csv was saved with it), because assigning
# to an existing column does not move it to the end.
cols = ['stars'] + [col for col in dtm2.columns if col != 'stars']
dtm2 = dtm2[cols]


In [40]:
# Saving dtm2.csv
# A path-less to_csv() call only renders the whole frame to an in-memory CSV
# string and returns it; that result was never used, so the call is omitted.
# The file is written with its index, which is the 'Unnamed: 0' column that
# the read-in cell drops after loading.
dtm2.to_csv(r'dtm2.csv')

# Training Prep: 

In [17]:
# Show the size of the target column, then preview its first values.
# Only a cell's last expression is displayed, so the shape has to be printed.
print(df.total_votes.shape)
df.total_votes.head()


0    7.0
1    0.0
2    3.0
3    0.0
4    7.0
Name: total_votes, dtype: float64

# Classification Test: df['text'] & df['total_votes']
The goal is to predict a review's total votes from the review text that the user wrote.
Create the X and y training variables from the original df.

- Taking First 10,000 Rows of df['text'] for X_train.
        
- Taking First 10,000 Rows of df['total_votes'] for y_train.
 
Take one arbitrarily chosen review from the df to use as a test example.

- df['text'][45889]


In [20]:
# Creating Training DFs: the first 10,000 review texts and their vote totals,
# selected by position.
text_train = df['text'].iloc[:10000]
vote_train = df['total_votes'].iloc[:10000]

In [21]:
# Training: features are the review texts, targets are the vote totals.
X_train, y_train = text_train, vote_train

# Testing: one fixed review, looked up by its index label.
random_test = df.loc[45889, 'text']

In [22]:
# Report the training shapes and echo the held-out review, one item per line.
print(
    X_train.shape,
    y_train.shape,
    random_test,
    sep='\n',
)

(10000,)
(10000,)
5 stars all the way for Jenaro and Dr. Cool.  I recently posted a review on Air Pro Max LLC who pretty much sucked so bad.   Jenaro called me and I was at work.  I told him and explained to him what the tech from Air Pro Max did.  The Air Pro Max LLC technician installed the blower incorrectly and was unable to diagnose my problem that took him 7 hours.Jenaro fixed my issue in less than 40 minutes!!!Dr. Cool AC and Jenaro, you rock!!!


In [23]:
# Imports
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Create Pipeline Components

# TF-IDF features over the raw review text, with English stop words removed.
# NOTE: the name `vect` is also assigned by the cell that loads 'vect_1.sav';
# this statement rebinds it to a fresh, unfitted vectorizer.
vect = TfidfVectorizer(stop_words='english')
# Seed the forest so repeated runs of this cell build the same trees and the
# grid-search results can be reproduced.
rfc = RandomForestClassifier(random_state=42)


# Define the Pipeline
pipe = Pipeline([
                 # Vectorizer
                 ('vect', vect),
                 # Classifier
                 ('clf', rfc)
                ])

# Tuning: each key is '<step name>__<parameter>', and each tuple lists the
# candidate values to try (2 x 2 x 2 x 2 x 2 = 32 combinations).
parameters = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500, 1000),
    'clf__n_estimators': (5, 10),
    'clf__max_depth': (15, 20)
}

# 5-fold cross-validated search over every combination, using all CPU cores.
grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (500, 1000), 'clf__n_estimators': (5, 10), 'clf__max_depth': (15, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

# Classification Test: df['text'] & df['stars']