# Start Up: 
Before Running the cell below, you must ensure that these have been run in Terminal **IN ORDER** : 
- conda update -n base -c defaults conda 

    - cd SageMaker
    
      - cd yelp-dataset-challenge-1-ds
      
         - conda env create -f environment.yml
          
            - source activate ydc1 
                
                - pip install python-decouple
                  
                  - pip install pprintpp
                  
# Spacy Installs: 

   - python -m spacy download en_core_web_lg

        - python -m spacy link en_core_web_lg en

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib
nlp = spacy.load('en')

# Load in Bucket
bucket = s3.Bucket('yelpchallenge1')
# Look inside the bucket.
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'datasets/dtm_final.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [None]:
                    ### ***** DO NOT RUN. ******* #### 
                  ### ***** ALREADY INSTALLED. ****** ###

# Only have to run this once.
# Installs the .csv 'Locally' on SageMaker Instance

#bucket.get('datasets/df.csv', 'df.csv')

    # Load in Bucket
# bucket = s3.Bucket('yelpchallenge1')
    # Look inside the bucket.
# bucket.contents

In [None]:
                            # ******* ALREADY Completed. DO NOT RUN. ****
    
# Further Cleaning of df.csv 

    # Dropping Column
# df = df.drop(columns=['Unnamed: 0'])

    # Dropping all Missing / Na Values from Entire Dataframe
# df = df.dropna()


# Getting Started

In [2]:
# Read-in df.csv
df = pd.read_csv('df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Saving Further Cleaned df.csv 
df.to_csv(index=False)
df.to_csv(r'df.csv')

In [4]:
# Checking Null Values and Shape
print(df.isna().sum())
print(df.shape)
print(df.dtypes)

stars          0
text           0
date           0
total_votes    0
tokens         0
dtype: int64
(6685874, 5)
stars           object
text            object
date            object
total_votes    float64
tokens          object
dtype: object


In [5]:
# Read-in dtm.csv (Original)
#dtm = pd.read_csv('dtm.csv')
#dtm = dtm.drop(columns=['Unnamed: 0'])

# Read-in dtm2.csv (Old)
#dtm2 = pd.read_csv('dtm2.csv')
#dtm2 = dtm2.drop(columns=['Unnamed: 0'])
#dtm2.head()

# Read-in dtm_final.csv (FINAL)
dtm_final = pd.read_csv('dtm_final.csv')

In [5]:
# import Vectorizer models
#vect = joblib.load('vect_1.sav')
vect2 = joblib.load('vect_2.sav')

# Data Merging
**Do NOT Run Any Cells in this Markdown**

**COMPLETE**

In [None]:
            # ***** New DTM DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #

# Taking Stars Column
#stars = df['stars']

# Adding stars column to dtm
#dtm['stars']=df['stars']

# Shifting 'Stars' Column to front of Df,
#cols = list(dtm.columns)
#cols = [cols[-1]] + cols[:-1]
#dtm = dtm[cols]

# Dropping "-PRON-", 'year -PRON-', and ' ' Columns
#dtm = dtm.drop(columns=[' ', '  -PRON-', 'year -PRON-'])


In [None]:
             # ***** New DTM2 DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #
# Taking Stars Column
#stars = df['stars']

# Adding stars column to dtm
#dtm2['stars']=df['stars']

# Shifting 'Stars' Column to front of Df,
#cols = list(dtm2.columns)
#cols = [cols[-1]] + cols[:-1]
#dtm2 = dtm2[cols]

#dtm2 = dtm2.drop(columns=['stars'])
# Dropping columns: 
#dtm2 = dtm2.drop(columns=[' ' , '  '])
#dtm2 = dtm2.drop(columns=['  -PRON-','  i',  '  the',  '  this', '$', "'s"])
# Saving dtm2.csv 
#dtm2.to_csv(index=False)
#dtm2.to_csv(r'dtm2.csv')

In [None]:
# Saving dtm2.csv 
#dtm2.to_csv(index=False)
#dtm2.to_csv(r'dtm2.csv')

In [35]:
#Cut 135,000 Rows of df['stars'] Column to fix Memory Error. 
# Label as "stars"
#stars = df.stars[0:135000]
#stars.shape
# Adding stars to dtm2
#dtm2['stars']=df['stars'][0:135000]

In [None]:
#dtm2.head(100000)

In [39]:
#dtm_final = dtm2

In [None]:
#dtm_final.head(10000)

In [41]:
# Saving dtm_final.csv 
#dtm_final.to_csv(index=False)
#dtm_final.to_csv(r'dtm_final.csv')

# Model Prep: 

In [42]:
# Imports: 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Creating Training Data. 
    # X_train will include All 135,000 Rows, for 773 Vectorized Words(Columns)
    # y_train or the Target Variable  will include all 135,000 Rows, for the stars Column.     
X_train = dtm_final.iloc[:, 0:773]
y_train = dtm_final.iloc[:, 773:774]

In [45]:
# Create Pipeline Components

vect = TfidfVectorizer(stop_words='english')
rfr = RandomForestRegressor()


# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect), 
                 # Classifier
                 ('clf', rfc)
                ])

#Tuning
parameters = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,10000),
    'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=5, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


ValueError: Unknown label type: 'continuous'