# Start Up: 
Before running the cell below, make sure the following commands have been run in a Terminal, **IN ORDER**:
- conda update -n base -c defaults conda 

    - cd SageMaker
    
      - cd yelp-dataset-challenge-1-ds
      
         - conda env create -f environment.yml
          
            - source activate ydc1 
                
                - pip install python-decouple
                  
                  - pip install pprintpp

# Imports: 

In [2]:
import json
import os

import pandas as pd
import s3
from pprintpp import pprint as pp
from sklearn.externals import joblib

# Open a handle on the project bucket through the `s3` module imported above.
BUCKET_NAME = 'yelpchallenge1'
bucket = s3.Bucket(BUCKET_NAME)

# Display what the bucket currently holds (shown as this cell's output).
bucket.contents

['API/',
 'API/api.py',
 'API/api_exploration.ipynb',
 'Environments/',
 'Environments/environment.yml',
 'Flask_App/',
 'Flask_App/Pipfile',
 'Flask_App/__init__.py',
 'Flask_App/app.py',
 'Flask_App/models.py',
 'Flask_App/yelp.py',
 'Model/',
 'Model/vect_1.sav',
 'datasets/',
 'datasets/df.csv',
 'datasets/dtm.csv',
 'datasets/dtm_final.csv',
 'notebooks/',
 'notebooks/data_cleanup.ipynb',
 'notebooks/official_NB.ipynb',
 'notebooks/vectorization_exploration.ipynb',
 'notebooks/yelp_data_initial_exploration.ipynb']

In [None]:
                    ### ***** DO NOT RUN. ******* #### 
                  ### ***** ALREADY INSTALLED. ****** ###

    # Installs the File 'Locally' on SageMaker Instance / Only have to run these once: 

# Copy each raw dataset from the bucket into the notebook's working directory.
# A file that already exists locally is skipped, so re-running this cell
# (for example through "Run All") cannot overwrite a local copy -- in particular
# the df.csv that the cleaning cell below rewrites in place. Delete the local
# file first to force a fresh download.
# NOTE(review): the bucket listing printed above shows no 'datasets/user.json'
# or 'datasets/review.json' keys -- confirm that these two objects exist.
for key, local_name in [
    ('datasets/df.csv', 'df.csv'),
    ('datasets/user.json', 'user.json'),
    ('datasets/review.json', 'review.json'),
]:
    if not os.path.exists(local_name):
        bucket.get(key, local_name)

# Cleaning Data: Complete as of ***8:14 PM : 12/19/2019***

Cleaning df.csv & saving Cleaned df.csv

In [None]:
                    ### ***** DO NOT RUN. ******* #### 
                  ### ***** ALREADY COMPLETE. ****** ###
# Further cleaning of df.csv: load it, drop incomplete rows, write it back.
# All columns are kept; the merge cells further down read df['stars'].
df = pd.read_csv('df.csv')

# Drop every row that has a missing value in any column.
df = df.dropna()

# Overwrite df.csv with the cleaned frame. The row index is written as the
# first, unnamed column (the pandas default), so re-reading the file yields an
# 'Unnamed: 0' column.
# The frame is written once, straight to disk: calling to_csv without a path
# only builds the whole CSV as an in-memory string that is then discarded.
df.to_csv(r'df.csv')

Converting user.json & review.json to Pandas DataFrames / Saving as user.csv & review.csv

In [None]:
                              # ******* DO NOT RUN! ******* # 
                            # ***** ALREADY COMPLETE. ****** # 
 # import user.json
def _load_concatenated_json(path):
    """Load a file of newline-separated JSON objects as a list.

    The file is expected to hold JSON objects back to back, each closing brace
    followed by a newline and the next opening brace. A comma is inserted at
    every such boundary and the text is wrapped in brackets so that it parses
    as one JSON array.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed list, one element per object in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the rewritten text is not valid JSON.
    """
    with open(path) as handle:
        # NOTE(review): the whole file is held in memory (plus a rewritten copy)
        # -- confirm this is acceptable for the full-size dataset.
        as_array = "[" + handle.read().replace("}\n{", "},\n{") + "]"
    return json.loads(as_array)


# user.json -> DataFrame 'user_df' -> user.csv
user = _load_concatenated_json('user.json')
user_df = pd.DataFrame(user)
# Written once, straight to disk; to_csv without a path would only build a
# throw-away CSV string in memory.
user_df.to_csv(r'user.csv')

# review.json -> DataFrame 'review_df' -> review.csv
review = _load_concatenated_json('review.json')
review_df = pd.DataFrame(review)
review_df.to_csv(r'review.csv')

# Data Merging: Complete as of ***8:14 PM : 12/19/2019***

In [None]:
            # ***** New DTM DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #
# Attach the star ratings from `df` to the original document-term matrix.
# Relies on `df` having been loaded by an earlier cell.
# Read-in dtm.csv (Original)
dtm = pd.read_csv('dtm.csv')

# Keep a separate handle on the ratings column.
stars = df['stars']

# Add the ratings to dtm as a 'stars' column.
# NOTE(review): assigning a Series aligns on index labels, not row position --
# confirm that df and dtm share the same row index.
dtm['stars']=df['stars']

# Rotate the column list so that the last column comes first.
# Assumes dtm.csv had no 'stars' column, so 'stars' is the last one here -- TODO confirm.
cols = list(dtm.columns)
cols = [cols[-1]] + cols[:-1]
dtm = dtm[cols]

# Drop the ' ', '  -PRON-' and 'year -PRON-' columns.
dtm = dtm.drop(columns=[' ', '  -PRON-', 'year -PRON-'])
# Cut df['stars'] down to its first 135,000 rows to work around a Memory Error.
stars = df.stars[0:135000]
# Has no visible effect here: this is not the cell's last expression.
stars.shape
# NOTE(review): `dtm2` is only created in the next cell (pd.read_csv('dtm2.csv')),
# so on a fresh kernel this line raises NameError -- confirm whether `dtm` was meant.
dtm2['stars']=df['stars'][0:135000]

In [None]:
             # ***** New DTM2 DF HAS BEEN CREATED. DO NOT RUN THIS CELL **** #
    
    # Read-in dtm2.csv(Old)
# Rebuild dtm2 and derive dtm_final from it. Relies on `df` having been loaded
# by an earlier cell.
dtm2 = pd.read_csv('dtm2.csv')

# Attach the ratings, rotate the last column to the front, then drop 'stars' again.
# NOTE(review): if dtm2.csv has no 'stars' column these steps cancel each other
# out -- confirm whether they can be deleted.
dtm2['stars'] = df['stars']
cols = list(dtm2.columns)
cols = [cols[-1]] + cols[:-1]
dtm2 = dtm2[cols]
dtm2 = dtm2.drop(columns=['stars'])

# Drop the listed token columns.
dtm2 = dtm2.drop(columns=[' ', '  '])
dtm2 = dtm2.drop(columns=['  -PRON-', '  i', '  the', '  this', '$', "'s"])

# Overwrite dtm2.csv. Written once, straight to disk: to_csv without a path only
# builds the whole CSV as an in-memory string, which is costly for a frame that
# already triggered Memory Errors.
dtm2.to_csv(r'dtm2.csv')

# Only the first 135,000 ratings are used, to work around a Memory Error.
stars = df.stars[0:135000]

# NOTE(review): the assignment aligns on index labels, so rows of dtm2 whose
# label is absent from `stars` receive NaN -- confirm this is intended.
dtm2['stars'] = stars

# dtm_final is a second name for the same frame (no copy is made).
dtm_final = dtm2
dtm_final.to_csv(r'dtm_final.csv')


In [None]:
# Read-in dtm_final.csv (FINAL)
#dtm_final = pd.read_csv('dtm_final.csv')

# Clean / Analyze user.csv: Complete as of ***10:14 PM 12/19/2019***

In [None]:
# Load the CSV exports of the user and review tables from the working directory.
user = pd.read_csv(filepath_or_buffer='user.csv')
review = pd.read_csv(filepath_or_buffer='review.csv')

In [None]:
# Sanity-check the user frame after loading:
# missing values per column first, then the frame's shape.
user_null_counts = user.isna().sum()
pp(user_null_counts)
user_shape = user.shape
pp(user_shape)

Three Problems: 
**Problem 1:**
user['Unnamed: 0'] should not exist. 

**Problem 2:**
user['elite'] has 1,565,761 Missing Values. 

**Problem 3:** 
user['name'] has 3 Missing Values. 

**Solution?:**
Drop the user['Unnamed: 0'] and user['elite'] columns.

Drop Missing Values.

In [None]:
# Solution: drop the stray 'Unnamed: 0' column and the 'elite' column (both
# flagged in the notes above), then drop every row that still has a missing value.
user = user.drop(columns=['Unnamed: 0', 'elite'])
user = user.dropna()

# Overwrite user.csv with the cleaned frame. Written once, straight to disk:
# to_csv without a path only builds a throw-away CSV string in memory.
user.to_csv(r'user.csv')

In [None]:
# Display the column labels of the user frame.
user.columns

In [None]:
# Drop the columns of the user frame that are not used further on.
user = user.drop(columns=[
    'average_stars',
    'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot',
    'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos',
    'compliment_plain', 'compliment_profile', 'compliment_writer',
    'cool', 'friends', 'funny', 'useful',
])

# Overwrite user.csv with the trimmed frame. Written once, straight to disk:
# to_csv without a path only builds a throw-away CSV string in memory.
user.to_csv(r'user.csv')

# Clean / Analyze review.csv: 

In [None]:
# Sanity-check the review frame after loading:
# missing values per column first, then the frame's shape.
review_null_counts = review.isna().sum()
pp(review_null_counts)
review_shape = review.shape
pp(review_shape)

**Minor Problem(s) with a Simple Solution**

**Problems?:** 
review['date', 'funny', 'review_id', 'stars', 'text', 'useful'] Columns have NaN's. 

review['Unnamed: 0'] Not Supposed to be there.

**Solution?:**
Drop Missing Values from review DataFrame. 

In [None]:
# Solution: drop every row with a missing value, then remove the stray
# 'Unnamed: 0' column together with 'stars' and 'business_id'.
review = review.dropna()
review = review.drop(columns=['Unnamed: 0', 'stars', 'business_id'])

# Overwrite review.csv with the cleaned frame. Written once, straight to disk:
# to_csv without a path only builds a throw-away CSV string in memory.
review.to_csv(r'review.csv')

In [None]:
# Remove the 'text' column from the review frame and persist the result.
review = review.drop(columns=['text'])

# Written once, straight to disk: to_csv without a path only builds a
# throw-away CSV string in memory.
review.to_csv(r'review.csv')

In [None]:
# Copy the 'text' and 'tokens' columns of df onto the review frame.
# NOTE(review): each assignment aligns on index labels, not on review_id --
# confirm that df and review rows really correspond.
for column in ('text', 'tokens'):
    review[column] = df[column]

In [None]:
# Preview the first five rows of the review frame (head() defaults to 5).
review.head()

# Combining review.csv & df.csv

**Description**: 

Combining based on their *Unique Account ID's.*
The end Product will be One DataFrame Consisting of Each Account:
- **Name**, 
- **User_ID**,
- **Review_ID**,
- **Text**,
- **That Users respective review(s)**,
- **Interactions that Review received (i.e., Cool, Funny, Useful)**  

The goal of the model is to let a user type in the review they want to post on Yelp and predict which types of interaction (Cool, Funny, Useful) it would likely receive, along with the total number of each. The model's accuracy will be displayed beside the prediction. 

In [None]:
# Select the modelling columns from the review frame, in their final order.
# NOTE(review): 'tokens' is not selected here, yet the recorded output of
# final.columns further down lists it -- confirm which column set final.csv
# is supposed to contain.
final_df = review[['user_id', 'date', 'review_id', 'useful', 'funny', 'cool', 'text']]

# Save the selection as final.csv. Written once, straight to disk: to_csv
# without a path only builds a throw-away CSV string in memory.
final_df.to_csv(r'final.csv')

In [None]:
# Missing values per column of final_df, followed by the frame's shape.
final_df_null_counts = final_df.isna().sum()
pp(final_df_null_counts)
final_df_shape = final_df.shape
pp(final_df_shape)

In [None]:
# Keep only the fully populated rows: a row is removed when any of its columns
# (not just 'text') holds a missing value.
final = final_df.dropna(axis=0, how='any')

In [None]:
# Missing values per column of the cleaned frame, followed by its shape.
final_null_counts = final.isna().sum()
pp(final_null_counts)
final_shape = final.shape
pp(final_shape)

In [None]:
# Save the null-free frame as final.csv. Written once, straight to disk:
# to_csv without a path only builds a throw-away CSV string in memory.
final.to_csv(r'final.csv')

In [None]:
# Display the dtype of every column in the final frame.
final.dtypes

In [None]:
# Store the 'cool' counts as 64-bit floats.
final['cool'] = final['cool'].astype('float64')

In [None]:
# Display the column dtypes again; 'cool' was cast to float in the cell above.
final.dtypes

In [None]:
# Save the frame as final.csv again after the dtype change. Written once,
# straight to disk: to_csv without a path only builds a throw-away CSV string
# in memory.
final.to_csv(r'final.csv')

In [8]:
# Reload final.csv and discard the 'Unnamed: 0' column that holds the
# row index written by the previous save.
final = (
    pd.read_csv('final.csv')
    .drop(columns=['Unnamed: 0'])
)

In [9]:
# Display the column labels of the reloaded final frame.
final.columns

Index(['user_id', 'date', 'review_id', 'useful', 'funny', 'cool', 'text',
       'tokens'],
      dtype='object')

In [10]:
# Save the reloaded frame back to final.csv. Written once, straight to disk:
# to_csv without a path only builds a throw-away CSV string in memory.
# NOTE(review): the row index is written again, so the next read_csv of this
# file will show an 'Unnamed: 0' column once more -- confirm this is wanted.
final.to_csv(r'final.csv')

# Some Visualizations: 

In [None]:
# Plotting stack
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Use the white-grid seaborn style for the figures below.
sns.set_style("whitegrid")

# Silence every warning from here on (added to hide seaborn's warnings).
warnings.filterwarnings("ignore")

In [None]:
# Distribution of the 'useful' counts, drawn on a 12.8 x 6 figure.
plt.figure(figsize=(12.8, 6))
useful_ax = sns.distplot(final['useful'])
# The trailing semicolon keeps the Text repr out of the cell output.
useful_ax.set_title('Useful Interaction Distribution');