# YELP DATASET


## Goal
1. Rename columns
2. Reset Index
3. Get Restaurant Reviews + Businesses only
4. Get Users that have more than 50 reviews only
5. Clean text (lowercase, lemmatize, remove punctuation)
6. Add features to the user pandas DataFrame
    - Elite status --- 1: Yes, 0: No
    - Fill nulls with 0
    - Number of friends
    - Number of tips given --- merge with the tips DataFrame
7. Check for continuous vs. discrete data in the DataFrames

    

Dataset Source: https://www.yelp.com/dataset_challenge

In [None]:
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition.online_lda import LatentDirichletAllocation
import string
from sklearn.pipeline import Pipeline

Populating the interactive namespace from numpy and matplotlib


# Read Data

In [None]:
# Load the four Yelp tables from their CSV exports (paths are relative to the
# working directory).
user, business, review, tip = map(
    pd.read_csv,
    ('csv/user.csv', 'csv/business.csv', 'csv/review.csv', 'csv/tip.csv'),
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
def rename_column(file):
    """Return normalised versions of the given column labels.

    Every '.' and every ' ' in a label is replaced by '_', and the result is
    lower-cased.

    Parameters
    ----------
    file : iterable of str
        Column labels to normalise (for example ``DataFrame.columns``).

    Returns
    -------
    list of str
        The normalised labels, in their original order.
    """
    return [
        label.replace('.', '_').replace(' ', '_').lower()
        for label in file
    ]

In [None]:
# Normalise the column labels of the user, review and business tables.
user.columns = rename_column(user.columns)
review.columns = rename_column(review.columns)
# For the business table, additionally remove every occurrence of
# 'attributes_' from the normalised labels.
business.columns = [
    label.replace('attributes_', '')
    for label in rename_column(business.columns)
]

In [None]:
# Keep only users and businesses with more than 50 reviews.
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0; a boolean
# mask passed to `.loc` selects the same rows.
user = user.loc[user['review_count'] > 50]
business = business.loc[business['review_count'] > 50]

In [None]:
# Renumber the rows from 0 after filtering.  The previous index is kept as an
# 'index' column, which is dropped again before export.
user = user.reset_index(drop=False)
business = business.reset_index(drop=False)

# Get Restaurant Businesses and Reviews

In [None]:
# Show the first three values of the categories column.
business.categories[:3]

In [None]:
# Two-column table pairing each business id with its categories value
# (exposed under the column name 'category').
all_businesses = pd.DataFrame(
    {'business_id': business['business_id'], 'category': business['categories']},
    columns=['business_id', 'category'],
)

In [None]:
# Row positions of businesses whose category text does not contain
# "Restaurant".  The literal, case-sensitive substring test is done on the
# whole column at once; na=False treats a missing category as "not a
# restaurant" instead of raising TypeError as the per-row `in` test did.
is_restaurant = all_businesses['category'].str.contains(
    'Restaurant', regex=False, na=False
)
not_restaurant = np.flatnonzero(~is_restaurant.values).tolist()

In [None]:
# Translate the row positions into index labels and remove those rows in place.
all_businesses.drop(
    labels=all_businesses.index[not_restaurant], axis=0, inplace=True
)

In [None]:
# `restaurant` is a second name for all_businesses (not a copy); only the
# restaurant rows remain after the non-restaurant rows were dropped.
restaurant = all_businesses

In [None]:
# Renumber the rows from 0; the previous index is kept as an 'index' column.
restaurant = restaurant.reset_index() # reset the index 

In [None]:
# Preview the first two restaurant rows.
restaurant.head(2)

In [None]:
# Boolean mask: True for every review whose business_id belongs to a restaurant.
review_indices = review['business_id'].isin(restaurant['business_id'].values)

In [None]:
# Keep only the reviews selected by the restaurant mask.
restaurant_review = review.loc[review_indices]

In [None]:
# Preview the first two restaurant reviews.
restaurant_review.head(2)

In [None]:
# Number of restaurant reviews kept.
len(restaurant_review)

In [None]:
#clean text data
def clean_text(x):
    """Lower-case, strip punctuation from, and lemmatize a piece of text.

    Parameters
    ----------
    x : str
        Raw text; it is split on whitespace into tokens.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.  A token consisting only
        of punctuation becomes an empty string and is kept in the output.
    """
    # NOTE(review): WordNetLemmatizer is not imported in the visible cells;
    # presumably it is nltk.stem.WordNetLemmatizer -- confirm and import it.
    wordnet = WordNetLemmatizer()
    # Escape the punctuation before placing it in a character class.  Left
    # unescaped, the '\' in string.punctuation escapes the following ']', so
    # backslashes were never removed from the text.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
    # str.split() already discards newlines together with other whitespace,
    # so no separate newline removal is needed per token.
    words = [punctuation.sub('', token.lower()) for token in x.split()]
    return " ".join(wordnet.lemmatize(word) for word in words)

In [None]:
# Renumber the review rows from 0; the previous index is kept as an 'index'
# column, which is dropped again before export.
restaurant_review = restaurant_review.reset_index(drop=False)

# Add Features to the User DataFrame

In [None]:
# Preview the first two user rows before adding features.
user.head(2)

In [None]:
# User membership period in days, measured up to the fixed reference date
# '2016-11'.  The whole column is parsed and subtracted in one vectorised
# operation instead of building one Timestamp per row in a Python loop.
user['yelping_period'] = (
    pd.to_datetime('2016-11') - pd.to_datetime(user['yelping_since'])
).dt.days

In [None]:
# First year of elite status: remove the square brackets from the `elite`
# text, take what precedes the first comma, and convert it to int.  An empty
# result (no elite years listed) is stored as 0.
user['elite_since'] = [
    int(first_year) if first_year != '' else 0
    for first_year in (
        raw.replace('[', '').replace(']', '').split(',')[0]
        for raw in user['elite']
    )
]

In [None]:
# Flag members --- 1: elite (a non-zero first elite year), 0: never elite.
user['elite_status'] = (user['elite_since'] != 0).astype('int64')

In [None]:
# Get number of friends
def _count_friends(raw):
    """Return how many entries the `friends` value *raw* lists."""
    if not isinstance(raw, str):
        # Already a real sequence rather than text: its length is the count.
        return len(raw)
    # The table is loaded with read_csv, so a list arrives as text and len()
    # of it would count characters.  Presumably the text is bracketed and
    # comma-separated like the `elite` column -- TODO confirm the format.
    entries = raw.strip().strip('[]').strip()
    return len(entries.split(',')) if entries else 0


user['n_friends'] = [_count_friends(raw) for raw in user['friends']]

In [None]:
# Count the tips written by each user: one row per user_id with the number
# of tip rows carrying that id in a 'tip_count' column.
tip_count = (
    tip['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='tip_count')
)

In [None]:
# Attach each user's tip count.  The left join keeps every user; users with
# no tips get a missing tip_count.
user = pd.merge(user, tip_count, how='left', on='user_id')

In [None]:
# Replace missing values with 0.  This applies to every column of the user
# table, not only tip_count.
user = user.fillna(value=0)

In [None]:
# Preview the last two user rows after adding the new features.
user.tail(2)

In [None]:
# List the column labels currently on the user table.
(user.columns)

# Check All DataFrames Before Export

In [None]:
# Remove the 'index' column left by reset_index, and the 'type' column.
business = business.drop(labels=['index', 'type'], axis='columns')

In [None]:
# Remove the 'index' column left by reset_index, and the 'type' column.
restaurant_review = restaurant_review.drop(labels=['index', 'type'], axis='columns')

In [None]:
# Drop columns that are not exported: the 'index' column left by
# reset_index, 'type', 'name', the raw 'friends' text and the intermediate
# 'elite_since' year.
user = user.drop(
    labels=['index', 'type', 'name', 'friends', 'elite_since'],
    axis='columns',
)

In [None]:
# Final check of the user table: first two rows.
user.head(2)

In [None]:
# Final check of the restaurant review table: first two rows.
restaurant_review.head(2)

In [None]:
# Final check of the business table: first two rows.
business.head(2)

In [None]:
# Show the first row of the tip table.
tip.head(1)

# Minor Tweak

# Export to pickle

In [None]:
# Write the three cleaned tables to pickle files under data/.
for frame, path in (
    (restaurant_review, 'data/review'),
    (business, 'data/business'),
    (user, 'data/user'),
):
    frame.to_pickle(path)


In [None]:
# Read the exported user pickle back in so the written file can be inspected.
checking = pd.read_pickle('data/user')

In [None]:
# Preview the first two rows of the reloaded user table.
checking.head(2)