# YELP DATASET


## Goal
1. Rename columns
2. Reset Index
3. Get Restaurant Reviews + Businesses only
4. Get Users that have more than 50 reviews only
5. Clean Text ( lower , lemmatizing , remove punctuation )
5. Add Features to User Panda Frame
    - Elite Status --- 1: Yes 0:No
    - Fill Null with 0 
    - Number of friends
    - Number of Tips Given --- Merge with tips pandaframe
7. Check to see continuous vs discrete data in pd

    

Dataset Source: https://www.yelp.com/dataset_challenge

In [1]:
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition.online_lda import LatentDirichletAllocation
import string
from sklearn.pipeline import Pipeline

Populating the interactive namespace from numpy and matplotlib


# Read Data

In [2]:
user = pd.read_csv('csv/user.csv')
business = pd.read_csv('csv/business.csv')
review = pd.read_csv('csv/review.csv')
tip = pd.read_csv('csv/tip.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def rename_column(file):
    new_names = list()
    for col in file:
        new_names.append(col.replace('.', '_', len(col)).replace(' ', '_', len(col)).lower())
    return new_names

In [4]:
user.columns = rename_column(user.columns)
business.columns = rename_column(business.columns)
business.columns = [ re.sub('attributes_', '', c) for c in business.columns ]
review.columns = rename_column(review.columns)

In [5]:
user = user.ix[user['review_count'] > 50]
business = business.ix[business['review_count'] > 50]

In [6]:
user = user.reset_index()
business = business.reset_index()

# Get Restaurants Businesses and Reviews

In [7]:
business.categories[:3]

0    ['Breakfast & Brunch', 'Sandwiches', 'Restaura...
1                           ['Restaurants', 'Italian']
2              ['Burgers', 'Fast Food', 'Restaurants']
Name: categories, dtype: object

In [8]:
# create a dataframe with business id and category
all_businesses = pd.concat([business['business_id'], business['categories']], axis=1, keys=['business_id', 'category'])

In [9]:
# get index of businesses that are not restaurants
not_restaurant = [i for i,e in enumerate(all_businesses.values) if "Restaurant" not in e[1]]

In [10]:
all_businesses.drop(all_businesses.index[not_restaurant], inplace=True)

In [11]:
restaurant = all_businesses # after dropping all businesses that are not restaurants business.tail(2)

In [12]:
restaurant = restaurant.reset_index() # reset the index 

In [13]:
restaurant.head(2)

Unnamed: 0,index,business_id,category
0,0,b9WZJp5L1RZr4F1nxclOoQ,"['Breakfast & Brunch', 'Sandwiches', 'Restaura..."
1,1,P1fJb2WQ1mXoiudj8UE44w,"['Restaurants', 'Italian']"


In [14]:
review_indices = review['business_id'].isin(restaurant['business_id'])

In [15]:
restaurant_review = review[review_indices]

In [16]:
restaurant_review.head(2)

Unnamed: 0,user_id,review_id,text,votes_cool,business_id,votes_funny,stars,date,type,votes_useful
121,PrMlXX6fbMsJie9ausN41g,Phd_OwFhKQptiVL5Tbl-Lw,If you want a true understanding of Pittsburgh...,1,b9WZJp5L1RZr4F1nxclOoQ,0,3,2007-03-31,review,2
122,FNbm3ycU2BF8C17UFfWzOg,uSoZMwdnhiegEpbXCwWATw,"Good Luck getting a seat, that's all I have to...",0,b9WZJp5L1RZr4F1nxclOoQ,0,4,2007-08-02,review,0


In [17]:
len(restaurant_review)

1366824

In [18]:
#clean text data
def clean_text(x):
    wordnet = WordNetLemmatizer()
    words =  [re.sub('['+string.punctuation+']', '', i.lower().replace('\n','')) for i in x.split()]
    lemmetized_words = [wordnet.lemmatize(word) for word in words ]
    return " ".join(lemmetized_words)

In [19]:
restaurant_review = restaurant_review.reset_index()

# Add Features to User PD

In [20]:
user.head(2)

Unnamed: 0,index,yelping_since,compliments_plain,review_count,friends,compliments_cute,compliments_writer,fans,compliments_note,type,...,compliments_more,elite,name,user_id,votes_cool,compliments_list,votes_funny,compliments_photos,compliments_funny,votes_useful
0,0,2004-10,25.0,108,"['rpOyqD_893cqmDAtJLbdog', '4U9kSBLuBDU391x6bx...",15.0,9.0,69,20.0,user,...,3.0,"[2005, 2006]",Russel,18kPq7GPye-YQ3LyKyAZPw,246,,167,14.0,11.0,282
1,1,2004-10,970.0,1292,"['18kPq7GPye-YQ3LyKyAZPw', '4U9kSBLuBDU391x6bx...",204.0,346.0,1345,611.0,user,...,137.0,"[2005, 2006, 2007, 2008, 2009, 2010, 2011, 201...",Jeremy,rpOyqD_893cqmDAtJLbdog,12091,38.0,8399,361.0,594.0,15242


In [21]:
# User membership period in days
user['yelping_period'] = [(pd.to_datetime('2016-11') - pd.Timestamp(i)).days for i in user['yelping_since'] ]

In [22]:
# Get the first year of elite status
user['elite_since'] = [i.replace('[','').replace(']','').split(',')[0] for i in user['elite']]
user['elite_since'] = [int(i) if i!= '' else 0 for i in user['elite_since'] ]

In [23]:
# Label members --- 1: Elite 0: None Elite
user['elite_status'] = [0 if i == 0 else 1 for i in user['elite_since']]

In [24]:
# Get number of friends 
user['n_friends'] = [len(i) for i in user['friends']]

In [25]:
# Count number of tips each user gives
tip_count = pd.DataFrame(tip['user_id'].value_counts())
tip_count = tip_count.reset_index()
tip_count.columns = ['user_id','tip_count']

In [26]:
# Merge tip count and user
user = user.merge(tip_count,how='left', left_on='user_id', right_on='user_id')

In [27]:
# Fill Null with 0 
user = user.fillna(0)

In [28]:
user.tail(2)

Unnamed: 0,index,yelping_since,compliments_plain,review_count,friends,compliments_cute,compliments_writer,fans,compliments_note,type,...,compliments_list,votes_funny,compliments_photos,compliments_funny,votes_useful,yelping_period,elite_since,elite_status,n_friends,tip_count
70993,686505,2013-03,7.0,126,['6yVMuF3F-_BTovLIw3z0sA'],0.0,0.0,0,1.0,user,...,0.0,14,0.0,0.0,104,1341,0,0,26,0.0
70994,686552,2015-09,0.0,51,[],0.0,0.0,0,0.0,user,...,0.0,4,0.0,0.0,6,427,0,0,2,0.0


In [29]:
(user.columns)

Index(['index', 'yelping_since', 'compliments_plain', 'review_count',
       'friends', 'compliments_cute', 'compliments_writer', 'fans',
       'compliments_note', 'type', 'compliments_hot', 'compliments_cool',
       'compliments_profile', 'average_stars', 'compliments_more', 'elite',
       'name', 'user_id', 'votes_cool', 'compliments_list', 'votes_funny',
       'compliments_photos', 'compliments_funny', 'votes_useful',
       'yelping_period', 'elite_since', 'elite_status', 'n_friends',
       'tip_count'],
      dtype='object')

# Check all PD before Export

In [30]:
business = business.drop(['index','type'],axis = 1)

In [31]:
restaurant_review = restaurant_review.drop(['index','type'],axis = 1)

In [32]:
# drop unnecessary column 
user = user.drop(['index','type','name','friends','elite_since'],axis = 1)

In [33]:
user.head(2)

Unnamed: 0,yelping_since,compliments_plain,review_count,compliments_cute,compliments_writer,fans,compliments_note,compliments_hot,compliments_cool,compliments_profile,...,votes_cool,compliments_list,votes_funny,compliments_photos,compliments_funny,votes_useful,yelping_period,elite_status,n_friends,tip_count
0,2004-10,25.0,108,15.0,9.0,69,20.0,48.0,76.0,8.0,...,246,0.0,167,14.0,11.0,282,4414,1,5200,0.0
1,2004-10,970.0,1292,204.0,346.0,1345,611.0,1111.0,1675.0,117.0,...,12091,38.0,8399,361.0,594.0,15242,4414,1,50414,6.0


In [34]:
restaurant_review.head(2)

Unnamed: 0,user_id,review_id,text,votes_cool,business_id,votes_funny,stars,date,votes_useful
0,PrMlXX6fbMsJie9ausN41g,Phd_OwFhKQptiVL5Tbl-Lw,If you want a true understanding of Pittsburgh...,1,b9WZJp5L1RZr4F1nxclOoQ,0,3,2007-03-31,2
1,FNbm3ycU2BF8C17UFfWzOg,uSoZMwdnhiegEpbXCwWATw,"Good Luck getting a seat, that's all I have to...",0,b9WZJp5L1RZr4F1nxclOoQ,0,4,2007-08-02,0


In [35]:
business.head(2)

Unnamed: 0,ambience_divey,dietary_restrictions_vegan,happy_hour,hours_thursday_open,order_at_counter,hair_types_specialized_in_africanamerican,hair_types_specialized_in_kids,byob,hours_friday_open,good_for_latenight,...,noise_level,smoking,attire,hair_types_specialized_in_curly,good_for_groups,neighborhoods,open_24_hours,ambience_romantic,music_jukebox,ambience_upscale
0,True,,,06:00,True,,,False,06:00,False,...,average,,casual,,False,[],,False,,False
1,False,,,11:00,,,,,11:00,False,...,average,,casual,,True,['Carnegie'],,False,,False


In [36]:
tip.head(1)

Unnamed: 0,user_id,text,business_id,likes,date,type
0,ZxdojCZgZhw36xuSGaryRg,Pizza is garbage. Hoagies are excellent.,5UmKMjUEUNdYWqANhGckJw,0,2016-07-17,tip


# Minor Tweak

# Export to pickle

In [37]:
restaurant_review.to_pickle('data/review') 
business.to_pickle('data/business')
user.to_pickle('data/user')


In [38]:
checking = pd.read_pickle('data/user')

In [39]:
checking.head(2)

Unnamed: 0,yelping_since,compliments_plain,review_count,compliments_cute,compliments_writer,fans,compliments_note,compliments_hot,compliments_cool,compliments_profile,...,votes_cool,compliments_list,votes_funny,compliments_photos,compliments_funny,votes_useful,yelping_period,elite_status,n_friends,tip_count
0,2004-10,25.0,108,15.0,9.0,69,20.0,48.0,76.0,8.0,...,246,0.0,167,14.0,11.0,282,4414,1,5200,0.0
1,2004-10,970.0,1292,204.0,346.0,1345,611.0,1111.0,1675.0,117.0,...,12091,38.0,8399,361.0,594.0,15242,4414,1,50414,6.0
