# User Data Inspection

In [1]:
import random
random.seed(109)

from pprint import pprint

import os
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

## User Data

User data containing basic info and several numeric features about their interactions on Yelp.

In [2]:
pd.options.display.max_columns = 30

In [3]:
user_df = pd.read_feather('data/yelp_user.feather')

In [4]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB


In [5]:
user_df.head(3)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91,250,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74,1145,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32,89,13,10,17,3,66,96,119,119,35,18


### Filtering (keep only the reviewers from the processed review data)

Load cleaned review data:

In [6]:
review_cleaned_df = pd.read_feather('data/yelp_review_cleaned.feather')

In [7]:
review_cleaned_df.head(1)

Unnamed: 0,review_id,user_id,business_id,text,date,stars,useful,funny,cool
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,-0.577114,-0.36596,-0.184315,-0.229438


In [8]:
user_filtered_df = user_df.loc[user_df['user_id'].isin(review_cleaned_df['user_id'])].copy()

In [9]:
del user_df  # save memory
user_df = user_filtered_df

In [10]:
user_filtered_df.head(3)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91,250,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74,1145,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32,89,13,10,17,3,66,96,119,119,35,18


In [11]:
user_df.shape

(1532223, 22)

### Data Missingness

In [12]:
user_df.isnull().sum()

user_id               0
name                  0
review_count          0
yelping_since         0
useful                0
funny                 0
cool                  0
elite                 0
friends               0
fans                  0
average_stars         0
compliment_hot        0
compliment_more       0
compliment_profile    0
compliment_cute       0
compliment_list       0
compliment_note       0
compliment_plain      0
compliment_cool       0
compliment_funny      0
compliment_writer     0
compliment_photos     0
dtype: int64

We observe no missing data.

### Feature Simplification

We note that features `elite` and `friends` are lists of values. In the context of our model (recommendation system), the specific values do not matter that much, so we will aggretate the said variables by transforming them into counts. Aggregating instead of splitting can also potentially reduce overfitting.

In [13]:
user_df['elite_count'] = user_df['elite'].str.count(',') + 1
user_df['friends_count'] = user_df['friends'].str.count(',') + 1

In [14]:
user_df.drop(['elite', 'friends'], axis=1, inplace=True)

In [15]:
user_df.loc[:2, ['elite_count', 'friends_count']]

Unnamed: 0,elite_count,friends_count
0,1,14995
1,14,4646
2,5,381


We will also convert `yelping_since` into `membership_days` for how long a user has been a member. We will use March 28, 2023 as the baseline "today":

In [16]:
today = pd.to_datetime('2023-03-28')

In [17]:
user_df['membership_days'] = (today - pd.to_datetime(user_df['yelping_since'])).dt.days
user_df['membership_days'].head(3)

0    5905
1    5174
2    5358
Name: membership_days, dtype: int64

In [18]:
user_df.drop('yelping_since', axis=1, inplace=True)

### Feature Scaling

We will scale all numeric features:

In [19]:
num_cols = user_df.columns[user_df.dtypes.apply(lambda x: x in ['int64', 'float64'])].to_list()
num_cols

['review_count',
 'useful',
 'funny',
 'cool',
 'fans',
 'average_stars',
 'compliment_hot',
 'compliment_more',
 'compliment_profile',
 'compliment_cute',
 'compliment_list',
 'compliment_note',
 'compliment_plain',
 'compliment_cool',
 'compliment_funny',
 'compliment_writer',
 'compliment_photos',
 'elite_count',
 'friends_count',
 'membership_days']

In [20]:
scaler = StandardScaler()
scaled = scaler.fit_transform(user_df.loc[:, num_cols])

Put scaled values back to original data frame:

In [21]:
# drop original features first
user_df.drop(num_cols, axis=1, inplace=True)
# put in scaled values
user_clean_df = pd.concat([user_df, pd.DataFrame(scaled, columns=num_cols, index=user_df.index)], axis=1)

In [22]:
user_clean_df.shape

(1532223, 22)

In [23]:
user_clean_df.head(3)

Unnamed: 0,user_id,name,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,elite_count,friends_count,membership_days
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,6.06481,9.945867,2.699783,9.392911,12.967617,0.197086,3.021775,4.490039,3.23618,4.362515,1.580633,3.388603,6.378814,4.315452,4.315452,6.666662,1.674484,-0.182042,96.049122,2.455114
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,46.827381,59.731661,28.437743,42.913078,153.332297,0.041537,13.93533,18.311122,10.856576,12.253229,22.129803,27.155924,53.51618,29.115568,29.115568,42.620595,18.240911,12.982835,29.506322,1.805462
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,6.934875,2.825086,2.15699,1.533694,2.456155,-0.342759,1.058554,0.8785,0.577902,1.315606,0.257725,0.945646,0.701076,1.075797,1.075797,0.945443,0.1548,3.86869,2.082895,1.968986


### Save Final Data Frame

In [24]:
user_clean_df.reset_index(drop=True).to_feather('data/yelp_user_cleaned.feather')

In [25]:
user_clean_df.shape

(1532223, 22)