In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import sys
import math
import time

In [2]:
def getDataframe(filePath):
    """Load the CSV at `filePath` into a pandas DataFrame.

    Each attribute column of the result can be obtained via its header
    name, e.g. ``y = data['y']``.
    """
    return pd.read_csv(filePath)

In [3]:
# Load the users table and keep its column names for the per-feature loops.
users_df = getDataframe('users.csv')
features = users_df.columns.values

In [4]:
# Report, for every feature, its name followed by how many distinct values it holds.
for feature in features:
    distinct_values = users_df[feature].unique()
    print(feature, len(distinct_values), sep='\n')

average_stars
375
compliment_cool
445
compliment_cute
100
compliment_funny
445
compliment_hot
367
compliment_list
71
compliment_more
127
compliment_note
294
compliment_photos
239
compliment_plain
426
compliment_profile
109
compliment_writer
271
cool
803
elite
1280
fans
251
friends
28903
funny
716
name
9173
review_count
885
useful
1052
user_id
41720
yelping_since
4284


In [5]:
# Dimensions of the users table as (rows, columns).
users_df.shape

(41720, 22)

In [6]:
# Per feature: a separator line, the feature name, then how often each value
# occurs (missing values are counted as well).
for feature in features:
    print('---------------------------------', feature, sep='\n')
    frequencies = users_df[feature].value_counts(dropna=False)
    print(frequencies)

---------------------------------
average_stars
5.00    7131
1.00    2149
4.00    2056
3.00    1893
3.67    1011
4.50     728
2.00     686
3.50     676
4.33     547
4.20     423
4.67     416
2.33     409
3.33     390
3.75     354
4.25     346
3.80     300
3.40     295
3.86     273
3.83     273
2.50     273
3.60     247
4.75     244
4.13     237
3.88     230
4.43     229
2.67     225
3.71     219
4.14     213
3.25     212
4.40     206
        ... 
4.98       2
1.36       2
1.74       2
2.34       2
1.09       1
1.97       1
1.65       1
1.30       1
2.02       1
2.01       1
1.61       1
1.68       1
1.87       1
1.62       1
1.96       1
1.31       1
1.59       1
1.32       1
1.99       1
1.66       1
1.42       1
1.45       1
1.72       1
1.07       1
2.49       1
1.58       1
1.53       1
1.18       1
1.23       1
1.19       1
Name: average_stars, Length: 375, dtype: int64
---------------------------------
compliment_cool
0        33720
1         2796
2          998
3          558
4 

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [7]:
# drop features
# Columns removed from the table: every compliment_* counter plus the
# `friends` and `name` columns.
elements_to_delete = [
    'compliment_cool',
    'compliment_cute',
    'compliment_funny',
    'compliment_hot',
    'compliment_list',
    'compliment_more',
    'compliment_note',
    'compliment_photos',
    'compliment_plain',
    'compliment_profile',
    'compliment_writer',
    'friends',
    'name',
]
# errors='ignore' skips labels that are no longer present, so running this
# cell a second time (after the columns are already gone) does not raise a
# KeyError. The drop stays in place so any other reference to the frame
# sees the same result as before.
users_df.drop(elements_to_delete, inplace=True, axis=1, errors='ignore')
users_df.columns.values

array(['average_stars', 'cool', 'elite', 'fans', 'funny', 'review_count',
       'useful', 'user_id', 'yelping_since'], dtype=object)

In [8]:
# factorize elite feature
# Encode every distinct `elite` value as an integer code. With sort=True the
# codes follow the sorted order of the distinct values; missing values get -1.
# The codes are assigned back as a whole column instead of calling
# `.replace(..., inplace=True)` on `users_df['elite']`: that call mutates a
# selected Series, which is not guaranteed to write through to users_df
# (it does not under pandas copy-on-write), and it performs one replacement
# per distinct value.
elite_codes, uniques = pd.factorize(users_df['elite'], sort=True)
users_df['elite'] = elite_codes
users_df.head()

Unnamed: 0,average_stars,cool,elite,fans,funny,review_count,useful,user_id,yelping_since
0,2.83,0,1279,0,2,6,7,UxfpKHGO2dfQCdS9xLLJow,2012-04-25
1,3.0,0,1279,0,0,4,0,Kr5NDQFPPB_01-5CDmSqVg,2015-11-30
2,3.09,0,1279,1,0,10,2,wfoeMtriLwZsdRzcxNTaFA,2012-11-13
3,4.0,0,1279,0,0,4,0,aXb0kCIsIbPEEUSGomrrmA,2014-09-10
4,4.0,0,1279,0,0,1,0,sLrX2KGu3lc_JczAnsg0_Q,2015-03-22


In [9]:
current_seconds = time.time()

# replace yelping_since date with seconds
# The whole column is parsed in one vectorized call rather than row by row
# through Series.iteritems(), which no longer exists in pandas 2.x.
# Dates are interpreted as midnight UTC, so the result does not depend on the
# machine's local timezone or DST rules (time.mktime used local time).
# Missing dates propagate as NaN instead of raising.
joined_at = pd.to_datetime(users_df['yelping_since'], format='%Y-%m-%d')
joined_epoch_seconds = (joined_at - pd.Timestamp('1970-01-01')).dt.total_seconds()

# Age of the account in seconds, measured from the moment this cell runs.
users_df['yelping_since'] = current_seconds - joined_epoch_seconds

In [10]:
# Preview the first rows to check the yelping_since conversion.
users_df.head()

Unnamed: 0,average_stars,cool,elite,fans,funny,review_count,useful,user_id,yelping_since
0,2.83,0,1279,0,2,6,7,UxfpKHGO2dfQCdS9xLLJow,280536000.0
1,3.0,0,1279,0,0,4,0,Kr5NDQFPPB_01-5CDmSqVg,167003000.0
2,3.09,0,1279,1,0,10,2,wfoeMtriLwZsdRzcxNTaFA,263079000.0
3,4.0,0,1279,0,0,4,0,aXb0kCIsIbPEEUSGomrrmA,205541000.0
4,4.0,0,1279,0,0,1,0,sLrX2KGu3lc_JczAnsg0_Q,188865000.0


In [11]:
# Columns to standardize; every other column is left as it is.
features = ['cool', 'fans', 'funny', 'useful', 'yelping_since']

# Z-score each listed column: subtract its mean, then divide by its
# standard deviation.
for feature in features:
    column = users_df[feature]
    centered = column - column.mean()
    users_df[feature] = centered / column.std()

In [12]:
# Preview the first rows after normalization.
users_df.head()

Unnamed: 0,average_stars,cool,elite,fans,funny,review_count,useful,user_id,yelping_since
0,2.83,-0.055987,1279,-0.14582,-0.051436,6,-0.064633,UxfpKHGO2dfQCdS9xLLJow,0.510489
1,3.0,-0.055987,1279,-0.14582,-0.054095,4,-0.071331,Kr5NDQFPPB_01-5CDmSqVg,-0.875382
2,3.09,-0.055987,1279,-0.102155,-0.054095,10,-0.069417,wfoeMtriLwZsdRzcxNTaFA,0.297403
3,4.0,-0.055987,1279,-0.14582,-0.054095,4,-0.071331,aXb0kCIsIbPEEUSGomrrmA,-0.404959
4,4.0,-0.055987,1279,-0.14582,-0.054095,1,-0.071331,sLrX2KGu3lc_JczAnsg0_Q,-0.608508


In [13]:
# Write the processed table to users2.csv as UTF-8, without the index column.
users_df.to_csv("users2.csv", encoding='utf-8', index=False)