In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

To use Git LFS, you must install the Git LFS client.

See https://git-lfs.github.com for more information.

Run: `git lfs install`

In [3]:
# Load the user table from the Feather file (path is relative to the working directory).
# Later cells read its 'philly_share_of_reviews' and 'gender_score' columns.
philly_users = pd.read_feather('user_philly.feather')

In [4]:
# Bucket each user's 'philly_share_of_reviews' into four right-closed ranges:
#   (-inf, .25] -> less_than_or_equal_to_25_percent
#   (.25, .5]   -> more_than_25_up_to_50_percent
#   (.5, .75]   -> more_than_50_up_to_75_percent
#   (.75, inf)  -> more_than_75_percent
# The result is a plain list of label strings, one per row of philly_users,
# in row order, so it can be assigned back as a column.
share_bin_edges = [-np.inf, .25, .5, .75, np.inf]
share_bin_labels = [
    'less_than_or_equal_to_25_percent',
    'more_than_25_up_to_50_percent',
    'more_than_50_up_to_75_percent',
    'more_than_75_percent',
]

# pd.cut is vectorised and, unlike an if/elif chain without a fallback, keeps
# exactly one output slot per input row. include_lowest=True makes the first
# bin closed on the left so that -inf would still fall into the lowest bucket.
share_buckets = pd.cut(
    philly_users['philly_share_of_reviews'],
    bins=share_bin_edges,
    labels=share_bin_labels,
    right=True,
    include_lowest=True,
)

# A missing (NaN) share cannot be placed in any bucket. Fail here with a clear
# message instead of silently producing a list that is shorter than the frame.
n_unbucketed = int(share_buckets.isna().sum())
if n_unbucketed:
    raise ValueError(
        f"{n_unbucketed} row(s) of 'philly_share_of_reviews' are missing (NaN) "
        "and cannot be bucketed"
    )

bucketed_philly_share_of_reviews = share_buckets.tolist()

In [5]:
# Sanity check: list the distinct share-of-reviews bucket labels together with
# the number of users that fell into each one.
np.unique(np.array(bucketed_philly_share_of_reviews), return_counts=True)

(array(['less_than_or_equal_to_25_percent',
        'more_than_25_up_to_50_percent', 'more_than_50_up_to_75_percent',
        'more_than_75_percent'], dtype='<U32'),
 array([124651,  56858,  17534,  46368], dtype=int64))

In [6]:
# Store the bucket labels as a column on the user table. This is a positional
# assignment, so the list must have one entry per row, in row order.
philly_users['bucketed_philly_share_of_reviews'] = np.array(bucketed_philly_share_of_reviews)
# Bare last expression so the notebook renders the updated frame.
philly_users

Unnamed: 0,index,_id,user_id,name,review_count,yelping_since,useful,funny,cool,elite,...,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,gender_score,philly_reviews,philly_share_of_reviews,bucketed_philly_share_of_reviews
0,4,631ea1f7abab926ea88770d7,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...",...,1847,7054,3131,3131,1521,1946,0.995754,13,0.003000,less_than_or_equal_to_25_percent
1,6,631ea1f7abab926ea88770d9,AUi8MPWJ0mLkMfwbui27lg,John,109,2010-01-07 18:32:04,154,20,23,,...,1,6,3,3,0,0,0.995785,2,0.018349,less_than_or_equal_to_25_percent
2,12,631ea1f7abab926ea88770df,1McG5Rn_UDkmlkZOrsdptg,Teresa,7,2009-05-26 16:11:11,18,3,13,,...,1,0,2,2,0,0,0.002806,5,0.714286,more_than_50_up_to_75_percent
3,21,631ea1f7abab926ea88770e8,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1221,2005-03-14 20:26:35,14953,9940,11211,200620072008200920102011201220132014,...,1212,5696,2543,2543,815,323,0.003043,4,0.003276,less_than_or_equal_to_25_percent
4,22,631ea1f7abab926ea88770e9,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,...,232,844,467,467,239,180,0.986824,3,0.005128,less_than_or_equal_to_25_percent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245406,1987805,631ea2f2abab926ea8a5c5b0,Tr2yeddopDzMhSb7jbdNeA,Heather,1,2017-07-25 19:31:30,0,0,0,,...,0,0,0,0,0,0,0.002816,1,1.000000,more_than_75_percent
245407,1987831,631ea2f2abab926ea8a5c5ca,XTWARBzLbiJKQ1JeJB9f3g,Sakina,6,2017-11-24 03:17:11,1,0,0,,...,0,0,0,0,0,0,0.000000,1,0.166667,less_than_or_equal_to_25_percent
245408,1987832,631ea2f2abab926ea8a5c5cb,iZ0puydkFQlaSQAXbGtv1g,Dr. Sylvia,1,2014-07-08 19:54:19,2,0,0,,...,0,0,0,0,0,0,0.500000,1,1.000000,more_than_75_percent
245409,1987859,631ea2f2abab926ea8a5c5e6,K-BNvWUCWEGUuBirCU4mmg,Jodi,2,2015-03-23 17:29:22,0,0,0,,...,0,0,0,0,0,0,0.011541,1,0.500000,more_than_25_up_to_50_percent


In [7]:
# Bucket each user's 'gender_score' into five right-closed ranges:
#   (-inf, .2] -> less_than_or_equal_to_20_percent
#   (.2, .4]   -> more_than_20_up_to_40_percent
#   (.4, .6]   -> more_than_40_up_to_60_percent
#   (.6, .8]   -> more_than_60_up_to_80_percent
#   (.8, inf)  -> more_than_80_percent
# The result is a plain list of label strings, one per row of philly_users,
# in row order, so it can be assigned back as a column.
gender_bin_edges = [-np.inf, .2, .4, .6, .8, np.inf]
gender_bin_labels = [
    'less_than_or_equal_to_20_percent',
    'more_than_20_up_to_40_percent',
    'more_than_40_up_to_60_percent',
    'more_than_60_up_to_80_percent',
    'more_than_80_percent',
]

# pd.cut is vectorised and, unlike an if/elif chain without a fallback, keeps
# exactly one output slot per input row. include_lowest=True makes the first
# bin closed on the left so that -inf would still fall into the lowest bucket.
gender_buckets = pd.cut(
    philly_users['gender_score'],
    bins=gender_bin_edges,
    labels=gender_bin_labels,
    right=True,
    include_lowest=True,
)

# A missing (NaN) score cannot be placed in any bucket. Fail here with a clear
# message instead of silently producing a list that is shorter than the frame.
n_unbucketed = int(gender_buckets.isna().sum())
if n_unbucketed:
    raise ValueError(
        f"{n_unbucketed} row(s) of 'gender_score' are missing (NaN) "
        "and cannot be bucketed"
    )

bucketed_gender_scores = gender_buckets.tolist()

In [8]:
# Sanity check: list the distinct gender-score bucket labels together with
# the number of users that fell into each one.
np.unique(np.array(bucketed_gender_scores), return_counts=True)

(array(['less_than_or_equal_to_20_percent',
        'more_than_20_up_to_40_percent', 'more_than_40_up_to_60_percent',
        'more_than_60_up_to_80_percent', 'more_than_80_percent'],
       dtype='<U32'),
 array([118185,   4682,  29206,   2906,  90432], dtype=int64))

In [9]:
# Store the gender-score bucket labels as a column on the user table. This is a
# positional assignment, so the list must have one entry per row, in row order.
philly_users['bucketed_gender_scores'] = np.array(bucketed_gender_scores)
# Bare last expression so the notebook renders the updated frame.
philly_users

Unnamed: 0,index,_id,user_id,name,review_count,yelping_since,useful,funny,cool,elite,...,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,gender_score,philly_reviews,philly_share_of_reviews,bucketed_philly_share_of_reviews,bucketed_gender_scores
0,4,631ea1f7abab926ea88770d7,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...",...,7054,3131,3131,1521,1946,0.995754,13,0.003000,less_than_or_equal_to_25_percent,more_than_80_percent
1,6,631ea1f7abab926ea88770d9,AUi8MPWJ0mLkMfwbui27lg,John,109,2010-01-07 18:32:04,154,20,23,,...,6,3,3,0,0,0.995785,2,0.018349,less_than_or_equal_to_25_percent,more_than_80_percent
2,12,631ea1f7abab926ea88770df,1McG5Rn_UDkmlkZOrsdptg,Teresa,7,2009-05-26 16:11:11,18,3,13,,...,0,2,2,0,0,0.002806,5,0.714286,more_than_50_up_to_75_percent,less_than_or_equal_to_20_percent
3,21,631ea1f7abab926ea88770e8,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1221,2005-03-14 20:26:35,14953,9940,11211,200620072008200920102011201220132014,...,5696,2543,2543,815,323,0.003043,4,0.003276,less_than_or_equal_to_25_percent,less_than_or_equal_to_20_percent
4,22,631ea1f7abab926ea88770e9,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,...,844,467,467,239,180,0.986824,3,0.005128,less_than_or_equal_to_25_percent,more_than_80_percent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245406,1987805,631ea2f2abab926ea8a5c5b0,Tr2yeddopDzMhSb7jbdNeA,Heather,1,2017-07-25 19:31:30,0,0,0,,...,0,0,0,0,0,0.002816,1,1.000000,more_than_75_percent,less_than_or_equal_to_20_percent
245407,1987831,631ea2f2abab926ea8a5c5ca,XTWARBzLbiJKQ1JeJB9f3g,Sakina,6,2017-11-24 03:17:11,1,0,0,,...,0,0,0,0,0,0.000000,1,0.166667,less_than_or_equal_to_25_percent,less_than_or_equal_to_20_percent
245408,1987832,631ea2f2abab926ea8a5c5cb,iZ0puydkFQlaSQAXbGtv1g,Dr. Sylvia,1,2014-07-08 19:54:19,2,0,0,,...,0,0,0,0,0,0.500000,1,1.000000,more_than_75_percent,more_than_40_up_to_60_percent
245409,1987859,631ea2f2abab926ea8a5c5e6,K-BNvWUCWEGUuBirCU4mmg,Jodi,2,2015-03-23 17:29:22,0,0,0,,...,0,0,0,0,0,0.011541,1,0.500000,more_than_25_up_to_50_percent,less_than_or_equal_to_20_percent


In [10]:
# Persist the user table, including the bucket columns added above, as a
# zstd-compressed Feather file.
# NOTE: this writes to the same path the table was loaded from at the top of
# the notebook, so the input file is overwritten in place.
philly_users.to_feather('user_philly.feather', compression='zstd')