In [4]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 30

### Content Based Rec Sys Consideration

In [5]:
#read in both group and group_topic table
df_groups = pd.read_csv('groups.csv')
df_groups_topics = pd.read_csv('groups_topics.csv', encoding = 'latin-1')

In [6]:
#all the states the groups are in from the dataset
df_groups.state.unique()

array(['NY', 'IL', 'CA', 'NJ'], dtype=object)

In [8]:
#theres less organizer than groups, thus some user create multiple groups
len(df_groups['organizer.member_id'].unique()), len(df_groups['group_id'].unique())

(11834, 16330)

In [9]:
#There is a total of 961 unique topic tags for groups
df_groups_topics['topic_key'].unique().shape, df_groups_topics['topic_name'].unique().shape

((961,), (961,))

In [10]:
df_groups_topics.head()

Unnamed: 0,topic_id,topic_key,topic_name,group_id
0,83,sportsfans,Sports Fan,241031
1,83,sportsfans,Sports Fan,289172
2,83,sportsfans,Sports Fan,295444
3,83,sportsfans,Sports Fan,1040320
4,83,sportsfans,Sports Fan,1403055


In [11]:
#dropping unecessary columns
df_groups = df_groups.drop(['country','created', 'description', 'organizer.name', 'rating',
 'group_photo.base_url', 'group_photo.highres_link', 'group_photo.photo_id','group_photo.photo_link','group_photo.thumb_link', 'group_photo.type','join_mode','lat','link',
 'lon','members', 'organizer.photo.base_url','organizer.photo.highres_link', 'organizer.photo.photo_id','organizer.photo.photo_link',
 'organizer.photo.thumb_link','organizer.photo.type','city', 'city_id', 'category_id','category.name', 'timezone','urlname','utc_offset','visibility','who'], axis = 1)

In [12]:
#left outer join with groups and their topics because some groups do not have any tags
df_groups_topics = df_groups_topics.drop(['topic_name','topic_id'], axis = 1)
df_groups = df_groups.merge(df_groups_topics, how = 'left', on = 'group_id')

In [13]:
#number of unique groups, number of rows in dataframe
len(df_groups['group_id'].unique()), len(df_groups)

(16330, 35666)

In [14]:
df_groups.head()

Unnamed: 0,group_id,category.shortname,group_name,organizer.member_id,state,topic_key
0,6388,health-wellbeing,Alternative Health NYC,1513133,NY,wellness
1,6510,community-environment,Alternative Energy Meetup,3955940,NY,environmental-education
2,8458,pets-animals,NYC Animal Rights,1809940,NY,socialnetwork
3,8458,pets-animals,NYC Animal Rights,1809940,NY,social
4,8458,pets-animals,NYC Animal Rights,1809940,NY,pet-cats-and-kittens


In [15]:
#group_id must also be of category type or else rows will be missing after pivoting table
df_groups['group_id'] = df_groups['group_id'].astype('category')

#creating a binary representation utility matrix
temp = df_groups.drop(['group_name','organizer.member_id', 'state'], axis = 1)
temp['dummy'] = 1
temp = pd.pivot_table(temp, index= 'group_id', columns = ['topic_key'], values = 'dummy', fill_value = 0)

In [16]:
#to setup dataframe x that will allow me to map back index to the group title
x = df_groups.drop_duplicates(subset = 'group_id')
x = x.set_index('group_id')

#joining the category shortname with the pivot table and using get dummies for one hot encoding to have only one level
#of columns
temp = pd.get_dummies(temp.join(x['category.shortname']), prefix = '', prefix_sep = '')

In [48]:
temp.head()

Unnamed: 0_level_0,20s-30s-social,20s-social,3-d-paper-crafts,80s-dancing,90s-music,Euro-games,a-spiritual-path-to-higher-creativity,acting-and-writing,active-parents,acustical-music-music-jam-sessions-vinyl-records,adaptive-technology,adoptive-parents,adult-education,adult-paper-craft-classes,adultent,...,music,new-age-spirituality,outdoors-adventure,paranormal,parents-family,pets-animals,photography,religion-beliefs,sci-fi-fantasy,singles,socializing,sports-recreation,support,tech,writing
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
6388,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6510,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8458,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
10104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [17]:
#compute the similarity between groups
from sklearn.metrics.pairwise import cosine_similarity
sim_scores = cosine_similarity(temp)

In [18]:
#reset index to map back group_id recommendation to group names
x = x.reset_index(drop = True)

#created to retrieve idx from input group name of content base function
y = pd.Series(x.index, index = x['group_name'])

In [49]:
y.head()

group_name
Alternative Health NYC           0
Alternative Energy Meetup        1
NYC Animal Rights                2
The New York City Anime Group    3
NYC Pit Bull Group               4
dtype: int64

In [36]:
def content_base(group_name, scores = sim_scores):
    """This function takes in a group name as a string and returns a list of groups that are most similar to the input."""
    
    #grabbing the index of the group
    idx = y[group_name]
    
    #find the row of the score matrix using the index and using enumerate to keep track the index of the group similiar
    #to it
    row_score = list(enumerate(scores[idx]))
    
    #sort the row score by similarity score from highest to lowest
    sorted_score = sorted(row_score, key = lambda p: p[1], reverse = True)
    
    #get the indices of the sorted score
    #West NY is located in the state of NJ
    if x.loc[idx, 'state'] in ['NY', 'NJ']:
        indices = [i[0] for i in sorted_score if x.loc[i[0],'state'] in ['NY', 'NJ']]
        score = [i[1] for i in sorted_score if x.loc[i[0],'state'] in ['NY', 'NJ']]
    
    #condition on the fact that only if similar groups are from the same state
    else:    
        indices = [i[0] for i in sorted_score if x.loc[idx,'state'] == x.loc[i[0], 'state']]
        score = [i[1] for i in sorted_score if x.loc[idx,'state'] == x.loc[i[0], 'state']]
    
    #return the top five most similar groups
    return list(zip(list(x.iloc[indices[1:6],1]), score))

In [37]:
content_base('NYC Pit Bull Group')

[('Harlem Meer Mutts Club', 0.99999999999999978),
 ('Brooklyn disk dogs', 0.99999999999999978),
 ('NY & NJ(Waterfront) Miniature Pinscher Meetup', 0.99999999999999978),
 ('Me & My Best Friend Hiking Adventures', 0.99999999999999978),
 ('THE NEW YORK CITY MALTESE MEETUP GROUP', 0.99999999999999978)]

In [38]:
content_base('Alternative Health NYC')

[('The New York Chakra Healing Meetup', 0.99999999999999978),
 ('Living Energy~ Global & Local Wellness Community', 0.99999999999999978),
 ('The Herbalists Meetup Group', 0.99999999999999978),
 ('Say YES to Your Life, Manhattan!', 0.99999999999999978),
 ('Cosmos Tree', 0.99999999999999978)]

In [39]:
content_base('NYC Animal Rights')

[('The NYC Pug Meetup Group', 0.99999999999999989),
 ('New York City Basenji Meetup', 0.67082039324993692),
 ('NYC Shiba Inus and their Human Companions(NE Shibaholics).',
  0.63245553203367577),
 ('The New York City Cat Meetup Group', 0.63245553203367577),
 ('Westie Rescue NYC (WRNYC)', 0.63245553203367577)]

In [40]:
content_base('Pears and Bees')

[('New York Food Explorers', 0.99999999999999978),
 ('Third Culture Kids (TCKs) of NYC', 0.77151674981045959),
 ('NYC Food & Travel Enthusiasts', 0.67612340378281321),
 ('New York Wine Events Lovers', 0.67612340378281321),
 ('Eating in NYC', 0.66815310478106082)]

In [41]:
content_base('GRE Practice')

[('Greek Language & Culture Meetup (NYC)', 1.0),
 ("Massimo's Philosophy Caf", 1.0),
 ('NYC Debate', 1.0),
 ('Economist readers', 1.0),
 ('The NYC Physics and Astronomy Meetup', 1.0)]

In [42]:
content_base('Just Dance NYC')

[('The New York City Hiking Group', 1.0),
 ("NYC Sea Gypsies - New York City's Dive Club", 0.70710678118654746),
 ('Appalachian Mountain Club - Young Members', 0.70710678118654746),
 ('Veggie Hikers, Climbers and Skiers', 0.70710678118654746),
 ('NYC Fun Cyclers', 0.70710678118654746)]

In [43]:
content_base('Chicago Meetup')

[('Chicago Culture and Arts Club', 1.0),
 ("BYOT (Bring Your Own Theater) Chicago's best 24hr Theater",
  0.70710678118654746),
 ("Chicago Women's Creative Collective", 0.6804138174397717),
 ('1001 Things to See in Chicago Before You Die', 0.60302268915552726),
 ("Chicago The Artist's Way Meetup Group", 0.59628479399994383)]

In [44]:
content_base('Vocabulary workhop')

[('Professional English Workshops for Internationals', 1.0),
 ('Free English Classes in San Francisco', 0.86602540378443882),
 ('SF ESL Group', 0.86602540378443882),
 ('Startup Professionals Meetup for Non-Native English Speakers', 0.75),
 ('San Francisco Pronunciation for ESL Professionals Meetup', 0.75)]

In [47]:
content_base('San Francisco Brazilian Jiu-Jitsu Meetup')

[('Martial arts sparring strategy and drilling', 1.0000000000000002),
 ('Street Smart Safety & Self Defense For Women', 0.86602540378443882),
 ('Warrior Brothers - San Francisco', 0.81649658092772603),
 ('UFC Watch Club', 0.81649658092772603),
 ('San Francisco Self-Defense Meetup', 0.81649658092772603)]