In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from collections import Counter
pd.options.display.max_columns = 30

In [5]:
#load the raw member and group tables from the working directory
#members.csv is decoded as latin-1; groups.csv uses the pandas default encoding
df_members = pd.read_csv('members.csv', encoding = 'latin-1')
df_groups = pd.read_csv('groups.csv')

In [6]:
len(df_members)

5893886

In [7]:
df_members.city.unique()

array(['New York', 'San Francisco', 'Chicago', 'South San Francisco',
       'West New York', 'Chicago Heights', 'West Chicago', 'Chicago Ridge',
       'East Chicago', 'North Chicago', 'san francisco', 'New York Mills',
       'Chicago Park'], dtype=object)

In [8]:
list(df_members.columns)

['member_id',
 'bio',
 'city',
 'country',
 'hometown',
 'joined',
 'lat',
 'link',
 'lon',
 'member_name',
 'state',
 'member_status',
 'visited',
 'group_id']

In [9]:
#keep only the member columns used below (member_id, city, group_id)
#selecting the kept columns instead of dropping the rest lets this cell be re-run without a KeyError
df_members = df_members[['member_id', 'city', 'group_id']]

In [10]:
list(df_groups.columns)

['group_id',
 'category_id',
 'category.name',
 'category.shortname',
 'city_id',
 'city',
 'country',
 'created',
 'description',
 'group_photo.base_url',
 'group_photo.highres_link',
 'group_photo.photo_id',
 'group_photo.photo_link',
 'group_photo.thumb_link',
 'group_photo.type',
 'join_mode',
 'lat',
 'link',
 'lon',
 'members',
 'group_name',
 'organizer.member_id',
 'organizer.name',
 'organizer.photo.base_url',
 'organizer.photo.highres_link',
 'organizer.photo.photo_id',
 'organizer.photo.photo_link',
 'organizer.photo.thumb_link',
 'organizer.photo.type',
 'rating',
 'state',
 'timezone',
 'urlname',
 'utc_offset',
 'visibility',
 'who']

In [11]:
#keep only the group columns used below (group_id, city, group_name)
#selecting the kept columns instead of dropping the rest lets this cell be re-run without a KeyError
df_groups = df_groups[['group_id', 'city', 'group_name']]

In [12]:
def get_city(city, members = None, groups = None):
    """Take in the acronym of a city and return the memberships of that city.

    Parameters
    ----------
    city : str
        One of 'SF', 'NYC' or 'CHI'.
    members : pandas.DataFrame, optional
        Member table with 'city' and 'group_id' columns.
        Defaults to the notebook-level df_members.
    groups : pandas.DataFrame, optional
        Group table with 'city' and 'group_id' columns.
        Defaults to the notebook-level df_groups.

    Returns
    -------
    pandas.DataFrame
        Merge on 'group_id' of the member rows and the group rows whose 'city'
        is one of the spellings listed for the requested acronym. Both inputs
        carry a 'city' column, so the result exposes them as 'city_x' (members)
        and 'city_y' (groups).

    Raises
    ------
    ValueError
        If city is not one of the supported acronyms.
    """

    #city spellings grouped under each supported acronym
    city_names = {
        'SF': ['San Francisco', 'san francisco', 'South San Francisco'],
        'NYC': ['New York', 'West New York', 'New York Mills'],
        'CHI': ['Chicago','Chicago Heights','West Chicago','Chicago Ridge','East Chicago','North Chicago','Chicago Park'],
    }

    #defensive coding
    if not isinstance(city, str) or city not in city_names:
        raise ValueError('Invalid City')

    #fall back to the tables loaded at the top of the notebook
    if members is None:
        members = df_members
    if groups is None:
        groups = df_groups

    #filter only the requested city; the other two are never returned, so
    #scanning the full tables for them would be wasted work
    names = city_names[city]
    city_members = members[members.city.isin(names)]
    city_groups = groups[groups.city.isin(names)]

    return city_members.merge(city_groups, on = 'group_id')

In [15]:
df_map = get_city('CHI')
df_map.shape

(1191548, 5)

In [16]:
df_map.head()

Unnamed: 0,member_id,city_x,group_id,city_y,group_name
0,819,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
1,4295,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
2,10600,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
3,152870,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
4,300185,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"


In [17]:
#member x group utility matrix: 1 where the member belongs to the group, 0 otherwise
temp = (
    df_map
    .drop(columns = ['group_id','city_x','city_y'])
    .assign(dummy = 1)
    .pivot_table(index = 'member_id', columns = 'group_name', values = 'dummy', fill_value = 0)
)

In [18]:
#utility matrix
temp.head()

group_name,!Happy Hour Friends and Fun! 20ish to 40ish,#Idea2MVP Chicago,#Resist: Chicago,&UX Chicago,'Infrastructure as Code' in Chicago,(23-35) Uptown/Edgewater Social Board Game Group,(CHIBUG) Chicago Bluebeam User Group,(Girl) Friends of Chicago,*Chicagoans* help Haiti,100% Real Cheese,1800-Photographers by 1800-STUDIOS,1871 Hackers,20 & 30 somethings: Chicago Gets Fit & Free,20 and 30 Somethings Chicago,20's in Algorithmic Trading Chicago,...,beFAB LAB,chiDUXX: Chicago Women of Design & UX,chiRUG | Chicago Revit User Group,djembe drumming group,eXtreme Programming (XP) Chicago,edX Chicago Community,girls who attract attention dancing in Chicago,"gluten free, vegan & french style in Chicago",hpiChicago Fitness & Wellness,i like Red and White wine,iCoach360 formerly Coaching and Vino,logan square movie night meetup/discussion,"responsiveX University - Cloud, Azure, Mobile, Web, Client",the Chicago ITALIAN conversation group by ITALIAMO,weconnect Chicago Startups and New Tech
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3705,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4295,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
#sanity check: the utility matrix has one row per unique member_id in df_map
temp.shape, df_map['member_id'].unique().shape

((234725, 2415), (234725,))

In [20]:
#index df_map by member_id so a member's group names can be looked up with .loc
#note: member_id is no longer a column after this cell, so re-running it raises a KeyError
df_map = df_map.set_index('member_id')
df_map.head()

Unnamed: 0_level_0,city_x,group_id,city_y,group_name
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
819,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
4295,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
10600,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
152870,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
300185,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"


In [21]:
df_map.shape

(1191548, 4)

### Nearest Neighbors Collaborative Filtering

In [22]:
def collab_filtering(member_id, metric, utility, n_neighbors = 10):
    """Recommend groups to a member based on the groups of their nearest neighbors.

    Parameters
    ----------
    member_id : label
        Index label of the input user in utility.
    metric : str
        Distance metric passed to sklearn's NearestNeighbors (e.g. 'cosine').
    utility : pandas.DataFrame
        Utility matrix indexed by member_id with one column per group; a value
        greater than 0 means the member belongs to that group.
    n_neighbors : int, optional
        Number of similar users to consult (default 10). Capped by the number
        of other rows available in utility.

    Returns
    -------
    list of tuple
        (group_name, count) pairs for every group the input user is not in,
        where count is the number of consulted neighbors in that group, sorted
        by count in descending order.

    Raises
    ------
    KeyError
        If member_id is not in the index of utility.

    Group memberships are read from utility itself, so the result depends only
    on the arguments and not on any notebook-level dataframe.
    """

    #a one-row frame (rather than a bare array) keeps the column names the model is fitted with
    query = utility.loc[[member_id]]

    #the input user is one of the fitted rows, so ask for one extra neighbor,
    #but never for more rows than the utility matrix contains
    k = min(n_neighbors + 1, len(utility))
    knn = NearestNeighbors(n_neighbors = k, metric = metric, algorithm = 'brute')
    knn.fit(utility)

    #retrieving the row positions of the nearest neighbors of the input user
    positions = knn.kneighbors(query, return_distance = False).ravel()

    #drop the input user by label instead of assuming they are the first hit
    neighbor_positions = [p for p in positions if utility.index[p] != member_id][:n_neighbors]

    #groups the input user is already in
    user_groups = set(utility.columns[query.values[0] > 0])

    #count, for every group the input user is not in, how many neighbors belong to it
    recommend = Counter()
    for position in neighbor_positions:
        neighbor_row = utility.iloc[position].values
        recommend.update(group for group in utility.columns[neighbor_row > 0]
                         if group not in user_groups)

    #most_common() returns (group, count) tuples ordered by count, highest first
    return recommend.most_common()

In [23]:
collab_filtering(819, 'cosine', temp)

[('The Chicago Emerging Technology Meetup', 1),
 ('Chicago .NET Developers (downtown)', 1),
 ('Built in Chicago', 1),
 ('Chicago Functional Programming', 1),
 ('Sitecore User Group - Chicago', 1)]

In [24]:
df_map.loc[819]

Unnamed: 0_level_0,city_x,group_id,city_y,group_name
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
819,Chicago,514628,Chicago,"Chicago Droid: Machine Learning, IoT"
819,Chicago,1294245,Chicago,Chicago JavaScript Meetup Group
819,Chicago,1455470,Chicago,Chicago C/C++ Users Group
819,Chicago,1576866,Chicago,Augmented Reality Chicago
819,Chicago,1681402,Chicago,Chicago Machine Learning Study Group
819,Chicago,1780576,Chicago,Chicago Node.js
819,Chicago,2415292,Chicago,Chicago HTML5
819,Chicago,3314362,Chicago,Chicago Software Craftsmanship
819,Chicago,5531942,Chicago,Chicago Pythonistas
819,Chicago,5912332,Chicago,AngularJS Chicago


In [25]:
df_map = get_city('SF')

In [26]:
#member x group utility matrix: 1 where the member belongs to the group, 0 otherwise
temp1 = (
    df_map
    .drop(columns = ['group_id','city_x','city_y'])
    .assign(dummy = 1)
    .pivot_table(index = 'member_id', columns = 'group_name', values = 'dummy', fill_value = 0)
)

In [27]:
#index df_map by member_id so a member's group names can be looked up with .loc
#note: member_id is no longer a column after this cell, so re-running it raises a KeyError
df_map = df_map.set_index('member_id')
df_map.head()

Unnamed: 0_level_0,city_x,group_id,city_y,group_name
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65,San Francisco,2701562,San Francisco,GoSF
29009,San Francisco,2701562,San Francisco,GoSF
67424,San Francisco,2701562,San Francisco,GoSF
94707,San Francisco,2701562,San Francisco,GoSF
95826,San Francisco,2701562,San Francisco,GoSF


In [28]:
collab_filtering(65, 'cosine', temp1)

[('Wercker SF', 1),
 ('Bay Area Mesos User Group', 1),
 ('Bot Builder MeetUp', 1),
 ('The San Francisco Redis Meetup Group', 1),
 ('Docker San Francisco', 1),
 ('San Francisco Perl', 1),
 ('SF Data Engineering', 1),
 ('DART', 1)]

In [29]:
df_map.loc[65]

Unnamed: 0_level_0,city_x,group_id,city_y,group_name
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
65,San Francisco,2701562,San Francisco,GoSF
65,San Francisco,14177122,San Francisco,Sourcegraph Tech Talks
65,San Francisco,14638342,San Francisco,San Francisco CoreOS Meetup
