In [1]:
import pandas as pd
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the raw events dataset from JSON into a DataFrame.
events = pd.read_json(path_or_buf="../dataset/events.json")

In [3]:
# Peek at the first record to see which columns are available.
events.head(n=1)

Unnamed: 0,event_id,title,description,tags,type,location,organizationName,isPaid,price
0,68e0d68abee8a0e226e6fe96,Versatile homogeneous interface,Option their account pattern center challenge....,"[web development, AI]",Forum,Jaclynborough,Lopez and Sons,False,0.0


In [4]:
# Concatenate the four text columns into one '_'-separated string per event.
# The '_' separator is what the later split step uses to recover the parts.
events['Keywords'] = (
    events['title'] + '_'
    + events['location'] + '_'
    + events['organizationName'] + '_'
    + events['type']
)

In [5]:
# Remove the source columns that were folded into 'Keywords', together with
# the 'isPaid' and 'price' columns, which are not used further on.
events = events.drop(
    columns=['title', 'location', 'organizationName', 'type', 'isPaid', 'price']
)

In [6]:
# Break each '_'-joined string back into its parts so that every row of
# 'Keywords' holds a list of strings.
events['Keywords'] = events['Keywords'].apply(
    lambda joined: joined.split('_')
)

In [7]:
# Append each event's tags to its keyword list (element-wise '+' on the two
# columns), store the result as 'keywords', then discard both inputs.
events = (
    events
    .assign(keywords=events['Keywords'] + events['tags'])
    .drop(columns=['Keywords', 'tags'])
)

In [8]:
# Nothing to do here: the isPaid column was already dropped together with the other raw columns above, so this cell is intentionally left empty.

In [9]:
# Check the frame after merging: only event_id, description and keywords remain.
events.head(n=5)

Unnamed: 0,event_id,description,keywords
0,68e0d68abee8a0e226e6fe96,Option their account pattern center challenge....,"[Versatile homogeneous interface, Jaclynboroug..."
1,68e0d68abee8a0e226e6fe97,Evening whom much could. Send rock important i...,"[Intuitive empowering orchestration, New Maria..."
2,68e0d68abee8a0e226e6fe98,Maintain however soldier happen answer already...,"[Reduced content-based product, Port Alexis, S..."
3,68e0d68abee8a0e226e6fe99,Inside stop bit student many identify internat...,[Object-based intermediate artificial intellig...
4,68e0d68abee8a0e226e6fe9a,Available ground treat film maybe himself pres...,"[Grass-roots global hub, New Chrisport, Fletch..."


In [10]:
# Tokenise the free-text description on single spaces so that it becomes a
# list of words, ready to be concatenated with the keyword list.
events['description'] = events['description'].apply(
    lambda text: text.split(' ')
)

In [11]:
# Concatenate the keyword list and the description word list of every event
# into a single 'key' column, then discard the two input columns.
events = (
    events
    .assign(key=events['keywords'] + events['description'])
    .drop(columns=['keywords', 'description'])
)

In [12]:
# Display the frame: at this point 'key' holds one list of tokens per event.
events

Unnamed: 0,event_id,key
0,68e0d68abee8a0e226e6fe96,"[Versatile homogeneous interface, Jaclynboroug..."
1,68e0d68abee8a0e226e6fe97,"[Intuitive empowering orchestration, New Maria..."
2,68e0d68abee8a0e226e6fe98,"[Reduced content-based product, Port Alexis, S..."
3,68e0d68abee8a0e226e6fe99,[Object-based intermediate artificial intellig...
4,68e0d68abee8a0e226e6fe9a,"[Grass-roots global hub, New Chrisport, Fletch..."
...,...,...
415,68e0d79a3048252621b7d95c,"[Extended full-range open system, New Carrie, ..."
416,68e0d79a3048252621b7d95d,"[Polarized discrete benchmark, Lake Stacyport,..."
417,68e0d79a3048252621b7d95e,"[Proactive reciprocal challenge, Walkerview, S..."
418,68e0d79a3048252621b7d95f,"[Grass-roots secondary forecast, Andrewberg, M..."


In [13]:
# The vectorizer expects one string per document, so join each token list
# with spaces; any value that is not a list is simply converted with str().
events['key'] = events['key'].apply(
    lambda tokens: str(tokens) if not isinstance(tokens, list) else " ".join(tokens)
)

In [14]:
# Display the frame again: 'key' is now a single space-joined string per event.
events

Unnamed: 0,event_id,key
0,68e0d68abee8a0e226e6fe96,Versatile homogeneous interface Jaclynborough ...
1,68e0d68abee8a0e226e6fe97,Intuitive empowering orchestration New Mariabe...
2,68e0d68abee8a0e226e6fe98,Reduced content-based product Port Alexis Ston...
3,68e0d68abee8a0e226e6fe99,Object-based intermediate artificial intellige...
4,68e0d68abee8a0e226e6fe9a,Grass-roots global hub New Chrisport Fletcher-...
...,...,...
415,68e0d79a3048252621b7d95c,Extended full-range open system New Carrie Whi...
416,68e0d79a3048252621b7d95d,Polarized discrete benchmark Lake Stacyport Mo...
417,68e0d79a3048252621b7d95e,Proactive reciprocal challenge Walkerview Scot...
418,68e0d79a3048252621b7d95f,Grass-roots secondary forecast Andrewberg More...


In [15]:
# Normalise the text in one pass: lowercase everything, then delete every
# character listed in string.punctuation. Characters are deleted, not replaced
# by a space, so hyphenated words such as "content-based" become "contentbased".
events['key'] = (
    events['key']
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
)

In [16]:
# Bag-of-words vectorizer: English stop words are ignored and the vocabulary
# is capped at 5000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [17]:
# Learn the vocabulary from the 'key' strings and build the term-count matrix
# (one row per event), converted from sparse to a dense array.
event_vector = cv.fit_transform(raw_documents=events['key']).toarray()

In [18]:
# Display the dense term-count matrix (one row per event).
event_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# Pairwise cosine similarity between all event vectors; entry [i, j] is the
# similarity between the events in rows i and j of event_vector.
event_similarity = cosine_similarity(X=event_vector)

In [20]:
# Pair every similarity score of the first event with its row position.
[(position, score) for position, score in enumerate(event_similarity[0])]

[(0, np.float64(1.0)),
 (1, np.float64(0.0)),
 (2, np.float64(0.04761904761904762)),
 (3, np.float64(0.03688555567816587)),
 (4, np.float64(0.0)),
 (5, np.float64(0.07715167498104596)),
 (6, np.float64(0.09100315103865803)),
 (7, np.float64(0.18200630207731605)),
 (8, np.float64(0.0)),
 (9, np.float64(0.0)),
 (10, np.float64(0.043643578047198484)),
 (11, np.float64(0.15877683720748895)),
 (12, np.float64(0.09304842103984708)),
 (13, np.float64(0.05006261743217588)),
 (14, np.float64(0.0)),
 (15, np.float64(0.09523809523809525)),
 (16, np.float64(0.04550157551932901)),
 (17, np.float64(0.12598815766974242)),
 (18, np.float64(0.0)),
 (19, np.float64(0.0)),
 (20, np.float64(0.08728715609439697)),
 (21, np.float64(0.05006261743217588)),
 (22, np.float64(0.0545544725589981)),
 (23, np.float64(0.0)),
 (24, np.float64(0.03919309008348103)),
 (25, np.float64(0.0)),
 (26, np.float64(0.1283881477532739)),
 (27, np.float64(0.10286889997472795)),
 (28, np.float64(0.0)),
 (29, np.float64(0.14638501

In [21]:
def recommend_event(event_id, top_n=5):
    """Print the IDs of the events most similar to the given event.

    The event is looked up in the global ``events`` frame; its row of the
    global ``event_similarity`` matrix is then ranked and the ``event_id`` of
    the best-scoring *other* events is printed, best match first.

    Args:
        event_id: Value to match against ``events['event_id']``. If several
            rows match, the first one is used.
        top_n: Maximum number of recommendations to print (default 5).

    Returns:
        None. The recommendations, or a "not found" message, are written to
        stdout.
    """
    # Work with row *positions* rather than index labels: event_similarity is
    # indexed positionally and the output below uses events.iloc, so a label
    # lookup would point at the wrong row whenever the index is not 0..n-1.
    positions = [
        position
        for position, candidate in enumerate(events['event_id'])
        if candidate == event_id
    ]
    if not positions:
        print(f"Event ID {event_id} not found.")
        return
    index = positions[0]
    distances = event_similarity[index]

    # Exclude the query event explicitly instead of skipping the first sorted
    # entry: with tied scores (e.g. duplicate events) or an all-zero vector the
    # query event is not guaranteed to be ranked first.
    ranked = sorted(
        (pair for pair in enumerate(distances) if pair[0] != index),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for position, _score in ranked[:max(top_n, 0)]:
        print(events.iloc[position].event_id)
    return

In [22]:
# Example: print the recommendations for the first event in the dataset.
recommend_event(event_id='68e0d68abee8a0e226e6fe96')

68e0d7821f7dc4e22f39d565
68e0d79a3048252621b7d8c3
68e0d79a3048252621b7d90d
68e0d7821f7dc4e22f39d55a
68e0d79a3048252621b7d906


In [23]:
import os
import pickle

# Make sure the output directory exists before writing any artefact into it.
os.makedirs('../model', exist_ok=True)

# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes; a bare open(...) passed
# straight to pickle.dump leaves the handle open until it is garbage-collected.
with open('../model/events_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(event_similarity, similarity_file)

In [24]:
# Persist the processed events frame (event_id + key). The context manager
# closes the file handle deterministically instead of leaving it open.
with open('../model/events.pkl', 'wb') as events_file:
    pickle.dump(events, events_file)

In [25]:
# Persist the dense term-count matrix. The context manager closes the file
# handle deterministically instead of leaving it open.
with open('../model/events_vector.pkl', 'wb') as vector_file:
    pickle.dump(event_vector, vector_file)