<a href="https://colab.research.google.com/github/argalusmp/CH2-PS_Recommendation-System/blob/V/Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Collab](https://colab.research.google.com/drive/1d9l2-NXW5traKPQ0j-l4eZ2vSI0mEVvV)

# **Build Recommendation System with Content-Based Filtering**

# Packages

Import Packages



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate

# Load Event dataset
event_data = pd.read_csv("./events_dataset.csv")

event_df = pd.DataFrame(event_data)

# Load User dataset
user_data = pd.read_csv("./users_dataset.csv")

user_df = pd.DataFrame(user_data)

# Preprocess data
def preprocess_event_data(event_df):
     # One-hot encode categorical variables
    event_df = pd.get_dummies(event_df, columns=['Category', 'Location'])

    # Split Qualifications into separate skills
    event_df['Skills'] = event_df['Qualifications'].apply(lambda x: ' '.join(x.lower().split(',')) if pd.notnull(x) else '')
    return event_df[list(event_df.columns[3:])]

def preprocess_user_data(user_df):
    # Convert categorical features to numerical representation

    # Split Skills into separate skills
    user_df['Skills'] = user_df['Skills'].apply(lambda x: ' '.join(x.lower().split(',')) if pd.notnull(x) else '')

    return user_df[['Volunteer Name',  'Gender', 'Skills', 'Location', 'Type of Organization']]

event_df = preprocess_event_data(event_df)
user_df = preprocess_user_data(user_df)

# Create a mapping for skills
mlb = MultiLabelBinarizer()
mlb.fit(event_df['Skills'].explode().unique())

# Transform event and user skills into binary vectors
event_skills = pd.DataFrame(mlb.transform(event_df['Skills']), columns=mlb.classes_)
user_skills = pd.DataFrame(mlb.transform(user_df['Skills']), columns=mlb.classes_)

# Combine the binary vectors with the original dataframes
event_df = pd.concat([event_df, event_skills], axis=1)
user_df = pd.concat([user_df, user_skills], axis=1)

# Drop the original 'Skills' column
event_df.drop('Skills', axis=1, inplace=True)
user_df.drop('Skills', axis=1, inplace=True)





In [None]:
# Build the recommendation model
def build_model():
    # Input layers
    event_input = Input(shape=(event_df.shape[1]-1,), name='event_input')
    user_input = Input(shape=(user_df.shape[1]-1,), name='user_input')

    # Embedding layers for event and user
    event_embedding = Embedding(input_dim=2, output_dim=5, input_length=event_df.shape[1]-1)(event_input)
    user_embedding = Embedding(input_dim=2, output_dim=5, input_length=user_df.shape[1]-1)(user_input)

    # Flatten the embeddings
    event_flatten = Flatten()(event_embedding)
    user_flatten = Flatten()(user_embedding)

    # Concatenate the flattened embeddings
    concat = Concatenate()([event_flatten, user_flatten])

    # Dense layers for the recommendation model
    dense1 = Dense(128, activation='relu')(concat)
    dense2 = Dense(64, activation='relu')(dense1)
    output = Dense(1, activation='sigmoid')(dense2)

    # Create and compile the model
    model = Model(inputs=[event_input, user_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model




In [None]:
# Split the data into training and testing sets
X_event_train, X_event_test, X_user_train, X_user_test, y_train, y_test = train_test_split(
    event_df.drop('Event_id', axis=1).values,
    user_df.drop('Volunteer Name', axis=1).values,
    np.ones(event_df.shape[0]), test_size=0.2, random_state=42
)

# Convert data to NumPy arrays with appropriate data types
X_event_train = np.asarray(X_event_train).astype(np.float32)
X_event_test = np.asarray(X_event_test).astype(np.float32)
X_user_train = np.asarray(X_user_train).astype(np.float32)
X_user_test = np.asarray(X_user_test).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

# Build and train the model
model = build_model()
model.fit(x=[X_event_train, X_user_train], y=y_train, epochs=10, batch_size=32, validation_data=([X_event_test, X_user_test], y_test))


# Make predictions
predictions = model.predict([event_df.drop('Event_id', axis=1).values, user_df.drop('Volunteer Name', axis=1).values])

# Print the predictions
print(predictions)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

# Import Dataset


In [None]:
user_dataset = pd.read_csv("./users_dataset.csv")
event_dataset= pd.read_csv("./events_dataset.csv")

In [None]:
df_user = pd.DataFrame(user_dataset)
df_event = pd.DataFrame(event_dataset)

In [None]:
print(len(df_user))
print(len(df_event))

2786
2786


In [None]:
skills_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
skills_encoded = skills_encoder.fit_transform(df_user[['Skills']])

In [None]:
location_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
location_encoded = location_encoder.fit_transform(df_event[['Location']])


In [None]:
# One-hot encoding untuk kategori acara
category_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
category_encoded = category_encoder.fit_transform(df_event[['Category']])

In [None]:
print(category_encoded)

In [None]:
print(event_matrix)

In [None]:
# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(df_volunteer[['Skills', 'Location', 'Age']], df_volunteer['Target_Label'], test_size=0.2, random_state=42)

# Preprocessing Data


In [None]:
all_data = pd.merge(user_dataset, event_dataset, how='cross')

In [None]:
# Pisahkan data menjadi train dan test
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

In [None]:
# Preprocessing user data
user_dataset['Skills'] = user_dataset['Skills'].str.lower()
user_dataset['Availability'] = user_dataset['Availability'].str.lower()
user_dataset['Location'] = user_dataset['Location'].str.lower()
user_dataset['Type of Organization'] = user_dataset['Type of Organization'].str.lower()

## Preprocessing event data
event_dataset['Kualifikasi'] = event_dataset['Kualifikasi'].str.lower()
event_dataset['Domisili'] = event_dataset['Domisili'].str.lower()
event_dataset['Kategori'] = event_dataset['Kategori'].str.lower()
event_dataset['Age'] = event_dataset['Age'].str.lower()


## Memisahkan user skill menjadi beberapa kolom terpisah untuk one hot
user_skills_split = user_dataset['Skills'].str.split(', ', expand=True)

## Create one-hot encoding for user skills
user_skills_one_hot = pd.get_dummies(user_skills_split, prefix='Skill')

In [None]:
## Menggabungkan dataset
merged_data = pd.merge(user_dataset, event_dataset, how='cross')
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

## Create one-hot encoding for user and event data
user_one_hot = pd.get_dummies(merged_data[['Age_x', 'Availability', 'Location', 'Type of Organization']], prefix='User')
event_one_hot = pd.get_dummies(merged_data[[ 'Kategori', 'Age_y','Domisili']], prefix='Event')

In [None]:
## Check Display
pd.set_option('display.max_columns', None)

event_one_hot
#user_one_hot

In [None]:
## Merge Onehot encoding with data user
user_data_encode = pd.concat([user_one_hot, user_skills_one_hot], axis=1)
user_data_encode

In [None]:
## Memisahkan event kualifikasi menjadi beberapa kolom terpisah untuk one hot
event_kualifikasi_split = event_dataset['Kualifikasi'].str.split(', ', expand=True)

## Create one-hot encoding for kualifikasi
event_kualifikasi_one_hot = pd.get_dummies(event_kualifikasi_split, prefix='Kualifikasi')

## Merge Kualifikasi with dataset event after one-hot kualifikasi
event_data_encode = pd.concat([event_one_hot, event_kualifikasi_one_hot], axis=1)


In [None]:
## Check event encode display
event_data_encode

In [None]:
print(user_data_encode.isnull().sum())
print(event_data_encode.isnull().sum())


In [None]:
## For set Y to target
#target_columns = ['Kualifikasi_kualifikasi1', 'Kualifikasi_kualifikasi2', ...]
#X_train, X_test, y_train, y_test = train_test_split(user_data_encode, event_data_encode[target_columns], test_size=0.2, random_state=42)



## Train-test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Nyoba proses nilai umur



```
###Pemrosesan Data Umur
def process_age(value):
    if '-' in str(value):  # Jika nilai adalah rentang umur
        age_range = value.split('-')
        return (int(age_range[0]) + int(age_range[1])) / 2
    elif isinstance(value, int):  # Jika nilai adalah umur tunggal dan sudah integer
        return value
    else:
        # Penanganan lainnya
        return None


event_dataset['Age'] = event_dataset['Age'].apply(process_age)

```



# Try and Try and Try

# Pusing NaN

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# Baca dataset
user_dataset = pd.read_csv("./users_dataset.csv")
event_dataset = pd.read_csv("./events_dataset.csv")

# Preprocessing user data
user_dataset['Skills'] = user_dataset['Skills'].str.lower()
user_dataset['Location'] = user_dataset['Location'].str.lower()
user_dataset['Type of Organization'] = user_dataset['Type of Organization'].str.lower()

# Preprocessing event data
event_dataset['Qualifications'] = event_dataset['Qualifications'].str.lower()
event_dataset['Location'] = event_dataset['Location'].str.lower()
event_dataset['Category'] = event_dataset['Category'].str.lower()

# Gabungkan data
full_data = pd.merge(user_dataset, event_dataset, how='cross')

# Pisahkan data menjadi train dan test
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)

# One-hot encoding dan penggabungan data
user_skills_split = user_dataset['Skills'].str.split(', ', expand=True)
user_skills_one_hot = pd.get_dummies(user_skills_split, prefix='Skill')

user_one_hot_train = pd.get_dummies(train_data[[ 'Skills', 'Location', 'Type of Organization']], prefix='User')
event_one_hot_train = pd.get_dummies(train_data[['Category', 'Location', 'Qualifications']], prefix='Event')
event_kualifikasi_one_hot_train = pd.get_dummies(train_data['Qualifications'].str.split(', ', expand=True), prefix='Qualifications')

train_data_encode = pd.concat([user_one_hot_train, user_skills_one_hot, event_one_hot_train, event_kualifikasi_one_hot_train], axis=1)

# Normalisasi data
scaler = StandardScaler()
train_data_normalize = scaler.fit_transform(train_data_encode)

In [None]:
train_data_normalize = np.nan_to_num(train_data_normalize, nan=np.nanmean(train_data_normalize, axis=0))


In [None]:
print(np.isnan(train_data_normalize).any())


False


In [None]:
# Hitung similarity matrix menggunakan cosine similarity
similarity_matrix = cosine_similarity(train_data_normalize, train_data_normalize)

In [None]:
print(f"Cosine Similarity: {similarity_matrix[0][0]}")

# This one using vectorize


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Load the datasets
user_df = pd.read_csv('users_dataset.csv')
event_df = pd.read_csv('events_dataset.csv')

# Preprocessing
user_df['Skills'] = user_df['Skills'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')
event_df['Qualifications'] = event_df['Qualifications'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')

# Vectorize the skills and qualifications
vectorizer = TfidfVectorizer(stop_words='english')
user_matrix = vectorizer.fit_transform(user_df['Skills'])
event_matrix = vectorizer.transform(event_df['Qualifications'])

# Compute the cosine similarity
cosine_sim = linear_kernel(user_matrix, event_matrix)

# Function to get recommendations
def get_recommendations(user_index, cosine_sim=cosine_sim):
    # Get the pairwsie similarity scores of all events for that user
    sim_scores = list(enumerate(cosine_sim[user_index]))

    # Sort the events based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar events
    sim_scores = sim_scores[0:10]

    # Get the event indices
    event_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar events
    return event_df['Event_id'].iloc[event_indices]

# Test the system relation user 1 (index 0) to event
print(get_recommendations(0))


2592    E_2593
2683    E_2684
2723    E_2724
2735    E_2736
2744    E_2745
2531    E_2532
2659    E_2660
2668    E_2669
2776    E_2777
2399    E_2400
Name: Event_id, dtype: object


# This one using Tokenizer NLP


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the datasets
user_df = pd.read_csv('users_dataset.csv')
event_df = pd.read_csv('events_dataset.csv')

# Preprocessing
user_df['Skills'] = user_df['Skills'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')
event_df['Qualifications'] = event_df['Qualifications'].apply(lambda x: ' '.join(x.lower().split(', ')) if pd.notnull(x) else '')

# Tokenize the skills and qualifications
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([user_df['Skills'], event_df['Qualifications']]))

user_sequences = tokenizer.texts_to_sequences(user_df['Skills'])
event_sequences = tokenizer.texts_to_sequences(event_df['Qualifications'])

# Pad the sequences
user_data = pad_sequences(user_sequences)
event_data = pad_sequences(event_sequences)

# Define the model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50
num_filters = 10
kernel_size = 3

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=user_data.shape[1]),
    Conv1D(num_filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(user_data, np.ones(len(user_data)), epochs=5, verbose=1)

# Compute recommendations
user_embeddings = model.get_layer(index=0).get_weights()[0]
event_embeddings = model.get_layer(index=0).get_weights()[0]

def recommend_events(user_id, num_recommendations=5):
    user_embedding = user_embeddings[user_id]
    similarities = np.dot(event_embeddings, user_embedding)
    event_ids = np.argsort(-similarities)[:num_recommendations]
    return event_df['Event_id'].iloc[event_ids]

# Test the recommendation system
# print(recommend_events(0))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Test the recommendation system
print(recommend_events(1))

1     E_2,Youth Development,Jakarta,>18,"Mentoring, ...
9     E_10,Youth Development,"Maluku, Banda Neira",1...
29                                                 E_30
73                                                 E_74
49    E_50,Youth Development,Bandung,>20,"Team build...
Name: Event_id, dtype: object


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras import layers

# Load your datasets
events = pd.read_csv('events_dataset.csv')
users = pd.read_csv('users_dataset.csv')

# Preprocessing
events['Qualifications'] = events['Qualifications'].apply(lambda x: ' '.join(x.lower().split(',')) if pd.notnull(x) else '')
users['Skills'] = users['Skills'].apply(lambda x: ' '.join(x.lower().split(','))if pd.notnull(x) else '')

# Vectorize the qualifications and skills
vectorizer = TfidfVectorizer(stop_words='english')
events_matrix = vectorizer.fit_transform(events['Qualifications'])
users_matrix = vectorizer.transform(users['Skills'])

# Compute the cosine similarity
cosine_sim = cosine_similarity(users_matrix, events_matrix)

# Convert the cosine similarity matrix to a DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim, columns=events['Event_id'], index=users['Volunteer Name'])

# Build the model
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=[len(cosine_sim_df.columns)]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

target = users['Volunteer Name']

# Compile the model
model.compile(loss='mean_absolute_error', optimizer=tf.keras.optimizers.Adam(0.001))

# Train the model
model.fit(cosine_sim_df,target ,epochs=10)


# ⛹

# Using One Hot and Tokenizer 🉐

---



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

# Load Event dataset
event_data = pd.read_csv("./events_dataset.csv",usecols=['Event_id','Category','Location','Qualifications'])
event_df = pd.DataFrame(event_data,)

# Load User dataset
user_data = pd.read_csv("./users_data_interest.csv",usecols=['Volunteer Name','Skills','Location','Type of Organization','Interest'])
user_df = pd.DataFrame(user_data)

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# Split event and user data into training and testing sets
event_train, event_test = train_test_split(event_df, test_size=0.2, random_state=42)
user_train, user_test = train_test_split(user_df, test_size=0.2, random_state=42)
y_train, y_test = train_test_split(user_df['Interest'], test_size=0.2, random_state=42)

# Tokenizer Train and Test Qualifications
tokenizer_qualification = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_qualification.fit_on_texts(event_train['Qualifications'])

qualification_seq = tokenizer_qualification.texts_to_sequences(event_train['Qualifications'])
qualification_pad = pad_sequences(qualification_seq, maxlen=max_length, padding=padding_type, truncating= trunc_type)

qualification_seq_test = tokenizer_qualification.texts_to_sequences(event_test['Qualifications'])
qualification_pad_test = pad_sequences(qualification_seq_test, maxlen=max_length, padding=padding_type, truncating= trunc_type)

# Tokenizer Train and Test Skill
tokenizer_skill = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_skill.fit_on_texts(user_train['Skills'])

skill_seq = tokenizer_skill.texts_to_sequences(user_train['Skills'])
skill_pad = pad_sequences(skill_seq, maxlen=max_length, padding=padding_type, truncating= trunc_type)

skill_seq_test = tokenizer_skill.texts_to_sequences(user_test['Skills'])
skill_pad_test = pad_sequences(skill_seq_test, maxlen=max_length, padding=padding_type, truncating= trunc_type)

# One hot encoding Event
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
event_cat_loc_org_encoded_train = encoder.fit_transform(event_train[['Category', 'Location']])
event_cat_loc_org_encoded_test = encoder.transform(event_test[['Category', 'Location']])

# One hot encoding user
user_loc_org_encoded_train = encoder.fit_transform(user_train[['Location', 'Type of Organization']])
user_loc_org_encoded_test = encoder.transform(user_test[['Location', 'Type of Organization']])


# Build user model
user_NN = tf.keras.models.Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, name='user_embedding')  # output layer for user model
])

# Build event model
event_NN = tf.keras.models.Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, name='event_embedding')  # output layer for event model
])

# Inputs for user and event
input_user_skills = Input(shape=(max_length,), name='input_user_skills')
input_user_loc_org = Input(shape=(user_loc_org_encoded_train.shape[1],), name='input_user_loc_org')
input_event_qualifications = Input(shape=(max_length,), name='input_event_qualifications')
input_event_cat_loc_org = Input(shape=(event_cat_loc_org_encoded_train.shape[1],), name='input_event_cat_loc_org')

# Call user and event models
vu_skills = user_NN(input_user_skills)
vu_loc_org = Dense(128, activation='relu')(input_user_loc_org)
vu = Concatenate()([vu_skills, vu_loc_org])

vm_qualifications = event_NN(input_event_qualifications)
vm_cat_loc_org = Dense(128, activation='relu')(input_event_cat_loc_org)
vm = Concatenate()([vm_qualifications, vm_cat_loc_org])

combined_vu_vm = Concatenate()([vu, vm])

# Specify the inputs and outputs of the model
# model = tf.keras.Model([input_user_skills, input_user_loc_org, input_event_qualifications, input_event_cat_loc_org], [vu, vm])

# test with combined
model = Model([input_user_skills, input_user_loc_org, input_event_qualifications, input_event_cat_loc_org],combined_vu_vm)


In [2]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_user_skills (InputLa  [(None, 120)]                0         []                            
 yer)                                                                                             
                                                                                                  
 input_user_loc_org (InputL  [(None, 97)]                 0         []                            
 ayer)                                                                                            
                                                                                                  
 input_event_qualifications  [(None, 120)]                0         []                            
  (InputLayer)                                                                                

In [3]:
# Define inputs and outputs for training
train_inputs = [skill_pad, user_loc_org_encoded_train, qualification_pad, event_cat_loc_org_encoded_train]

In [4]:
model.fit(train_inputs, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e543a313b50>

In [5]:
test_inputs = [skill_pad_test, user_loc_org_encoded_test, qualification_pad_test, event_cat_loc_org_encoded_test]
model.evaluate(test_inputs, y_test)



0.003309612860903144

In [6]:
## using the user in the test set
user_index = 0
user_input = [
    skill_pad_test[user_index][None, ...],
    user_loc_org_encoded_test[user_index][None, ...],
    qualification_pad_test[user_index][None, ...],
    event_cat_loc_org_encoded_test[user_index][None, ...],
]

Langsung menggunakan predict

In [None]:
user_predictions = model.predict(user_input)

# Change prediksi menjadi DataFrame
predictions_df = pd.DataFrame(user_predictions.flatten(), columns=['Interest_Score'])

# Gabungkan prediksi dengan data event
results_df = pd.concat([event_test.reset_index(drop=True), predictions_df], axis=1)

# sort event berdasarkan 'Interest_Score'
results_df = results_df.sort_values(by='Interest_Score', ascending=False)

# Tampilkan 5 rekomendasi tertinggi
top_5_recommendations = results_df.head(5)
top_5_recommendations


Lalu ini menggunakan pembobotan terhadap category event dan type of organization user

In [7]:
# Buat prediksi dengan model Anda
user_predictions = model.predict(user_input)

# Ubah prediksi menjadi DataFrame
predictions_df = pd.DataFrame(user_predictions.flatten(), columns=['Interest_Score'])

# Gabungkan prediksi dengan data acara
results_df = pd.concat([event_test.reset_index(drop=True), predictions_df], axis=1)




In [None]:

# add nama pengguna dan 'Type of Organization' ke DataFrame hasil
results_df['User_Name'] = user_test.iloc[user_index]['Volunteer Name']
results_df['User_Organization_Type'] = user_test.iloc[user_index]['Type of Organization']

# kolom baru 'Interest_Score_Adjusted' yang memberikan bobot lebih tinggi
# untuk acara yang 'Category'-nya cocok dengan 'Type of Organization' pengguna
results_df['Interest_Score_Adjusted'] = np.where(results_df['Category'] == results_df['User_Organization_Type'],
                                                 results_df['Interest_Score'] * 1.2,
                                                 results_df['Interest_Score'])

# Urutkan acara berdasarkan 'Interest_Score_Adjusted' dalam urutan menurun
results_df = results_df.sort_values(by='Interest_Score_Adjusted', ascending=False)

# Tampilkan 5 rekomendasi tertinggi
top_5_recommendations = results_df.head(5)
top_5_recommendations

In [None]:
user_test.columns
user_test.head()

Unnamed: 0,Volunteer Name,Skills,Location,Type of Organization,Interest
2078,Ethan Walker,"Environmental activism, Conservation",Serang,Environmental,1
2770,James Cooper,"Veterinary assistance, Animal rescue",Lubuklinggau,Healthcare,1
1465,Amelia Adams,"Nursing, Geriatric care",Bengkulu,Youth Development,1
2089,Liam Thompson,"Teaching, English language tutoring",Palembang,Social,1
2118,Emily Davis,"Animal shelter volunteering, Pet adoption support",Binjai,Pet and Animal Service,1


In [None]:
name_at_index_one = user_test.loc[user_test.index[0], 'Volunteer Name']
print("Name at index one:", name_at_index_one)

Name at index one: Ethan Walker



# GIST Code
```
This to make predict, but not use bobot to category
# Make predictions for the user
user_predictions = model.predict(user_input)

# Extract event IDs from the test set
event_ids = event_test['Event_id'].values

# Flatten user_predictions and event_ids
user_predictions_flat = user_predictions.flatten()
event_ids_flat = event_ids[:user_predictions_flat.shape[0]]

user_predictions_flat.shape, event_ids_flat.shape


print("Length of event_ids:", len(event_ids))
print("Length of user_predictions:", len(user_predictions.flatten()))
print("Shape of user_predictions:", user_predictions.shape)
print("Number of unique event_ids:", len(event_test['Event_id'].unique()))


# Ambil informasi kategori dan kualifikasi dari dataset event_test
event_info_selected = event_test[event_test['Event_id'].isin(event_ids_flat)][['Event_id', 'Category', 'Qualifications']]

# Gabungkan hasil prediksi dan informasi event
results_df = pd.DataFrame({
    'Event_id': event_ids_flat,
    'Interest_Score': user_predictions_flat
})

# Gabungkan dengan informasi kategori dan kualifikasi
results_df = pd.merge(results_df, event_info_selected, on='Event_id')

# Sort events based on predicted interest scores in descending order
recommendations = results_df.sort_values(by='Interest_Score', ascending=False)

# Display the top N recommendations
top_n_recommendations = 5
top_recommendations = recommendations.head(top_n_recommendations)

print(f"Top {top_n_recommendations} recommendations for the user:")
# print(top_recommendations[['Event_id', 'Category', 'Qualifications', 'Interest_Score']])
top_recommendations






print("Shape of user_predictions before flatten:", user_predictions.shape)
print("Length of event_ids:", len(event_ids))

# Flatten user_predictions
user_predictions_flat = user_predictions.flatten()

print("Shape of user_predictions after flatten:", user_predictions_flat.shape)
print("Length of event_ids after adjustment:", len(event_ids[:user_predictions_flat.shape[0]]))

```




Trying use 2 condition skill=qualification and category=type of organization
```

results_df['User_Skills'] = user_test.iloc[user_index]['Skills']
results_df['Event_Qualifications'] = event_test.iloc[user_index]['Qualifications']

results_df['Skill_Qualification_Match'] = results_df.apply(lambda row: 1 if row['User_Skills'] in row['Event_Qualifications'] else 0, axis=1)

results_df['Category_Organization_Match'] = np.where(results_df['Category'] == results_df['User_Organization_Type'], 1, 0)

results_df['Interest_Score_Adjusted'] = np.where(results_df['Skill_Qualification_Match'] == 1,
                                                 results_df['Interest_Score'] * 1.1,
                                                 results_df['Interest_Score'])

```



# ===========================================

# Breakline



# This one if there is already have interact user to event
Jadi make dot.
```
## Inputs for user and event
# input_user_skills = Input(shape=(max_length,), name='input_user_skills')
# input_user_loc_org = Input(shape=(user_loc_org_encoded_train.shape[1],), name='input_user_loc_org')
# input_event_qualifications = Input(shape=(max_length,), name='input_event_qualifications')
# input_event_cat_loc_org = Input(shape=(event_cat_loc_org_encoded_train.shape[1],), name='input_event_cat_loc_org')


## Call user and event models
# vu_skills = user_NN(input_user_skills)
# vu_loc_org = Dense(128, activation='relu')(input_user_loc_org)
# vu = Concatenate()([vu_skills, vu_loc_org])

# vm_qualifications = event_NN(input_event_qualifications)
# vm_cat_loc_org = Dense(128, activation='relu')(input_event_cat_loc_org)
# vm = Concatenate()([vm_qualifications, vm_cat_loc_org])

# Compute the dot product of the two vectors vu and vm
# output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Specify the inputs and output of the model
# model = tf.keras.Model([input_user_skills, input_user_loc_org, input_event_qualifications, input_event_cat_loc_org], output)
# model.summary()
```



# Another one

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.regularizers import l2

# Load Event dataset
event_data = pd.read_csv("./events_dataset.csv",usecols=['Event_id','Category','Location','Qualifications'])
event_df = pd.DataFrame(event_data,)

# Load User dataset
user_data = pd.read_csv("./users_dataset.csv",usecols=['Volunteer Name','Skills','Location','Type of Organization'])
user_df = pd.DataFrame(user_data)

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# Split event and user data into training and testing sets
event_train, event_test = train_test_split(event_df, test_size=0.2, random_state=42)
user_train, user_test = train_test_split(user_df, test_size=0.2, random_state=42)

# Tokenizer Train and Test Qualifications
tokenizer_qualification = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_qualification.fit_on_texts(event_train['Qualifications'])

qualification_seq = tokenizer_qualification.texts_to_sequences(event_train['Qualifications'])
qualification_pad = pad_sequences(qualification_seq, maxlen=max_length, padding=padding_type, truncating= trunc_type)

qualification_seq_test = tokenizer_qualification.texts_to_sequences(event_test['Qualifications'])
qualification_pad_test = pad_sequences(qualification_seq_test, maxlen=max_length, padding=padding_type, truncating= trunc_type)

# Tokenizer Train and Test Skill
tokenizer_skill = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_skill.fit_on_texts(user_train['Skills'])

skill_seq = tokenizer_skill.texts_to_sequences(user_train['Skills'])
skill_pad = pad_sequences(skill_seq, maxlen=max_length, padding=padding_type, truncating= trunc_type)

skill_seq_test = tokenizer_skill.texts_to_sequences(user_test['Skills'])
skill_pad_test = pad_sequences(skill_seq_test, maxlen=max_length, padding=padding_type, truncating= trunc_type)

# One hot encoding Event
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
event_cat_loc_org_encoded_train = encoder.fit_transform(event_train[['Category', 'Location']])
event_cat_loc_org_encoded_test = encoder.transform(event_test[['Category', 'Location']])

# One hot encoding user
user_loc_org_encoded_train = encoder.fit_transform(user_train[['Location', 'Type of Organization']])
user_loc_org_encoded_test = encoder.transform(user_test[['Location', 'Type of Organization']])

# User model
input_user_skills = Input(shape=(max_length,), name='input_user_skills')
x = Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length)(input_user_skills)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
vu_skills = Dense(64, name='user_embedding')(x)
x = Dense(128, activation='relu')(vu_skills)
output_user_skills = Dense(max_length, activation='sigmoid')(x)
user_NN = tf.keras.Model(input_user_skills, output_user_skills)

# Event model
input_event_qualifications = Input(shape=(max_length,), name='input_event_qualifications')
x = Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length)(input_event_qualifications)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
vm_qualifications = Dense(64, name='event_embedding')(x)
x = Dense(128, activation='relu')(vm_qualifications)
output_event_qualifications = Dense(max_length, activation='sigmoid')(x)
event_NN = tf.keras.Model(input_event_qualifications, output_event_qualifications)

# Compile the models
user_NN.compile(optimizer='adam', loss='mean_squared_error')
event_NN.compile(optimizer='adam', loss='mean_squared_error')




In [None]:
# Train the models
user_NN.fit(skill_pad, skill_pad, epochs=10, verbose=1)
event_NN.fit(qualification_pad, qualification_pad, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f93b01403d0>

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Generate embeddings for users and events
user_embeddings = user_NN.predict(skill_pad)
event_embeddings = event_NN.predict(qualification_pad)

# Select a specific user
user_index = 2  # replace this with the index of the user
user_embedding = user_embeddings[user_index]

# Compute similarity scores for the selected user and all events
similarity_scores = cosine_similarity(user_embedding.reshape(1, -1), event_embeddings)


# Get the indices of the events that have the highest similarity scores
top_event_indices = np.argsort(similarity_scores[0])[::-1]

# Select the top 10 events
top_10_event_indices = top_event_indices[:10]
top_10 = event_df.iloc[top_10_event_indices]

print("Top 10 event indices:")
top_10



Top 10 event indices:


Unnamed: 0,Event_id,Category,Location,Qualifications
0,E1,Youth Development,Solo,"Youth mentoring, Youth empowerment"
512,E513,IT,Jakarta,"Computer literacy, Web development"
1250,E1251,Healthcare,Yogyakarta,"Nutrition, Lab assistance, First aid"
1251,E1252,Healthcare,Jakarta,"Medical assistance, Lab assistance, Lab techni..."
509,E510,IT,Jakarta,"Data analysis, Computer literacy, Web development"
1253,E1254,Healthcare,Yogyakarta,Nursing
506,E507,IT,Bandung,"Graphic design, Computer programming, Data ana..."
505,E506,IT,Jawa Barat,"Customer service, Lab assistance"
1256,E1257,Healthcare,"Maluku, Banda Neira","First aid, Medical research, Lab technician"
504,E505,IT,Tangerang,"Visual communication, Web development"


# TESTTTTTTTTTTTTTTTTTTTT

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

# Load Event dataset
event_data = pd.read_csv("./events_dataset.csv", usecols=['Event_id', 'Category', 'Location', 'Qualifications'])
event_df = pd.DataFrame(event_data)

# Load User dataset
user_data = pd.read_csv("./users_dataset.csv", usecols=['Volunteer Name', 'Skills', 'Location', 'Type of Organization'])
user_df = pd.DataFrame(user_data)

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# Tokenizer for Event Categories
tokenizer_category = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_category.fit_on_texts(event_df['Category'])
category_seq = tokenizer_category.texts_to_sequences(event_df['Category'])
category_pad = pad_sequences(category_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Tokenizer for Event Qualifications
tokenizer_qualification = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_qualification.fit_on_texts(event_df['Qualifications'])
qualification_seq = tokenizer_qualification.texts_to_sequences(event_df['Qualifications'])
qualification_pad = pad_sequences(qualification_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Tokenizer for User Skills
tokenizer_skill = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_skill.fit_on_texts(user_df['Skills'])
skill_seq = tokenizer_skill.texts_to_sequences(user_df['Skills'])
skill_pad = pad_sequences(skill_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Combine all features into a single matrix for both events and users
event_features = np.hstack((category_pad, qualification_pad))
user_features = skill_pad

# Build a simple content-based recommendation model
input_event_user = Concatenate()([Input(shape=(max_length,), name='input_event_category'),
                                   Input(shape=(max_length,), name='input_event_qualifications'),
                                   Input(shape=(max_length,), name='input_user_skills')])

x = Dense(128, activation='relu')(input_event_user)
x = Dense(64, activation='relu')(x)
output_recommendation = Dense(max_length, activation='sigmoid')(x)

content_based_model = Model(inputs=input_event_user,
                            outputs=output_recommendation)

# Compile the model
content_based_model.compile(optimizer='adam', loss='binary_crossentropy')
print(category_pad.shape)
print(qualification_pad.shape)
print(skill_pad.shape)
print(user_loc_org_encoded_train.shape)

# Train the model
# content_based_model.fit([category_pad, qualification_pad, skill_pad], user_loc_org_encoded_train, epochs=10, verbose=1)


(2786, 120)
(2786, 120)
(2786, 120)


NameError: ignored

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

# Load Event dataset
event_data = pd.read_csv("./events_dataset.csv", usecols=['Event_id', 'Category', 'Location', 'Qualifications'])
event_df = pd.DataFrame(event_data)

# Load User dataset
user_data = pd.read_csv("./users_dataset.csv", usecols=['Volunteer Name', 'Skills', 'Location', 'Type of Organization'])
user_df = pd.DataFrame(user_data)

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# Split event and user data into training and testing sets
event_train, event_test = train_test_split(event_df, test_size=0.2, random_state=42)
user_train, user_test = train_test_split(user_df, test_size=0.2, random_state=42)

# Tokenizer Train and Test Qualifications
tokenizer_qualification = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_qualification.fit_on_texts(event_train['Qualifications'])

qualification_seq = tokenizer_qualification.texts_to_sequences(event_train['Qualifications'])
qualification_pad = pad_sequences(qualification_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

qualification_seq_test = tokenizer_qualification.texts_to_sequences(event_test['Qualifications'])
qualification_pad_test = pad_sequences(qualification_seq_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Tokenizer Train and Test Skill
tokenizer_skill = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer_skill.fit_on_texts(user_train['Skills'])

skill_seq = tokenizer_skill.texts_to_sequences(user_train['Skills'])
skill_pad = pad_sequences(skill_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

skill_seq_test = tokenizer_skill.texts_to_sequences(user_test['Skills'])
skill_pad_test = pad_sequences(skill_seq_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Create labels for classification (1 for positive match, 0 for no match)
labels_train = np.ones(len(user_train))
labels_test = np.ones(len(user_test))

# User model
input_user_skills = Input(shape=(max_length,), name='input_user_skills')
x = Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length)(input_user_skills)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
vu_skills = Dense(64, name='user_embedding')(x)
x = Dense(128, activation='relu')(vu_skills)
output_user_skills = Dense(1, activation='sigmoid')(x)
user_NN = Model(inputs=input_user_skills, outputs=output_user_skills)

# Event model
input_event_qualifications = Input(shape=(max_length,), name='input_event_qualifications')
x = Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length)(input_event_qualifications)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
vm_qualifications = Dense(64, name='event_embedding')(x)
x = Dense(128, activation='relu')(vm_qualifications)
output_event_qualifications = Dense(1, activation='sigmoid')(x)
event_NN = Model(inputs=input_event_qualifications, outputs=output_event_qualifications)

# Compile the models
user_NN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
event_NN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the models
user_NN.fit(skill_pad, labels_train, epochs=10, verbose=1)
event_NN.fit(qualification_pad, labels_train, epochs=10, verbose=1)

# Evaluate the models on the test set
user_preds = user_NN.predict(skill_pad_test)
event_preds = event_NN.predict(qualification_pad_test)

# Convert predictions to binary (1 if prediction > 0.5, else 0)
user_preds_binary = (user_preds > 0.5).astype(int)
event_preds_binary = (event_preds > 0.5).astype(int)

# Compute accuracy on the test set
user_accuracy = accuracy_score(labels_test, user_preds_binary)
event_accuracy = accuracy_score(labels_test, event_preds_binary)

print(f"User Model Accuracy: {user_accuracy}")
print(f"Event Model Accuracy: {event_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
User Model Accuracy: 1.0
Event Model Accuracy: 1.0
