In [9]:
import os
import sys
import numpy as np
import lightgbm as lgb #it has to be brew installed !brew install lightgbm or pip install lightgbm==3.3.5

import pandas as pd
#import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

#import recommenders.models.lightgbm.lightgbm_utils as lgb_utils

# Report the interpreter and LightGBM versions this notebook was run with.
print(f"System version: {sys.version}")
print(f"LightGBM version: {lgb.__version__}")

System version: 3.11.5 (v3.11.5:cce6ba91b3, Aug 24 2023, 10:50:31) [Clang 13.0.0 (clang-1300.0.29.30)]
LightGBM version: 3.3.5


This notebook is intended to test the efficiency and strength of the LightGBM model in our profile recommendation system. LightGBM is a powerful and efficient gradient boosting framework suitable for large-scale machine learning tasks, offering speed, memory efficiency, and accuracy. It is widely used in both industry and academia for a variety of machine learning problems, including classification, regression, and ranking tasks.

First, we will get our data:

In [10]:
# Location of the profiles CSV file.
# The path can be overridden by setting the PROFILES_CSV environment variable,
# so the notebook can run on machines other than the author's; when the
# variable is not set, the original local path is used.
csv_file_path = os.environ.get(
    "PROFILES_CSV",
    "/Users/marianareyes/Documents/GitHub/chatbots/chatbot/data.csv",
)

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,name,gender,age,year,major,nationality,languages,hobbies
0,John Smith,Male,20,2,Computer Science and Artificial Intelligence,USA,English,"Playing video games, reading, hiking"
1,Emily Johnson,Female,21,3,Business Administration,Canada,"English, French","Painting, playing guitar, photography"
2,Michael Williams,Male,22,4,Economics,UK,English,"Playing football, watching movies, traveling"
3,Sarah Brown,Female,20,1,Communication and Digital Media,Australia,English,"Writing, photography, dancing"
4,David Jones,Male,19,1,Architecture,Germany,"German, English","Drawing, playing piano, cooking"


Then, we make sure that the information in our data is accurate and makes sense. We also check the column types and whether there are any null values:

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         314 non-null    object
 1   gender       314 non-null    object
 2   age          314 non-null    int64 
 3   year         314 non-null    int64 
 4   major        314 non-null    object
 5   nationality  314 non-null    object
 6   languages    314 non-null    object
 7   hobbies      314 non-null    object
dtypes: int64(2), object(6)
memory usage: 19.8+ KB


In case a profile has been recorded more than once, we will drop the duplicate rows:

In [12]:
# Remove rows that are exact duplicates of an earlier row.
# The surviving rows keep their original index labels (the index is not reset).
df = df.drop_duplicates()

Since most of our columns are categorical (objects), we need to encode them so that we can use them in our model. For gender, we will use one-hot encoding since it only takes two values (female or male). For the rest of the object columns, we will use label encoding since each of them has many distinct values.

In [13]:
# One-hot encode gender.
# The resulting frame reuses df's index so that it stays row-aligned with df
# (df's index has gaps once duplicates have been dropped).
gender_encoder = OneHotEncoder()
gender_encoded = gender_encoder.fit_transform(df[['gender']])
gender_encoded_df = pd.DataFrame(
    gender_encoded.toarray(),
    columns=gender_encoder.categories_[0],
    index=df.index,
)

# Label-encode major, nationality, languages, and hobbies.
# Each column gets its own fitted encoder (kept in `label_encoders`, keyed by
# the source column name) so every mapping can later be inverted; re-fitting a
# single encoder would only retain the classes of the last column.
label_encoders = {}
for source_col, encoded_col in [
    ('major', 'major_encoded'),
    ('nationality', 'nationality_encoded'),
    ('languages', 'language_encoded'),
    ('hobbies', 'hobbies_encoded'),
]:
    encoder = LabelEncoder()
    df[encoded_col] = encoder.fit_transform(df[source_col])
    label_encoders[source_col] = encoder

# Backward-compatible alias: `label_encoder` previously ended up fitted on the
# hobbies column, so keep that name pointing at the hobbies encoder.
label_encoder = label_encoders['hobbies']

# Encode the 'name' column (used as the prediction target later on)
name_encoder = LabelEncoder()
df['name_encoded'] = name_encoder.fit_transform(df['name'])

Now, after our data is ready, we will define the X and y of our models:

In [14]:
# Assemble the feature matrix: one-hot gender columns first, then the
# label-encoded categorical columns, then the raw numerical columns.
encoded_cols = ['major_encoded', 'nationality_encoded', 'language_encoded', 'hobbies_encoded']
numerical_cols = ['age', 'year']
feature_parts = (
    gender_encoded_df.to_numpy(),
    df[encoded_cols].to_numpy(),
    df[numerical_cols].to_numpy(),
)
X = np.concatenate(feature_parts, axis=1)

# Target: the label-encoded profile name
y = df['name_encoded']

Define the LightGBM parameters:

In [15]:
# LightGBM configuration for a multiclass problem with one class per
# distinct encoded name.
n_name_classes = df['name_encoded'].nunique()

params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=n_name_classes,  # Number of unique names/classes
    metric='multi_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Wrap the features and labels in LightGBM's dataset container
lgb_dataset = lgb.Dataset(X, label=y)

Train our model:

In [16]:
# Fit the booster on the full dataset
model = lgb.train(params=params, train_set=lgb_dataset)

# Score the same feature matrix the dataset was built from
# (these are predictions on the training rows)
test_preds = model.predict(data=X)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109
[LightGBM] [Info] Number of data points in the train set: 276, number of used features: 8
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -4.927254
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -4.927254
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM] [Info] Start training from score -5.620401
[LightGBM

Get metrics for our model to evaluate its performance:

In [17]:
# Evaluate model performance.
# NOTE: `test_preds` was produced from the same X the model was trained on,
# so these figures describe fit to the training data, not generalization.
y_true = np.asarray(y)

# One-vs-one averaging is used for the multiclass AUC
auc = roc_auc_score(y_true, test_preds, multi_class='ovo')

# The `eps` argument of log_loss was deprecated in scikit-learn 1.3 and removed
# in 1.5 (passing it raises TypeError there); the built-in clipping is used
# instead, which works on both older and newer releases.
logloss = log_loss(y_true, test_preds)

res_optim = {"auc": auc, "logloss": logloss}

print(res_optim)

{'auc': 0.9985191993464052, 'logloss': 0.45508605544967956}




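From these metrics the model appears to perform very well, although the log loss could still be reduced. Note, however, that the predictions were made on the same rows the model was trained on, so these numbers measure fit to the training data rather than performance on unseen profiles. We will keep iterating to improve these metrics.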

Testing retrieval of different profiles:

In [18]:
# Function to find top similar profiles given one profile
def find_similar_profiles(profile_index, df, model, top_n=5, features=None):
    """Return the ``top_n`` rows of ``df`` most cosine-similar to one profile.

    Parameters
    ----------
    profile_index : int
        Positional (row) index of the query profile in the feature matrix.
        Negative values count from the end.
    df : pandas.DataFrame
        Profile table; its rows must be in the same order as the rows of the
        feature matrix, because results are selected with ``df.iloc``.
    model : object
        Not used by this function. Kept so existing calls keep working.
    top_n : int, default 5
        Maximum number of similar profiles to return.
    features : array-like of shape (n_profiles, n_features), optional
        Feature matrix to compare on. Defaults to the notebook-level ``X``.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, most similar first. The query profile
        itself is never included.

    Raises
    ------
    IndexError
        If ``profile_index`` is outside the feature matrix.
    """
    if features is None:
        features = X  # fall back to the notebook-level feature matrix
    features = np.asarray(features)

    # Normalise negative indices and fail early on out-of-range ones.
    query_pos = range(len(features))[profile_index]

    query_vector = features[query_pos].reshape(1, -1)
    similarities = cosine_similarity(query_vector, features)[0]

    # Rank from most to least similar; a stable sort keeps tied rows in their
    # original order so results are reproducible.
    ranked = np.argsort(-similarities, kind='stable')

    # Drop the query row explicitly instead of assuming it sorts last: other
    # rows can tie with it at similarity 1.0 (e.g. identical or proportional
    # feature vectors), in which case a positional slice could return the
    # query itself and discard a genuine neighbour.
    ranked = ranked[ranked != query_pos][:max(top_n, 0)]

    return df.iloc[ranked]

# Example usage: retrieve the 5 profiles (the default top_n) closest to the
# first row of the dataset and print them
similar_profiles = find_similar_profiles(profile_index=0, df=df, model=model)
print(similar_profiles)

                  name gender  age  year  \
18         Noah Garcia   Male   20     2   
14        Logan Harris   Male   21     3   
151    Fahad Al-Sheikh   Male   22     4   
59   Mohammed Abdullah   Male   20     2   
30       Ethan Johnson   Male   20     2   

                                            major   nationality  \
18   Computer Science and Artificial Intelligence           UAE   
14                    Data and Business Analytics  South Africa   
151                                     Economics           UAE   
59   Computer Science and Artificial Intelligence  Saudi Arabia   
30                    Data and Business Analytics           USA   

              languages                                   hobbies  \
18      Arabic, English  Playing video games, coding, photography   
14   English, Afrikaans             Playing rugby, coding, hiking   
151     Arabic, English    Playing soccer, traveling, photography   
59      Arabic, English   Playing soccer, coding, watchi