In [5]:
import os
import sys
import numpy as np
import lightgbm as lgb
import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
#pip install lightgbm==3.3.5

import lightgbm_utils as lgb_utils

print("System version: {}".format(sys.version))
print("LightGBM version: {}".format(lgb.__version__))

System version: 3.9.6 (default, Sep 26 2022, 11:37:49) 
[Clang 14.0.0 (clang-1400.0.29.202)]
LightGBM version: 3.3.5


In [9]:
#Defining lgbm parameters
MAX_LEAF = 128
MIN_DATA = 50
NUM_OF_TREES = 100
TREE_LEARNING_RATE = 0.1
EARLY_STOPPING_ROUNDS = 200
METRIC = "auc"
SIZE = "sample"

params = {
    "task": "train",
    "boosting_type": "gbdt",
    "num_class": 1,
    "objective": "binary",
    "metric": METRIC,
    "num_leaves": MAX_LEAF,
    "min_data": MIN_DATA,
    "boost_from_average": True,
    "num_threads": 20, 
    "feature_fraction": 0.8,
    "learning_rate": TREE_LEARNING_RATE,
}

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [13]:
label_encoder = LabelEncoder() 

def preprocess_df(path):
    df = pd.read_csv(path)
    df = df.dropna()

    # Dealing with the essay columns
    df['all_essays'] = df[['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']].apply(lambda x: ' '.join(x.dropna()), axis=1)
    df = df.drop(columns=['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])
    df = df.drop(columns=['all_essays'])

    # Label encode categorical variables
    #label_encoder = LabelEncoder()
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])

    return df

def ratings_prediction(given_profile, df):
    # Compute cosine similarity between the given profile and all other profiles
    given_profile_df = pd.DataFrame([given_profile])
    given_profile_encoded = given_profile_df.copy()
    for col in given_profile_encoded.select_dtypes(include=['object']).columns:
        given_profile_encoded[col] = label_encoder.fit_transform(given_profile_encoded[col])

    # Create a DataFrame with the given profile repeated for each row to match the dimensions of df
    given_profile_expanded = pd.concat([given_profile_encoded]*len(df), ignore_index=True)
    
    # Compute cosine similarity between the given profile and all other profiles
    cosine_sim_given = cosine_similarity(given_profile_expanded, df)
    df['rating'] = cosine_sim_given.mean(axis=0)

    # Normalize the ratings to be between 0 and 1
    df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

    return df

def preprocess_lgbm(path, df):
    df_lgbm = pd.read_csv(path)

    # Dealing with the essay columns
    df_lgbm['all_essays'] = df_lgbm[['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']].apply(lambda x: ' '.join(x.dropna()), axis=1)
    df_lgbm = df_lgbm.drop(columns=['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])
    df_lgbm = df_lgbm.drop(columns=['all_essays'])

    #Adding the rating column gathered through Feature Engineering
    merged_df = df_lgbm.merge(df[['rating']], how='left', left_index=True, right_index=True)
    
    return merged_df

def encode_csv(df, encoder, label_col, typ="fit"):
    if typ == "fit":
        df = encoder.fit_transform(df)
    else:
        df = encoder.transform(df)
    y = df[label_col].values
    del df[label_col]
    return df, y

def training_lgbm(merged_df):
    #defining the columns from our df
    nume_cols = ["age", "height", "income"]
    cate_cols = ["body_type", "diet", "drinks", "drugs", "education", "ethnicity", "job", "last_online",
                "location", "offspring", "orientation", "pets", "religion", "sex", "sign", "smokes",
                "speaks", "status"]
    label_col = "rating"

    # split data to 3 sets    
    length = len(merged_df)
    train_data = merged_df.loc[:0.8*length-1]
    valid_data = merged_df.loc[0.8*length:0.9*length-1]
    test_data = merged_df.loc[0.9*length:]

    #Encoding categorical variables with the oerdinal encoder
    ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)
    train_x, train_y = encode_csv(train_data, ord_encoder, label_col)
    valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, "transform")
    test_x, test_y = encode_csv(test_data, ord_encoder, label_col, "transform")

    lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols)
    lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)
    lgb_test = lgb.Dataset(test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)
    lgb_model = lgb.train(params,
                        lgb_train,
                        num_boost_round=NUM_OF_TREES,
                        valid_sets=lgb_valid,
                        categorical_feature=cate_cols,
                        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)])
    
    return lgb_model, ord_encoder

def generate_lightGBM_recommendations(df: pd.DataFrame, lgb_model, ord_encoder, number_of_recommendations: int = 10) -> list:
    label_col = "rating"
    # Make predictions for all profiles
    full_dataset_x, _ = encode_csv(df, ord_encoder, label_col, "transform")
    all_preds = lgb_model.predict(full_dataset_x)

    # Get sorted predictions with the highest one first
    top_indices = np.argsort(all_preds)[::-1]

    # Get the top recommendations
    recommendations = []
    counter = 0
    for index in top_indices:
        if counter == number_of_recommendations:
            break
        if not np.isnan(df.iloc[index][label_col]):
            continue
        else:
            counter += 1
            recommendations.append((index, all_preds[index]))

    return recommendations

In [14]:
#TO be called in main
path = "profiles.csv"

df_preprocessed = preprocess_df(path)

#Defining profile
given_profile = {
    'age': 26,
    'body_type': 'curvy',
    'diet': 'mostly anything',
    'drinks': 'socially',
    'drugs': 'never',
    'education': 'working on college/university',
    'ethnicity': 'hispanic / latin, white',
    'height': 63.0,
    'income': 20000,
    'job': 'sales / marketing / biz dev',
    'last_online': '2012-06-23-23-10',
    'location': 'berkeley, california',
    'offspring': 'doesn’t have kids, but might want them',
    'orientation': 'gay',
    'pets': 'likes dogs and likes cats',
    'religion': 'catholicism and laughing about it',
    'sex': 'f',
    'sign': 'gemini and it’s fun to think about',
    'smokes': 'no',
    'speaks': 'english',
    'status': 'single'
}

#Computing the ratings
rated_df = ratings_prediction(given_profile, df_preprocessed)

#Preprocessing dataset for lgbm
preprocessed_lgbm = preprocess_lgbm(path, rated_df)

#training lgbm
lgb_model, ord_encoder = training_lgbm(preprocessed_lgbm)

#Getting recommendations
recommendations = generate_lightGBM_recommendations(preprocessed_lgbm, lgb_model, ord_encoder, 10)
print("Top 10 recommendations:")
for index, (profile_index, prediction) in enumerate(recommendations, start=1):
    print(f"Recommendation {index}:")
    print(f"Profile Index: {profile_index}, Prediction: {prediction}")

[LightGBM] [Info] Number of positive: 3557, number of negative: 44399
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5176
[LightGBM] [Info] Number of data points in the train set: 47956, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.074172 -> initscore=-2.524299
[LightGBM] [Info] Start training from score -2.524299
Training until validation scores don't improve for 200 rounds




Did not meet early stopping. Best iteration is:
[55]	valid_0's auc: 0.978698
Top 10 recommendations:
Recommendation 1:
Profile Index: 59008, Prediction: 0.9007063810574047
Recommendation 2:
Profile Index: 58642, Prediction: 0.8956106028457942
Recommendation 3:
Profile Index: 53144, Prediction: 0.8850699629720723
Recommendation 4:
Profile Index: 58340, Prediction: 0.8838723299329732
Recommendation 5:
Profile Index: 57179, Prediction: 0.8825521548313909
Recommendation 6:
Profile Index: 54206, Prediction: 0.881697215937644
Recommendation 7:
Profile Index: 50384, Prediction: 0.8791855945824912
Recommendation 8:
Profile Index: 48911, Prediction: 0.876613299043058
Recommendation 9:
Profile Index: 59264, Prediction: 0.8726835635823694
Recommendation 10:
Profile Index: 49035, Prediction: 0.8697389880014906
