In [142]:
#Global Settings and Imports
import os
import sys
import numpy as np
# import lightgbm as lgb
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.metrics.pairwise import cosine_similarity


#from chatbot import chatbot_data
#import lightgbm_utils as lgb_utils
#from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
#print("LightGBM version: {}".format(lgb.__version__))

System version: 3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]


In [143]:
#Parameters Setting
MAX_LEAF = 64
MIN_DATA = 20
NUM_OF_TREES = 100
TREE_LEARNING_RATE = 0.15
EARLY_STOPPING_ROUNDS = 20
METRIC = "auc"
SIZE = "sample"

In [144]:
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "num_class": 1,
    "objective": "binary",
    "metric": METRIC,
    "num_leaves": MAX_LEAF,
    "min_data": MIN_DATA,
    "boost_from_average": True,
    # set it according to your cpu cores.
    "num_threads": 20,
    "feature_fraction": 0.8,
    "learning_rate": TREE_LEARNING_RATE,
}

In [145]:
# Specify the path to your CSV file
csv_file_path = "/Users/zaid/Desktop/University 3 - 2nd Sem/ChatBots-And-Recco/chatbots/chatbot/data.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,name,gender,age,year,major,nationality,languages,hobbies
0,John Smith,Male,20,2,Computer Science and Artificial Intelligence,USA,English,"Playing video games, reading, hiking"
1,Emily Johnson,Female,21,3,Business Administration,Canada,"English, French","Painting, playing guitar, photography"
2,Michael Williams,Male,22,4,Economics,UK,English,"Playing football, watching movies, traveling"
3,Sarah Brown,Female,20,1,Communication and Digital Media,Australia,English,"Writing, photography, dancing"
4,David Jones,Male,19,1,Architecture,Germany,"German, English","Drawing, playing piano, cooking"


In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         314 non-null    object
 1   gender       314 non-null    object
 2   age          314 non-null    int64 
 3   year         314 non-null    int64 
 4   major        314 non-null    object
 5   nationality  314 non-null    object
 6   languages    314 non-null    object
 7   hobbies      314 non-null    object
dtypes: int64(2), object(6)
memory usage: 19.8+ KB


In [147]:
#Encoding categorical variables

#One-hot encoding gender
gender_encoder = OneHotEncoder()
gender_encoded = gender_encoder.fit_transform(df[['gender']])
gender_encoded_df = pd.DataFrame(gender_encoded.toarray(), columns=gender_encoder.categories_[0])


#Label encoding major, nationality, language, and hobbies
label_encoder = LabelEncoder()
df['major_encoded'] = label_encoder.fit_transform(df['major'])
df['nationality_encoded'] = label_encoder.fit_transform(df['nationality'])
df['language_encoded'] = label_encoder.fit_transform(df['languages'])
df['hobbies_encoded'] = label_encoder.fit_transform(df['hobbies'])

# Concatenate the one-hot encoded gender DataFrame with the original DataFrame
df = pd.concat([df, gender_encoded_df], axis=1)

# Drop the original categorical columns
df.drop(['gender', 'major', 'nationality', 'languages', 'hobbies'], axis=1, inplace=True)

df.head()

Unnamed: 0,name,age,year,major_encoded,nationality_encoded,language_encoded,hobbies_encoded,Female,Male
0,John Smith,20,2,5,33,4,39,0.0,1.0
1,Emily Johnson,21,3,3,5,7,14,1.0,0.0
2,Michael Williams,22,4,8,32,4,23,0.0,1.0
3,Sarah Brown,20,1,4,1,4,52,1.0,0.0
4,David Jones,19,1,1,10,11,7,0.0,1.0


In [148]:
from sklearn.metrics.pairwise import cosine_similarity

# Making our data set have numerical values
features = df.drop(columns=['name']) 

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(features)

# Function to get the top 5 similar profiles for a given index
def get_similar_profiles(profile_index, similarity_matrix):
    profile_similarities = similarity_matrix[profile_index]
    # Get the indices of the profiles with the highest similarity scores
    # Exclude the first one as it will be the profile itself with a score of 1
    similar_indices = profile_similarities.argsort()[-6:][::-1][1:]
    return df['name'].iloc[similar_indices]

# Example: Get similar profiles for the first profile in the DataFrame
similar_profiles = get_similar_profiles(0, similarity_matrix)
print(similar_profiles)

18           Noah Garcia
14          Logan Harris
151      Fahad Al-Sheikh
185      Fahad Al-Sheikh
187    Mohammed Abdullah
Name: name, dtype: object


In [149]:
import numpy as np

# Generate a mock similarity matrix for a set number of profiles
num_profiles = 5
similarity_matrix = np.random.rand(num_profiles, num_profiles)
np.fill_diagonal(similarity_matrix, 1)  

def precision_at_k(similarity_matrix, profile_index, k=5, threshold=0.5):
    # Get indices of the top K most similar profiles, excluding the target profile itself
    top_k_indices = np.argsort(similarity_matrix[profile_index])[-k-1:-1][::-1]
    # Determine relevance of these recommendations based on the threshold
    relevant_recommendations = similarity_matrix[profile_index][top_k_indices] >= threshold
    # Calculate precision as the fraction of relevant recommendations
    precision = np.sum(relevant_recommendations) / k
    return precision

def recall_at_k(similarity_matrix, profile_index, k=5, threshold=0.5):
    # Identify all profiles considered relevant based on the threshold
    relevant_indices = np.where(similarity_matrix[profile_index] >= threshold)[0]
    relevant_indices = relevant_indices[relevant_indices != profile_index]  # Exclude self
    # Get indices of the top K recommendations and find how many are relevant
    top_k_indices = np.argsort(similarity_matrix[profile_index])[-k-1:-1][::-1]
    top_k_relevant = np.intersect1d(top_k_indices, relevant_indices).shape[0]
    # Calculate recall as the fraction of relevant profiles captured in top K recommendations
    recall = top_k_relevant / len(relevant_indices) if len(relevant_indices) > 0 else 0
    return recall

# Sample profile index
profile_index = 0  
# Top 3 recommendations to consider 
k = 3  

# Calculate precision and recall scores
precision_score = precision_at_k(similarity_matrix, profile_index, k)
recall_score = recall_at_k(similarity_matrix, profile_index, k)

# Output results
print(f"Precision@{k} for profile {profile_index}: {precision_score}")
print(f"Recall@{k} for profile {profile_index}: {recall_score}")


Precision@3 for profile 0: 0.3333333333333333
Recall@3 for profile 0: 1.0
