## **Part 3 - Recommendation System**


### **Classify new user**

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

In [3]:
# Load clustered player data
df_players = pd.read_csv("clustered_players.csv")

# Load scaler and kmeans model
scaler = pd.read_pickle("scaler_model.pkl")
kmeans = pd.read_pickle("kmeans_model.pkl")

print(df_players)

def classify_and_add_user(new_user_df, players_df, kmeans_model, scaler_model):
    # Check if the DataFrame is empty
    if players_df.empty:
        # If empty, assign the new user to cluster 0
        new_user_df['Cluster_Labels'] = 0
        # Add the new user to the players DataFrame
        players_df = pd.concat([players_df, new_user_df], ignore_index=True)
    else:
        # Check if the new user already exists in the players DataFrame
        existing_user = players_df[players_df['Name'] == new_user_df['Name'].iloc[0]]

        if not existing_user.empty:
            # If the user exists, update the information (if needed)
            # For simplicity, let's assume we don't need to update any information
            print(f"User '{new_user_df['Name'].iloc[0]}' already exists in the dataset.")
        else:
            # Standardize the features of the new user
            new_user_std = scaler_model.transform(new_user_df[['Value(£)', 'Overall']])

            # Predict the cluster label for the new user
            cluster_label = kmeans_model.predict(new_user_std)[0]

            # Add the new user to the players DataFrame with the predicted cluster label
            new_user_df['Cluster_Labels'] = cluster_label
            players_df = pd.concat([players_df, new_user_df], ignore_index=True)

    # players_df = players_df.sort_values(by='Value(£)')

    return players_df


                  Name       Age   Overall               Club   Value(£)  \
0          L. Goretzka  0.834593  2.940553  FC Bayern München  11.256167   
1      Bruno Fernandes  0.834593  2.816114  Manchester United   9.661994   
2             M. Acuña  1.481188  2.691675         Sevilla FC   5.580910   
3         K. De Bruyne  1.696720  3.438311    Manchester City  13.360476   
4           N. Barella  0.403530  2.816114              Inter  11.064866   
...                ...       ...       ...                ...        ...   
17655    Deng Xiongtao -0.889660 -1.912583      Meizhou Hakka  -0.336662   
17656   22 Lim Jun Sub -1.320723 -1.912583     Jeju United FC  -0.336662   
17657         A. Demir  0.403530 -1.539264       Ümraniyespor  -0.340488   
17658     21 S. Czajor -1.105192 -1.663704     Fleetwood Town  -0.337938   
17659  21 F. Jakobsson -0.674128 -1.663704     IFK Norrköping  -0.337938   

      Age_Group Overall_Group  Log_Value  Age_Rating  Cluster_Labels  playerId  
0     

In [4]:
new_user_data = {
    'Name': 'New Player',
    'Age': 25,
    'Overall': 80,
    'Value(£)': 4000000,
    'playerId': len(df_players) + 1
}

In [5]:
df_players = classify_and_add_user(pd.DataFrame([new_user_data]), df_players, kmeans, scaler)
# print(df_players.info())
df_players

Unnamed: 0,Name,Age,Overall,Club,Value(£),Age_Group,Overall_Group,Log_Value,Age_Rating,Cluster_Labels,playerId
0,L. Goretzka,0.834593,2.940553,FC Bayern München,1.125617e+01,26-30,81-90,2.245328,1.938783,3,1
1,Bruno Fernandes,0.834593,2.816114,Manchester United,9.661994e+00,26-30,81-90,2.177662,1.877999,3,2
2,M. Acuña,1.481188,2.691675,Sevilla FC,5.580910e+00,31-35,81-90,1.937863,2.391289,3,3
3,K. De Bruyne,1.696720,3.438311,Manchester City,1.336048e+01,31-35,91-100,2.321635,3.001384,3,4
4,N. Barella,0.403530,2.816114,Inter,1.106487e+01,26-30,81-90,2.237716,1.490780,3,5
...,...,...,...,...,...,...,...,...,...,...,...
17656,22 Lim Jun Sub,-1.320723,-1.912583,Jeju United FC,-3.366625e-01,15-20,40-50,-0.874824,-1.512419,1,17657
17657,A. Demir,0.403530,-1.539264,Ümraniyespor,-3.404885e-01,26-30,51-60,-1.038158,-0.479084,1,17658
17658,21 S. Czajor,-1.105192,-1.663704,Fleetwood Town,-3.379378e-01,15-20,51-60,-0.923072,-1.323312,1,17659
17659,21 F. Jakobsson,-0.674128,-1.663704,IFK Norrköping,-3.379378e-01,21-25,51-60,-0.923072,-1.098185,1,17660


In [6]:
from datetime import datetime, timedelta

insurances_data = {
    'name': ['Insurance A', 'Insurance B', 'Insurance C', 'Insurance D', 'Insurance E', 'Insurance F', 'Insurance J'],
    'type': ['Normal', 'Premium', 'Deluxe', 'Ultimate', 'Normal', 'Ultimate', 'Ultimate'],
    'description': ['Description A', 'Description B', 'Description C', 'Description D', 'Description E', 'Description F', 'Description J'],
    'price': np.random.randint(1000, 10000, size=7),
    'date': [datetime.now() - timedelta(days=i) for i in range(7)]
}

# Create the insurance DataFrame
df_insurances = pd.DataFrame(insurances_data)

df_insurances['insuranceId'] = range(1, len(df_insurances) + 1)

# Display the updated insurances dataset
print(df_insurances)

          name      type    description  price                       date  \
0  Insurance A    Normal  Description A   9515 2024-01-22 02:51:01.960690   
1  Insurance B   Premium  Description B   8501 2024-01-21 02:51:01.960690   
2  Insurance C    Deluxe  Description C   8149 2024-01-20 02:51:01.960690   
3  Insurance D  Ultimate  Description D   7787 2024-01-19 02:51:01.960690   
4  Insurance E    Normal  Description E   3248 2024-01-18 02:51:01.960690   
5  Insurance F  Ultimate  Description F   4496 2024-01-17 02:51:01.960690   
6  Insurance J  Ultimate  Description J   4336 2024-01-16 02:51:01.960690   

   insuranceId  
0            1  
1            2  
2            3  
3            4  
4            5  
5            6  
6            7  


### **User-item collaborative filltering**

In [7]:

def insurance_of_cluser_players(player_id, df_players, df_insurances):
    # Find the cluster to which the player belongs
    player_cluster = df_players.loc[df_players['Name'] == player_id, 'Cluster_Labels'].values[0]

    # Retrieve all players from the same cluster
    similar_players = df_players[df_players['Cluster_Labels'] == player_cluster]

    # print(similar_players)

    # Assign insurance type based on the player's cluster
    cluster_type_mapping = {0: 'Normal', 1: 'Premium', 2: 'Deluxe', 3: 'Ultimate'}
    player_insurance_type = cluster_type_mapping.get(player_cluster)

    # Filter insurances based on the assigned type for the player's cluster
    recommended_insurances = df_insurances[df_insurances['type'] == player_insurance_type]

    return recommended_insurances

In [8]:
player_name = 'New Player'

# Get recommended insurances for the active player
cluster_insurances = insurance_of_cluser_players(player_name, df_players, df_insurances)

# print(cluster_insurances.insuranceId)

# Get similar players from the same cluster
player_cluster = df_players.loc[df_players['Name'] == player_name, 'Cluster_Labels'].values[0]
similar_players = df_players[df_players['Cluster_Labels'] == player_cluster]


ratings_data = {
    'playerId': np.random.choice(similar_players.playerId, size=200),
    'insuranceId': np.random.choice(cluster_insurances.insuranceId, size=200),
    'rating': np.random.randint(1, 6, size=200),
    'timestamp': [datetime.now() - timedelta(days=i) for i in range(200)]
}

# Create the ratings DataFrame
df_ratings = pd.DataFrame(ratings_data)


# Create the user-item matrix
user_item_matrix = pd.pivot_table(df_ratings, values='rating', index='playerId', columns='insuranceId', fill_value=0)

# Display the user-item matrix
print(user_item_matrix)
# print(user_item_matrix.info())


insuranceId    4    6    7
playerId                  
1            2.0  0.0  0.0
2            1.5  1.0  1.0
5            0.0  4.0  3.0
6            4.0  0.0  0.0
10           3.0  0.0  4.0
...          ...  ...  ...
12894        0.0  1.0  0.0
13507        5.0  0.0  4.0
14358        0.0  5.0  0.0
15430        4.0  1.0  4.0
15638        0.0  0.0  4.0

[105 rows x 3 columns]


In [9]:

# Fit NearestNeighbors
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(user_item_matrix)


def find_candidate_items(player_id, neighbors_model, user_item_matrix, df_ratings):
    # Check if the player exists in the dataset

    if player_id not in df_ratings['playerId'].unique():
        print(f"Player with ID {player_id} not found.")
        return []

    # Map the player ID to the corresponding index in user_item_matrix
    player_indices = user_item_matrix.index.get_indexer_for([player_id])

    # print(player_indices)
    # # Check if player_indices is empty
    # if not player_indices or player_indices[0] == -1:
    #     print(f"Player with ID {player_id} not found in user_item_matrix.")
    #     return []

    # Query for neighbors
    _, neighbor_indices = neighbors_model.kneighbors([user_item_matrix.iloc[player_indices[0]]], n_neighbors=5)

    # Flatten the array of neighbor indices
    neighbor_indices = neighbor_indices.flatten()

    # Filter ratings for similar users
    similar_users_ratings = df_ratings[df_ratings.index.isin(neighbor_indices)]

    # Sort items in decreasing order of frequency
    frequency = similar_users_ratings.groupby('insuranceId')['rating'].count().reset_index(name='count').sort_values(['count'], ascending=False)
    candidate_items = frequency['insuranceId'].tolist()

    # Exclude items already rated by the active player
    active_player_ratings = df_ratings[df_ratings['playerId'] == player_id]['insuranceId'].tolist()
    candidate_items = [item for item in candidate_items if item not in active_player_ratings]

    # Return the top 5 candidate items
    return candidate_items[:5]

player_id = 1  # Use the actual playerId as an integer
candidates = find_candidate_items(player_id, nn_model, user_item_matrix, df_ratings)
print("Candidate Items:", candidates)



Candidate Items: [6, 7]


In [10]:

cosine_sim = cosine_similarity(user_item_matrix)

# Step 1: Rating Prediction
def predict_ratings(active_player_index, user_item_matrix, cosine_sim, candidates):
    # Get the similarity scores for all players
    sim_scores = list(enumerate(cosine_sim[active_player_index]))

    # Sort the players based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of similar players
    similar_players_indices = [x[0] for x in sim_scores]

    # Get the ratings of the active player
    active_player_ratings = user_item_matrix.iloc[active_player_index]

    # Initialize a dictionary to store predicted ratings and total similarity scores
    predicted_ratings = {}
    total_similarity_scores = {}

    # Iterate over similar players and predict ratings
    for player_index in similar_players_indices:
        if player_index == active_player_index:
            continue  # Skip the active player

        # Get the ratings of the similar player
        similar_player_ratings = user_item_matrix.iloc[player_index]

        # Find items rated by the similar player that are in the candidate list
        candidate_items = set(similar_player_ratings[candidates].index)

        # Predict ratings for candidate items
        for item in candidate_items:
            if item not in predicted_ratings:
                predicted_ratings[item] = 0
                total_similarity_scores[item] = 0

            # Use the similarity score to predict the rating
            predicted_ratings[item] += sim_scores[player_index][1] * similar_player_ratings[item]
            total_similarity_scores[item] += sim_scores[player_index][1]

    # Normalize the predicted ratings between 1 and 5
    for item in predicted_ratings:
        if total_similarity_scores[item] != 0:
            predicted_ratings[item] /= total_similarity_scores[item]
            # Ensure the rating is between 1 and 5
            predicted_ratings[item] = min(5, max(1, predicted_ratings[item]))

    # Sort the predicted ratings in descending order
    predicted_ratings = dict(sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True))

    return predicted_ratings


# Step 2: User-to-User Predictions
def user_to_user_predictions(active_player_id, df_ratings, user_item_matrix, cosine_sim, nn_model):
    # Check if active_player_id exists in the dataset
    if active_player_id not in df_ratings['playerId'].unique():
        print(f"Player with ID {active_player_id} not found.")
        return {}

    # Map the active player ID to the corresponding index in user_item_matrix
    active_player_indices = user_item_matrix.index.get_indexer_for([active_player_id])

    # # Ensure the active player is found in the user_item_matrix
    # if not active_player_indices or active_player_indices[0] == -1:
    #     print(f"Active player with ID {active_player_id} not found in user_item_matrix.")
    #     return {}

    active_player_index = active_player_indices[0]

    # Find candidate items using the updated logic
    candidates = find_candidate_items(active_player_id, nn_model, user_item_matrix, df_ratings)

    # Ensure there are enough similar players to make predictions
    if cosine_sim.shape[0] <= active_player_index:
        return {}

    # Predict ratings based on user similarity
    predicted_ratings = predict_ratings(active_player_index, user_item_matrix, cosine_sim, candidates)

    return predicted_ratings



# Step 3: Top-5 Recommendations
def top_k_recommendations(predicted_ratings, k=5):
    top_recommendations = list(predicted_ratings.keys())[:k]
    return top_recommendations


active_player_id = 13507
predictions = user_to_user_predictions(active_player_id, df_ratings, user_item_matrix, cosine_sim, nn_model)
print("User-to-User Predictions:", predictions)
top_recommendations = top_k_recommendations(predictions, k=10)
print("Top 10 Recommendations:", top_recommendations)


predictions_df = pd.DataFrame(list(predictions.items()), columns=['Insurance', 'Predicted_Rating'])

# Export to CSV
csv_filename = f'predictions_{active_player_id}.csv'
predictions_df.to_csv(csv_filename, index=False)

print(f"Predictions exported to {csv_filename}")


User-to-User Predictions: {6: 1.423783887623673}
Top 10 Recommendations: [6]
Predictions exported to predictions_13507.csv
