In [111]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import heapq
import scipy.sparse as sp
import math
# import cudf
# import cupy
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, ndcg_score
# from torchsummary import summary
from tqdm.notebook import tqdm

In [112]:
company_df = pd.read_csv('/home/krishnatejaswis/Files/VSCode/BugleRock/ML_Model_Predictor/Companies.csv')
mapping_df = pd.read_csv('/home/krishnatejaswis/Files/VSCode/BugleRock/ML_Model_Predictor/mapping.csv')
investor_df = pd.read_csv('/home/krishnatejaswis/Files/VSCode/BugleRock/ML_Model_Predictor/Investors.csv')

In [113]:
company_df.head()

Unnamed: 0,CompanyID,Company,Size,Profit,Industry
0,1,SPBSERE,322,66,Health
1,2,MYGLOLLSJ,399,87,Health
2,3,TIARHBRLK,481,70,Health
3,4,RQONOZ,301,38,Finance
4,5,MUYJBPD,227,79,Health


In [114]:
investor_df.head()

Unnamed: 0,InvestorID,Investor,RiskTolerance,TimeHorizon,IncomeNeeds,LiquidityNeeds,TaxConsiderations,AccountType,Age,InvestmentExperience,OtherAssets,RiskCapacity
0,1,NKYWMDI,High,Short term,99317,Low,No,Taxable,69,Novice,517824,Low
1,2,XXJAVAO,High,Long term,179685,Moderate,Yes,IRA,68,Experienced,561371,High
2,3,QLRCUVVV,Low,Short term,94268,High,No,401k,66,Novice,811498,Moderate
3,4,BBXLL,Moderate,Long term,70071,Low,No,401k,53,Experienced,254560,High
4,5,QHKLWOL,Low,Medium term,56600,Moderate,No,Taxable,73,Experienced,972287,High


In [115]:
mapping_df.head()

Unnamed: 0,InvestorID,CompanyID
0,1,29553
1,1,73163
2,1,90090
3,1,57148
4,1,47360


In [116]:
print(mapping_df.shape)

(25221, 2)


In [117]:
print(company_df.shape)
print(investor_df.shape)


(100000, 5)
(5000, 12)


In [118]:
#company_df['CompanyId'] =np.arange(company_df['CompanyID'].nunique(),dtype='int') 

In [119]:
company_df.head()


Unnamed: 0,CompanyID,Company,Size,Profit,Industry
0,1,SPBSERE,322,66,Health
1,2,MYGLOLLSJ,399,87,Health
2,3,TIARHBRLK,481,70,Health
3,4,RQONOZ,301,38,Finance
4,5,MUYJBPD,227,79,Health


In [120]:
# Assuming 'CompanyID' in companies_df corresponds to 'isbn' in book_df
# and 'InvestorID' in investor_df corresponds to 'user_id' in ratings_df

# Create a unique ID for each company
company_df['itemId'] = np.arange(company_df['CompanyID'].nunique(), dtype='int')

# Merge the mapping_df with the company_df
mapping_df = mapping_df.merge(company_df, on='CompanyID')

# Create a unique ID for each investor
investor_id = investor_df[['InvestorID']].drop_duplicates().reindex()
investor_id['userId'] = np.arange(len(investor_id))

# Merge the new investor IDs with the mapping_df
mapping_df = pd.merge(mapping_df, investor_id, on=['InvestorID'], how='left')

# Select the necessary columns
mapping_df = mapping_df[['userId', 'itemId']]

print(mapping_df.shape)
print('Range of userId is [{}, {}]'.format(mapping_df.userId.min(), mapping_df.userId.max()))
print('Range of itemId is [{}, {}]'.format(mapping_df.itemId.min(), mapping_df.itemId.max()))


(25221, 2)
Range of userId is [0, 4999]
Range of itemId is [1, 99998]


In [121]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset as torchDataset, DataLoader

# Collaborative filtering

In [122]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [123]:
class CompanyInvestorDataset(torchDataset):
    def __init__(self, df):
        self.users = df['userId'].values
        self.items = df['itemId'].values

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx]


In [124]:
class CFModel(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)


In [159]:
n_epochs = 18  # Or however many epochs you want to train for
n_users = investor_df['InvestorID'].nunique()
n_items = company_df['CompanyID'].nunique()
target = torch.ones(investor_df.shape[0], device=device)

print(n_epochs,n_items,n_users)

18 100000 5000


In [178]:
from sklearn.model_selection import train_test_split

# Split data into training set and validation set
train_df, val_df = train_test_split(mapping_df, test_size=0.35, random_state=42)

# Create DataLoaders for training set and validation set
train_dataset = CompanyInvestorDataset(train_df)
val_dataset = CompanyInvestorDataset(val_df)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [179]:
# Initialize dataset and dataloader
#dataset = CompanyInvestorDataset(mapping_df)
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize model, loss function, and optimizer
model = CFModel(n_users, n_items).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
for epoch in range(n_epochs):
    for investor, company in dataloader:
        # Move data to device
        investor = investor.to(device)
        company = company.to(device)

        # Create target tensor
        target = torch.ones(investor.shape[0], device=device)

        # Forward pass
        preds = model(investor, company)
        loss = loss_fn(preds, target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')



Epoch 1/18, Loss: 13.3964204788208
Epoch 2/18, Loss: 28.144372940063477
Epoch 3/18, Loss: 10.273414611816406
Epoch 4/18, Loss: 6.277699947357178
Epoch 5/18, Loss: 19.477210998535156
Epoch 6/18, Loss: 19.276613235473633
Epoch 7/18, Loss: 15.77616024017334
Epoch 8/18, Loss: 7.632597923278809
Epoch 9/18, Loss: 0.7359369397163391
Epoch 10/18, Loss: 5.202756404876709
Epoch 11/18, Loss: 3.2031781673431396
Epoch 12/18, Loss: 0.6636853218078613
Epoch 13/18, Loss: 0.17475873231887817
Epoch 14/18, Loss: 0.7610730528831482
Epoch 15/18, Loss: 5.27826452255249
Epoch 16/18, Loss: 0.6452124714851379
Epoch 17/18, Loss: 0.18331457674503326
Epoch 18/18, Loss: 0.9598404765129089


In [180]:
# Move data to device
investor_val = investor.to(device)
company_val = company.to(device)

# Forward pass
preds_val = model(investor_val, company_val)
# Create target tensor for validation data
target_val = torch.ones(investor_val.shape[0], device=device)

# Calculate MAE
mae = mean_absolute_error(preds_val.detach().cpu().numpy(), target_val.cpu().numpy())

print(f'MAE: {mae}')
from collections import defaultdict


MAE: 0.5304457545280457


In [181]:
def recommend(investor_id, model, n_items):
    # Set model to evaluation mode
    model.eval()

    # Create tensor of item IDs
    items = torch.arange(n_items, device=device)

    # Repeat investor ID for each item
    investors = torch.full((n_items,), investor_id, device=device)

    # Get predictions
    with torch.no_grad():
        preds = model(investors, items)

    # Get indices of top predictions
    top_items = preds.topk(k, largest=True).indices

    return top_items


In [182]:
def ndcg_at_k(predictions, k=10):
    investor_est_true = defaultdict(list)
    for uid, iid, true_r, est in predictions:
        investor_est_true[uid].append((est, true_r))

    def dcg_at_k(r, k):
        r = np.asfarray(r)[:k]
        return np.sum(r / np.log2(np.arange(2, len(r) + 2)))

    ndcgs = []
    for uid, investor_ratings in investor_est_true.items():
        investor_ratings.sort(key=lambda x: x[0], reverse=True)
        true_ratings = [true_r for (_, true_r) in investor_ratings]
        ndcgs.append(dcg_at_k(true_ratings, k) / dcg_at_k(sorted(true_ratings, reverse=True), k))

    return np.mean(ndcgs)



In [183]:
from sklearn.metrics import mean_absolute_error

def calculate_mae(model, dataloader):
    model.eval()
    with torch.no_grad():
        preds = []
        targets = []
        for investor, company in dataloader:
            investor = investor.to(device)
            company = company.to(device)
            target = torch.ones(investor.shape[0], device=device)
            pred = model(investor, company)
            preds.append(pred.cpu().numpy())
            targets.append(target.cpu().numpy())
        mae = mean_absolute_error(np.concatenate(targets), np.concatenate(preds))
    return mae


In [184]:

def precision_recall_at_k(predictions, k=10, threshold=1):
    investor_est_true = defaultdict(list)
    for uid, iid, true_r, est in predictions:
        investor_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, investor_ratings in investor_est_true.items():
        investor_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in investor_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in investor_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in investor_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls



In [185]:


# Assuming you have a validation DataLoader `val_dataloader`
val_predictions = []
model.eval()
with torch.no_grad():
    for investor, company in val_dataloader:
        investor = investor.to(device)
        company = company.to(device)
        target = torch.ones(investor.shape[0], device=device)
        pred = model(investor, company)
        val_predictions.extend([(i.item(), c.item(), t.item(), p.item()) for i, c, t, p in zip(investor, company, target, pred)])

# Calculate MAE
mae = calculate_mae(model, val_dataloader)
print(f'MAE: {mae}')

# Calculate precision and recall at k
precisions, recalls = precision_recall_at_k(val_predictions, k=5)
precision_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_k = sum(rec for rec in recalls.values()) / len(recalls)
print(f'Precision@k: {precision_k}')
print(f'Recall@k: {recall_k}')

# Calculate NDCG at k
ndcg_k = ndcg_at_k(val_predictions, k=5)
print(f'NDCG@k: {ndcg_k}')


MAE: 3.5623388290405273
Precision@k: 1.0
Recall@k: 0.4045947264463006
NDCG@k: 1.0


In [212]:
investor_id = 1  # The ID of the investor you want to generate recommendations for
k = 10  # The number of recommendations to generate

recommendations = recommend(investor_id, model, n_items)

print(f'Recommendations for investor {investor_id}: {recommendations}')


Recommendations for investor 1: tensor([67427, 78411, 62557, 27624, 63444, 99534, 85061, 66565, 72832, 57917],
       device='cuda:0')


In [213]:
recommended_companies = company_df[company_df['CompanyID'].isin(recommendations.cpu().numpy())]

print(recommended_companies)


       CompanyID    Company  Size  Profit Industry  itemId
27623      27624  SDSLETSJF   168      64  Finance   27623
57916      57917    TFWDPWI   247      28   Health   57916
62556      62557    AGRAAOE   179      12     Tech   62556
63443      63444   SYYHAUZJ   454      61     Tech   63443
66564      66565     IYHLHC    77      68     Tech   66564
67426      67427      TSLSJ   227      85     Tech   67426
72831      72832      LIGJK   167      63     Tech   72831
78410      78411  BNLBHYOXU   169      52     Tech   78410
85060      85061   VIDWWYXS   341      70   Health   85060
99533      99534      PCEGY   497      92     Tech   99533


In [214]:
# Save the model
torch.save(model.state_dict(), './newModel.pth')


In [215]:
# Load the model
model = CFModel(n_users, n_items)  # Initialize the model
model.load_state_dict(torch.load('model.pth'))  # Load the parameters
model.to(device)  # Move the model to the device


CFModel(
  (user_factors): Embedding(5000, 20)
  (item_factors): Embedding(100000, 20)
)

In [216]:
investor_id = 6  # The ID of the investor you want to generate recommendations for
k = 7  # The number of recommendations to generate

recommendations = recommend(investor_id, model, n_items)

print(f'Recommendations for investor {investor_id}: {recommendations}')

Recommendations for investor 6: tensor([44821, 75142, 98996, 30398, 95685, 84725, 23048], device='cuda:0')


In [217]:
recommended_companies = company_df[company_df['CompanyID'].isin(recommendations.cpu().numpy())]

print(recommended_companies)

       CompanyID    Company  Size  Profit Industry  itemId
23047      23048  AKLFVOXIT   424      86     Tech   23047
30397      30398   TBJDAGXL   329      80   Health   30397
44820      44821   JLLFKOKU   274      99   Health   44820
75141      75142    MODHOFB   134      31  Finance   75141
84724      84725  JYEOESVCP   407      61   Health   84724
95684      95685      FXKYE   199      47   Health   95684
98995      98996   YXCHDDVV   186      90   Health   98995


In [218]:
newUserID= int(input("Enter the User ID: "))
compsNeeded = int(input("Enter the number of companies needed: "))

In [219]:
# Load the model
model = CFModel(n_users, n_items)  # Initialize the model
model.load_state_dict(torch.load('newModel.pth'))  # Load the parameters
model.to(device)  # Move the model to the device

CFModel(
  (user_factors): Embedding(5000, 20)
  (item_factors): Embedding(100000, 20)
)

In [220]:
investor_id = newUserID # The ID of the investor you want to generate recommendations for
k = compsNeeded  # The number of recommendations to generate

recommendations = recommend(investor_id, model, n_items)

print(f'Recommendations for investor {investor_id}: {recommendations}')

Recommendations for investor 1: tensor([67427, 78411, 62557, 27624, 63444, 99534, 85061, 66565, 72832, 57917],
       device='cuda:0')


In [221]:
recommended_companies = company_df[company_df['CompanyID'].isin(recommendations.cpu().numpy())]

print(recommended_companies)

       CompanyID    Company  Size  Profit Industry  itemId
27623      27624  SDSLETSJF   168      64  Finance   27623
57916      57917    TFWDPWI   247      28   Health   57916
62556      62557    AGRAAOE   179      12     Tech   62556
63443      63444   SYYHAUZJ   454      61     Tech   63443
66564      66565     IYHLHC    77      68     Tech   66564
67426      67427      TSLSJ   227      85     Tech   67426
72831      72832      LIGJK   167      63     Tech   72831
78410      78411  BNLBHYOXU   169      52     Tech   78410
85060      85061   VIDWWYXS   341      70   Health   85060
99533      99534      PCEGY   497      92     Tech   99533
