In [2]:
#Importing Libraries
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import csv
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

In [3]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## User Embedding

In [None]:
#Load the cleaned user data
user_clean_data_file_path = "/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/user_cleaned_data.csv"

df_user = pd.read_csv(user_clean_data_file_path)

#preview the head data
df_user.head(5)

Unnamed: 0,deviceid,platform,os_version,networkType,lastknownsubadminarea,language_selected,created_datetime,last_active_at
0,197b123e-eb9e-4fc1-a32d-aa86aaea425e,ANDROID,13.0,4G,Unknown_Location,en,2023-07-11T13:40:05.511Z,2023-07-11T13:40:02.000Z
1,3c33c537-7c6c-40f5-835c-f997e883cae2,ANDROID,13.0,4G,Unknown_Location,en,2023-07-11T15:36:58.363Z,2023-07-11T16:24:44.000Z
2,6c7be5d0-d4d8-469f-91be-8055021ceef9,ANDROID,12.0,NO INTERNET,Unknown_Location,en,2023-07-11T11:25:39.375Z,2023-07-11T10:13:18.000Z
3,0801af66-0a6f-4fdd-82a9-c2b15757b8f5,ANDROID,7.0,NO INTERNET,Unknown_Location,en,2023-04-30T00:24:48.987Z,2023-07-10T11:53:21.000Z
4,78b3c7a7-5881-42dc-9f8e-b4fc27f94360,ANDROID,13.0,4G,Unknown_Location,en,2023-04-15T05:48:46.923Z,2023-07-20T15:37:47.000Z


In [None]:
df_user.shape

(10400, 8)

In [None]:
df_user["platform"].value_counts()

Unnamed: 0_level_0,count
platform,Unnamed: 1_level_1
ANDROID,10400


In [None]:
df_user["os_version"].value_counts()

Unnamed: 0_level_0,count
os_version,Unnamed: 1_level_1
13,3816
12,2494
11,1918
10,1166
9,529
8.1.0,259
7.0,52
8.0.0,36
7.1.1,29
7.1.2,29


In [None]:
df_user["networkType"].value_counts()

Unnamed: 0_level_0,count
networkType,Unnamed: 1_level_1
4G,6669
WIFI,3489
NO INTERNET,172
3G,57
2G,13


In [None]:
df_user["language_selected"].value_counts()

Unnamed: 0_level_0,count
language_selected,Unnamed: 1_level_1
en,10400


In [None]:
df_user["lastknownsubadminarea"].value_counts()

Unnamed: 0_level_0,count
lastknownsubadminarea,Unnamed: 1_level_1
Unknown_Location,908
Mumbai,651
Delhi,374
Bengaluru,367
Noida,268
...,...
Orillia,1
Cupertino,1
Keelakarai,1
Munnar,1


In [None]:
#single value in these column adding no variance
df_user = df_user.drop(columns=['platform'])

In [None]:
categorical_cols = ['lastknownsubadminarea', 'os_version', 'networkType','language_selected']

for col in categorical_cols:
    df_user[col + '_idx'] = df_user[col].astype('category').cat.codes

embedding_dims = {
    'lastknownsubadminarea': 16,
    'os_version': 4,
    'networkType': 4,
    'language_selected': 4
}

embeddings = {}
for col in categorical_cols:
    num_unique = df_user[col + '_idx'].nunique()
    embeddings[col] = nn.Embedding(num_unique, embedding_dims[col])


In [None]:
# Convert indices to tensors
area_idx = torch.tensor(df_user['lastknownsubadminarea_idx'].values, dtype=torch.long)
os_idx = torch.tensor(df_user['os_version_idx'].values, dtype=torch.long)
net_idx = torch.tensor(df_user['networkType_idx'].values, dtype=torch.long)
ln_idx = torch.tensor(df_user['language_selected_idx'].values, dtype=torch.long)

# Get embeddings
area_emb = embeddings['lastknownsubadminarea'](area_idx)
os_emb = embeddings['os_version'](os_idx)
net_emb = embeddings['networkType'](net_idx)
ln_emb = embeddings['language_selected'](ln_idx)


# Concatenate along feature dimension
user_embed = torch.cat([area_emb, os_emb, net_emb, ln_emb], dim=1)
print(user_embed.shape)  # [num_rows, sum_of_embedding_dims]

torch.Size([10400, 28])


In [None]:
from datetime import datetime
import pytz
import pandas as pd

# Ensure these columns are datetime objects
df_user['created_datetime'] = pd.to_datetime(df_user['created_datetime'])
df_user['last_active_at'] = pd.to_datetime(df_user['last_active_at'])

# Make datetime.now() timezone-aware
now_utc = datetime.now(pytz.utc)

# Calculate days since signup and last active
df_user['days_since_signup'] = (now_utc - df_user['created_datetime']).dt.days
df_user['days_since_last_active'] = (now_utc - df_user['last_active_at']).dt.days

In [None]:
num_cols = ['days_since_signup', 'days_since_last_active']

scaler = StandardScaler()
num_tensor = torch.tensor(scaler.fit_transform(df_user[num_cols]), dtype=torch.float)

In [None]:
user_embed_vec = torch.cat([user_embed, num_tensor], dim=1)

In [None]:
device_ids = df_user['deviceid'].values

# Assume user_embed_vec is a torch tensor of shape [num_users, embed_dim]
user_embed_np = user_embed_vec.detach().cpu().numpy()  # convert to numpy

# Option A: Keep as numpy arrays in-memory (recommended)
user_embed_df = pd.DataFrame({
    'deviceId': device_ids,
    'user_embed': list(user_embed_np)  # each cell is a np.array
})

# Save with pickle (preserves arrays perfectly)
user_embed_df.to_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/user_embed_df.pkl")

print(user_embed_df.head())

                               deviceId  \
0  197b123e-eb9e-4fc1-a32d-aa86aaea425e   
1  3c33c537-7c6c-40f5-835c-f997e883cae2   
2  6c7be5d0-d4d8-469f-91be-8055021ceef9   
3  0801af66-0a6f-4fdd-82a9-c2b15757b8f5   
4  78b3c7a7-5881-42dc-9f8e-b4fc27f94360   

                                          user_embed  
0  [0.778085, -0.46171418, -1.0421835, -1.6300508...  
1  [0.778085, -0.46171418, -1.0421835, -1.6300508...  
2  [0.778085, -0.46171418, -1.0421835, -1.6300508...  
3  [0.778085, -0.46171418, -1.0421835, -1.6300508...  
4  [0.778085, -0.46171418, -1.0421835, -1.6300508...  


## News Embedding

In [None]:
#Load the cleaned user data
news_clean_data_file_path = "/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/content_cleaned_data.csv"

df_news = pd.read_csv(news_clean_data_file_path)

#preview the head data
df_news.head(5)

Unnamed: 0,hashid,title,content,newsType,author,categories,createdAt,updatedAt,newsLanguage,sourceName,content_length
0,q6ymyudu-1,The world's most picturesque road trip awaits ...,The world's most picturesque road trip awaits ...,VIDEO_NEWS,5748e05c947ce445479635a0,travel,2023-06-04T08:30:15.000Z,2023-06-04T08:30:15.000Z,english,Unknown_Source,82
1,vowkjbc6-1,"In a viral video, Kathak dancers perform in a ...","In a viral video, Kathak dancers perform in a ...",VIDEO_NEWS,5748e05c947ce445479635a0,entertainment,2023-06-05T15:02:18.000Z,2023-06-05T15:02:18.000Z,english,Unknown_Source,118
2,8bf90wsx-1,Would you like to be a part of the 'no wash' m...,Would you like to be a part of the 'no wash' m...,VIDEO_NEWS,5748e05c947ce445479635a0,hatke,2023-06-02T09:22:13.000Z,2023-06-02T09:22:13.000Z,english,Unknown_Source,91
3,j0p0fukx-1,Switzerland's legendary Bernina Express turns 50,Switzerland's legendary Bernina Express turns 50,VIDEO_NEWS,5748e05c947ce445479635a0,travel,2023-06-04T13:24:40.000Z,2023-06-04T13:24:40.000Z,english,Unknown_Source,48
4,naqusao7-1,"Explored by many, claimed by none, this is Bir...","Explored by many, claimed by none, this is Bir...",VIDEO_NEWS,5748e05c947ce445479635a0,"hatke,travel",2023-06-04T13:20:20.000Z,2023-06-04T13:20:20.000Z,english,Unknown_Source,68


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Combine title + content
texts = (df_news['title'] + " " + df_news['content']).tolist()

# Encode all at once
text_embeds = model.encode(texts, convert_to_tensor=True, show_progress_bar=True)
print(text_embeds.shape)  # [num_news, 384] for MiniLM

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/457 [00:00<?, ?it/s]

torch.Size([14621, 384])


In [None]:
categorical_cols = ['newsType', 'author', 'categories', 'newsLanguage', 'sourceName']
embedding_dims = {
    'newsType': 4,
    'author': 8,
    'categories': 4,
    'newsLanguage': 2,
    'sourceName': 4
}

# Convert to integer indices
for col in categorical_cols:
    df_news[col + '_idx'] = df_news[col].astype('category').cat.codes

# Create embedding layers
embeddings = {}
for col in categorical_cols:
    num_unique = df_news[col + '_idx'].nunique()
    embeddings[col] = nn.Embedding(num_unique, embedding_dims[col])

# Lookup embeddings
cat_embeds = []
for col in categorical_cols:
    idx_tensor = torch.tensor(df_news[col + '_idx'].values, dtype=torch.long)
    cat_embeds.append(embeddings[col](idx_tensor))

cat_embeds = torch.cat(cat_embeds, dim=1)  # [num_news, sum_of_embedding_dims]

In [None]:
numerical_cols = ['content_length']
scaler = StandardScaler()
num_values = torch.tensor(scaler.fit_transform(df_news[numerical_cols]), dtype=torch.float)

In [None]:
# text_embeds: [num_news, 384]
# cat_embeds: [num_news, sum_embedding_dims]
# num_values: [num_news, num_numerical_features]

# Move cat_embeds and num_values to the same device as text_embeds
cat_embeds = cat_embeds.to(text_embeds.device)
num_values = num_values.to(text_embeds.device)

news_embed_vec = torch.cat([text_embeds, cat_embeds, num_values], dim=1)
print(news_embed_vec.shape)  # [num_news, total_embedding_dim]

torch.Size([14621, 407])


In [None]:
news_ids = df_news['hashid'].values

# Assume news_embed_vec is torch tensor [num_news, embed_dim]
news_embed_np = news_embed_vec.detach().cpu().numpy()

# Option A: keep as arrays in-memory
news_embed_df = pd.DataFrame({
    'hashId': news_ids,
    'news_embed': list(news_embed_np)
})
news_embed_df.to_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/news_embed_df.pkl")
print(news_embed_df.head())

       hashId                                         news_embed
0  q6ymyudu-1  [0.08181206, 0.06889709, 0.056240518, 0.055947...
1  vowkjbc6-1  [-0.011207085, -0.011110251, -0.010274678, -0....
2  8bf90wsx-1  [-0.01562647, 0.010021982, 0.070645474, 0.0643...
3  j0p0fukx-1  [0.04110528, 0.044924103, -0.0073990463, 0.056...
4  naqusao7-1  [0.02475829, 0.00413666, 0.019733615, -0.02729...


## Creating Label

In [None]:
#Load the cleaned interaction data
interaction_clean_data_file_path = "/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/interaction_cleaned_data.csv"

df_interaction = pd.read_csv(interaction_clean_data_file_path)

#preview the head data
df_interaction.head(5)

  df_interaction = pd.read_csv(interaction_clean_data_file_path)


Unnamed: 0,deviceId,event_type,eventTimestamp,hashId,categoryWhenEventHappened,cardViewPosition,overallTimeSpent,searchTerm,relevancy_color,relevancy_topic,state,locality,district,relevancy_score
0,7cb933f6-0a5b-477e-bb5d-42eb4b771970,TimeSpent-Front,1689189015000,2k4lruyx-1,My Feed,13.0,2.165,,UNKNOWN,,Unknown_Location,Unknown_Location,Unknown_Location,0
1,a1bfc2e4-c03e-4c1b-8abb-215808a89e6a,TimeSpent-Front,1689182865000,2k4lruyx-1,My Feed,17.0,3.306,,UNKNOWN,,Unknown_Location,Unknown_Location,Unknown_Location,0
2,1c53a149-303d-486e-ac62-0b9c9e469cda,TimeSpent-Front,1689180216000,2k4lruyx-1,My Feed,17.0,4.695,,UNKNOWN,,Unknown_Location,Unknown_Location,Unknown_Location,0
3,fea7a467-551c-4b64-ad48-eae0693635f3,TimeSpent-Front,1689173651000,2kk4ydgg-1,My Feed,25.0,4.238,,UNKNOWN,,Unknown_Location,Unknown_Location,Unknown_Location,0
4,1ef62ab0-e5da-4cb4-9839-49ac52f04ceb,TimeSpent-Front,1689174004000,2kk4ydgg-1,My Feed,12.0,0.931,,UNKNOWN,,Unknown_Location,Unknown_Location,Unknown_Location,0


In [None]:
interactions_df = df_interaction[['deviceId', 'hashId', 'overallTimeSpent', 'event_type']]

In [None]:
interactions_df['label'] = np.where(interactions_df['overallTimeSpent'] > 2, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df['label'] = np.where(interactions_df['overallTimeSpent'] > 2, 1, 0)


In [None]:
interactions_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,2281718
0,1262436


In [None]:
df_user = pd.read_pickle('/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/user_embed_df.pkl')
df_news = pd.read_pickle('/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/news_embed_df.pkl')

In [None]:
# ensure uniqueness
user_embed_df = df_user.drop_duplicates(subset=['deviceId'])
news_embed_df = df_news.drop_duplicates(subset=['hashId'])

# merge both embeddings into interaction df
final_df = (
    interactions_df
    .merge(user_embed_df, on='deviceId', how='inner')
    .merge(news_embed_df, on='hashId', how='inner')
    [['deviceId', 'hashId', 'user_embed', 'news_embed', 'label']]
)

In [None]:
final_df.shape

(103956, 5)

In [None]:
final_df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,64374
0,39582


In [None]:
final_df.to_pickle('/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/user_news_embed_data.pkl')

## Test News Embedding

In [None]:
#Load the cleaned test news data
test_news_clean_data_file_path = "/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/test_content_cleaned_data.csv"

df_test_news = pd.read_csv(test_news_clean_data_file_path)

#preview the head data
df_test_news.head(5)

Unnamed: 0,hashid,title,content,newsType,author,categories,createdAt,updatedAt,newsLanguage,sourceName
0,zdw0jrig-1,Redmi 12 5G will be a game-changer for 5G conn...,Xiaomi will debut Redmi 12 5G alongside Redmi ...,NEWS,593f9d1f81ef171ab3b63a2d,technology,2023-07-27T07:06:41.000Z,2023-07-27T07:06:41.000Z,english,Xiaomi
1,y5pfnbmp-1,Limited seats left for Hero Vired & MIT’s Prog...,Hero Group's EdTech company Hero Vired & MIT l...,NEWS,593f9d1f81ef171ab3b63a2d,education,2023-07-27T04:30:50.000Z,2023-07-27T04:30:50.000Z,english,vired.com
2,eo2eyhgk-1,Heavy to very heavy rainfall warning issued fo...,IMD has issued heavy to very heavy rainfall wa...,NEWS,5f70de9bd43821580e6d7022,national,2023-07-27T16:23:53.000Z,2023-07-27T17:12:11.285Z,english,IMD
3,fknyydal-1,Which 14 teams have qualified for 20-team T20 ...,Ireland and Scotland have qualified for the 20...,NEWS,5f70de9bd43821580e6d7022,sports,2023-07-27T14:29:59.000Z,2023-07-27T14:29:59.000Z,english,ICC
4,61ogen4w-1,42-year-old woman shot dead near her house in ...,A 42-year-old woman was shot dead near her hou...,NEWS,5f70de9bd43821580e6d7022,national,2023-07-27T17:07:31.000Z,2023-07-27T17:07:31.000Z,english,ABP


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Combine title + content
test_texts = (df_test_news['title'] + " " + df_test_news['content']).tolist()

# Encode all at once
test_text_embeds = model.encode(test_texts, convert_to_tensor=True, show_progress_bar=True)
print(test_text_embeds.shape)  # [num_news, 384] for MiniLM

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/43 [00:00<?, ?it/s]

torch.Size([1346, 384])


In [None]:
categorical_cols = ['newsType', 'author', 'categories', 'newsLanguage', 'sourceName']
embedding_dims = {
    'newsType': 4,
    'author': 8,
    'categories': 4,
    'newsLanguage': 2,
    'sourceName': 4
}

# Convert to integer indices
for col in categorical_cols:
    df_test_news[col + '_idx'] = df_test_news[col].astype('category').cat.codes

# Create embedding layers
embeddings = {}
for col in categorical_cols:
    num_unique = df_test_news[col + '_idx'].nunique()
    embeddings[col] = nn.Embedding(num_unique, embedding_dims[col])

# Lookup embeddings
cat_embeds = []
for col in categorical_cols:
    idx_tensor = torch.tensor(df_test_news[col + '_idx'].values, dtype=torch.long)
    cat_embeds.append(embeddings[col](idx_tensor))

cat_embeds = torch.cat(cat_embeds, dim=1)  # [num_news, sum_of_embedding_dims]

In [None]:
# Add a column 'content_length' = number of characters in the 'content' column
df_test_news["content_length"] = df_test_news["content"].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Verify
print(df_test_news[["content", "content_length"]].head())

                                             content  content_length
0  Xiaomi will debut Redmi 12 5G alongside Redmi ...             341
1  Hero Group's EdTech company Hero Vired & MIT l...             399
2  IMD has issued heavy to very heavy rainfall wa...             397
3  Ireland and Scotland have qualified for the 20...             374
4  A 42-year-old woman was shot dead near her hou...             343


In [None]:
numerical_cols = ['content_length']
scaler = StandardScaler()
num_values = torch.tensor(scaler.fit_transform(df_test_news[numerical_cols]), dtype=torch.float)

In [None]:
# text_embeds: [num_news, 384]
# cat_embeds: [num_news, sum_embedding_dims]
# num_values: [num_news, num_numerical_features]

# Move cat_embeds and num_values to the same device as text_embeds
cat_embeds = cat_embeds.to(test_text_embeds.device)
num_values = num_values.to(test_text_embeds.device)

news_embed_vec = torch.cat([test_text_embeds, cat_embeds,num_values], dim=1)
print(news_embed_vec.shape)  # [num_news, total_embedding_dim]

torch.Size([1346, 407])


In [None]:
test_news_ids = df_test_news['hashid'].values

# Assume news_embed_vec is torch tensor [num_news, embed_dim]
test_news_embed_np = news_embed_vec.detach().cpu().numpy()

# Option A: keep as arrays in-memory
test_news_embed_df = pd.DataFrame({
    'hashId': test_news_ids,
    'news_embed': list(test_news_embed_np)
})
test_news_embed_df.to_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/test_news_embed_df.pkl")
print(test_news_embed_df.head())

       hashId                                         news_embed
0  zdw0jrig-1  [-0.022536261, -0.08971395, 0.04197222, -0.020...
1  y5pfnbmp-1  [-0.023177247, -0.019574672, -0.017613271, -0....
2  eo2eyhgk-1  [0.010921052, -0.026420688, 0.12832528, 0.0065...
3  fknyydal-1  [0.04975502, -0.007751486, 0.037565373, -0.011...
4  61ogen4w-1  [-0.005561994, 0.026721455, -0.063384265, -0.0...


## User History Dataframe

In [4]:
# Load interaction and news embeddings
inter_df = pd.read_csv("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/interaction_cleaned_data.csv")
news_embed_df = pd.read_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/news_embed_df.pkl")  # has ['hashId', 'news_embed']

# Example: assume inter_df and news_embed_df are already loaded and merged
inter_df = inter_df.merge(news_embed_df[['hashId', 'news_embed']], on='hashId', how='inner')

# Create weight column (engagement strength)
inter_df['weight'] = inter_df['overallTimeSpent'].fillna(0) + inter_df['relevancy_score'].fillna(0)

# Normalize weights (avoid division by zero by adding epsilon)
max_wt = inter_df['weight'].max()
inter_df['weight'] = inter_df['weight'] / (max_wt + 1e-8)

# Convert embedding strings to vectors (if needed)
def parse_embed(e):
    if isinstance(e, str):
        return np.array([float(x) for x in e.strip('[]').split()])
    return np.array(e)

inter_df['news_embed'] = inter_df['news_embed'].apply(parse_embed)

# --- FIX: Handle zero-sum weights using Laplace correction ---
def compute_weighted_average(group):
    embeds = np.stack(group['news_embed'].values)
    weights = group['weight'].values

    total_wt = weights.sum()
    alpha = 0.1  # small blending factor
    if total_wt == 0:
      return embeds.mean(axis=0)
    else:
      weighted_mean = np.average(embeds, axis=0, weights=weights)
      simple_mean = embeds.mean(axis=0)
      return (1 - alpha) * weighted_mean + alpha * simple_mean

# Compute user embeddings safely
user_history_embeddings = inter_df.groupby('deviceId').apply(compute_weighted_average)

# Convert to DataFrame
user_history_df = pd.DataFrame({
    'deviceId': user_history_embeddings.index,
    'user_history_embed': list(user_history_embeddings.values)
})

print("✅ User history embeddings created successfully!")
user_history_df.to_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/user_history_df.pkl")
print(user_history_df.head())

  inter_df = pd.read_csv("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/interaction_cleaned_data.csv")


✅ User history embeddings created successfully!
                               deviceId  \
0  0002d448-f398-44d8-8806-df93d1dc770b   
1  000d4df6-dbdd-4438-8456-e23a6f52e1c5   
2  00198103-e45e-4b33-804b-84ff19562d62   
3  002a4793-c451-4916-b6c4-6a31c98cbd7c   
4  002da5aa-573b-441c-941b-df5597f1637e   

                                  user_history_embed  
0  [-0.007339994481359904, 0.01513829932653368, -...  
1  [-0.012232434604715608, 0.013919902768924417, ...  
2  [-0.019985300090691518, 0.018676520094226226, ...  
3  [-0.10931235570460557, 0.10427786465734244, -0...  
4  [-0.01257923560813261, -0.016158550419495293, ...  


  user_history_embeddings = inter_df.groupby('deviceId').apply(compute_weighted_average)


In [5]:
user_history_df.shape

(8977, 2)

## Interaction Matrix

In [6]:
#Load the cleaned interaction data
interaction_clean_data_file_path = "/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/interaction_cleaned_data.csv"

df_interaction = pd.read_csv(interaction_clean_data_file_path)

# Optional: assign weight to event_type
event_weights = {
    'view': 1.0,
    'click': 2.0,
    'share': 3.0,
    'bookmark': 4.0
}

df_interaction['event_weight'] = df_interaction['event_type'].map(event_weights).fillna(1.0)

# Composite score
df_interaction['interaction_score'] = (
    0.6 * df_interaction['overallTimeSpent'] +
    0.2 * df_interaction['event_weight'] +
    0.2 * (1 / (1 + df_interaction['cardViewPosition']))  # lower positions = higher weight
)

print(df_interaction[['deviceId', 'hashId', 'interaction_score']].head())

  df_interaction = pd.read_csv(interaction_clean_data_file_path)


                               deviceId      hashId  interaction_score
0  7cb933f6-0a5b-477e-bb5d-42eb4b771970  2k4lruyx-1           1.513286
1  a1bfc2e4-c03e-4c1b-8abb-215808a89e6a  2k4lruyx-1           2.194711
2  1c53a149-303d-486e-ac62-0b9c9e469cda  2k4lruyx-1           3.028111
3  fea7a467-551c-4b64-ad48-eae0693635f3  2kk4ydgg-1           2.750492
4  1ef62ab0-e5da-4cb4-9839-49ac52f04ceb  2kk4ydgg-1           0.773985


In [7]:
from scipy.sparse import csr_matrix

# Pivot to user–item matrix
interaction_matrix = df_interaction.pivot_table(
    index='deviceId',
    columns='hashId',
    values='interaction_score',
    aggfunc='mean',
    fill_value=0
)

print("Interaction matrix shape:", interaction_matrix.shape)

# Convert to sparse matrix for efficient storage
interaction_sparse = csr_matrix(interaction_matrix.values)

#To prevent outliers (e.g., users who spend 1000 seconds on an article) from dominating
interaction_matrix = np.log1p(interaction_matrix)  # log(1 + x) scaling

interaction_matrix.to_pickle("/content/drive/MyDrive/Companies Interview Projects/InShorts/cleaned_data/interaction_matrix.pkl")

Interaction matrix shape: (8977, 14622)
