In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

Load the data of user interaction with pratilipi 

In [12]:

user_interactions = pd.read_csv("user_interaction.csv")
user_interactions.info()
user_interactions.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 4 columns):
 #   Column        Dtype  
---  ------        -----  
 0   user_id       int64  
 1   pratilipi_id  int64  
 2   read_percent  float64
 3   updated_at    object 
dtypes: float64(1), int64(2), object(1)
memory usage: 76.3+ MB


Unnamed: 0,user_id,pratilipi_id,read_percent
count,2500000.0,2500000.0,2500000.0
mean,5489174000000000.0,1369444000000000.0,93.24295
std,160670500000000.0,122175600000000.0,21.70149
min,3257553000000000.0,-5375940000000000.0,0.0
25%,5506792000000000.0,1377786000000000.0,100.0
50%,5506792000000000.0,1377786000000000.0,100.0
75%,5506792000000000.0,1377786000000000.0,100.0
max,5506792000000000.0,1377786000000000.0,2400.0


Load pratilipi data


In [13]:
metadata = pd.read_csv("metadata.csv")
metadata.info()
metadata.describe()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954501 entries, 0 to 954500
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   author_id      954501 non-null  int64 
 1   pratilipi_id   954501 non-null  int64 
 2   category_name  954501 non-null  object
 3   reading_time   954501 non-null  int64 
 4   updated_at     954501 non-null  object
 5   published_at   954494 non-null  object
dtypes: int64(3), object(3)
memory usage: 43.7+ MB


Unnamed: 0,author_id,pratilipi_id,reading_time
count,954501.0,954501.0,954501.0
mean,-2379597000000000.0,1368571000000000.0,351.979334
std,392202000000000.0,116110500000000.0,513.959547
min,-9070332000000000.0,-873461100000000.0,0.0
25%,-2270332000000000.0,1377786000000000.0,111.0
50%,-2270332000000000.0,1377786000000000.0,256.0
75%,-2270332000000000.0,1377786000000000.0,461.0
max,-2270332000000000.0,1377786000000000.0,78983.0


Left join the both the data  

In [14]:


df = user_interactions.merge(metadata, on="pratilipi_id", how="left")

df

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at_x,author_id,category_name,reading_time,updated_at_y,published_at
0,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,-2.270332e+15,novels,376.0,2022-03-15 18:39:52,2022-03-15 18:39:52
1,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,-2.270332e+15,family,376.0,2022-03-15 18:39:52,2022-03-15 18:39:52
2,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,-2.270332e+15,romance,376.0,2022-03-15 18:39:52,2022-03-15 18:39:52
3,5506791971543560,1377786223038206,40.0,2022-03-19 13:49:25.660,-2.270332e+15,romance,361.0,2021-03-17 11:48:11,2021-03-16 19:09:19
4,5506791971543560,1377786223038206,40.0,2022-03-19 13:49:25.660,-2.270332e+15,suspense,361.0,2021-03-17 11:48:11,2021-03-16 19:09:19
...,...,...,...,...,...,...,...,...,...
4966616,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,-2.270332e+15,novels,560.0,2021-12-08 15:08:01,2021-12-08 15:08:00
4966617,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,-2.270332e+15,romance,560.0,2021-12-08 15:08:01,2021-12-08 15:08:00
4966618,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,-2.270332e+15,suspense,560.0,2021-12-08 15:08:01,2021-12-08 15:08:00
4966619,5506791956021363,1377786226666757,100.0,2022-03-20 08:59:49.346,-2.270332e+15,novels,727.0,2021-12-03 18:10:45,2021-12-03 18:10:45


Convert timestamps to datetime


In [15]:
df["updated_at_x"] = pd.to_datetime(df["updated_at_x"])
df["updated_at_y"] = pd.to_datetime(df["updated_at_y"])
df["published_at"] = pd.to_datetime(df["published_at"])
print(df.head())

            user_id      pratilipi_id  read_percent            updated_at_x  \
0  5506791961876448  1377786228262109         100.0 2022-03-22 10:29:57.291   
1  5506791961876448  1377786228262109         100.0 2022-03-22 10:29:57.291   
2  5506791961876448  1377786228262109         100.0 2022-03-22 10:29:57.291   
3  5506791971543560  1377786223038206          40.0 2022-03-19 13:49:25.660   
4  5506791971543560  1377786223038206          40.0 2022-03-19 13:49:25.660   

      author_id category_name  reading_time        updated_at_y  \
0 -2.270332e+15        novels         376.0 2022-03-15 18:39:52   
1 -2.270332e+15        family         376.0 2022-03-15 18:39:52   
2 -2.270332e+15       romance         376.0 2022-03-15 18:39:52   
3 -2.270332e+15       romance         361.0 2021-03-17 11:48:11   
4 -2.270332e+15      suspense         361.0 2021-03-17 11:48:11   

         published_at  
0 2022-03-15 18:39:52  
1 2022-03-15 18:39:52  
2 2022-03-15 18:39:52  
3 2021-03-16 19:09:19  
4 

Aggregate multiple interactions by taking the maximum read percentage per user_id and pratilipi_id.


In [16]:
df_agg = df.groupby(["user_id", "pratilipi_id"])["read_percent"].max().reset_index()
df_agg

Unnamed: 0,user_id,pratilipi_id,read_percent
0,3257552805995172,1377786216957646,100.000000
1,3257552805995172,1377786220826675,100.000000
2,3257552805995172,1377786226782638,100.000000
3,3257552805995172,1377786227056508,100.000000
4,3257552805995172,1377786227250750,10.000000
...,...,...,...
2499995,5506791996685224,1377786215645840,43.552190
2499996,5506791996685251,1377786216362064,100.000000
2499997,5506791996685282,1377786222782765,100.000000
2499998,5506791996685286,1377786216009820,100.000000


Map user_id to a continuous index space


In [17]:

user_mapping = {user_id: idx for idx, user_id in enumerate(df_agg["user_id"].unique())}
user_mapping


{np.int64(3257552805995172): 0,
 np.int64(3257621147984548): 1,
 np.int64(3260243929637540): 2,
 np.int64(3260275089121956): 3,
 np.int64(3260433621754532): 4,
 np.int64(3263710062617252): 5,
 np.int64(3263998672675492): 6,
 np.int64(3264239159386788): 7,
 np.int64(3264294598124196): 8,
 np.int64(3264333346677412): 9,
 np.int64(3264359729373860): 10,
 np.int64(3264598431408804): 11,
 np.int64(3264802291360420): 12,
 np.int64(3265121492050596): 13,
 np.int64(3265199567446692): 14,
 np.int64(3265722787472036): 15,
 np.int64(3265725155680932): 16,
 np.int64(3267248272614052): 17,
 np.int64(3267517031031460): 18,
 np.int64(3267762257830564): 19,
 np.int64(3270953179456164): 20,
 np.int64(3271355698422436): 21,
 np.int64(3271758024450724): 22,
 np.int64(3272697963782820): 23,
 np.int64(3273849339552420): 24,
 np.int64(3273929965085348): 25,
 np.int64(3273968497631908): 26,
 np.int64(3274726695674532): 27,
 np.int64(3275083941323428): 28,
 np.int64(3275412156621476): 29,
 np.int64(3275822103

Map user_id to a continuous index space

In [18]:
pratilipi_mapping = {pratilipi_id: idx for idx, pratilipi_id in enumerate(df_agg["pratilipi_id"].unique())}
pratilipi_mapping

{np.int64(1377786216957646): 0,
 np.int64(1377786220826675): 1,
 np.int64(1377786226782638): 2,
 np.int64(1377786227056508): 3,
 np.int64(1377786227250750): 4,
 np.int64(1377786228209398): 5,
 np.int64(1377786224603341): 6,
 np.int64(1377786225023927): 7,
 np.int64(1377786221978573): 8,
 np.int64(1377786216736833): 9,
 np.int64(1377786225146500): 10,
 np.int64(1377786225224696): 11,
 np.int64(1377786225289397): 12,
 np.int64(1377786225314175): 13,
 np.int64(1377786225328747): 14,
 np.int64(1377786225517713): 15,
 np.int64(1377786225572728): 16,
 np.int64(1377786226415513): 17,
 np.int64(1377786226463441): 18,
 np.int64(1377786226673951): 19,
 np.int64(1377786226703010): 20,
 np.int64(1377786226755498): 21,
 np.int64(1377786226790484): 22,
 np.int64(1377786227180759): 23,
 np.int64(1377786227541370): 24,
 np.int64(1377786227978382): 25,
 np.int64(1377786227993851): 26,
 np.int64(1377786228007383): 27,
 np.int64(1377786228059012): 28,
 np.int64(1377786228121197): 29,
 np.int64(1377786228

Map the user_id column to a corresponding index using user_mapping and store it in a new column user_idx

In [19]:
df_agg["user_idx"] = df_agg["user_id"].map(user_mapping)
df_agg

Unnamed: 0,user_id,pratilipi_id,read_percent,user_idx
0,3257552805995172,1377786216957646,100.000000,0
1,3257552805995172,1377786220826675,100.000000,0
2,3257552805995172,1377786226782638,100.000000,0
3,3257552805995172,1377786227056508,100.000000,0
4,3257552805995172,1377786227250750,10.000000,0
...,...,...,...,...
2499995,5506791996685224,1377786215645840,43.552190,243601
2499996,5506791996685251,1377786216362064,100.000000,243602
2499997,5506791996685282,1377786222782765,100.000000,243603
2499998,5506791996685286,1377786216009820,100.000000,243604


Map the pratilipi_id column to a corresponding index using user_mapping and store it in a new column pratilipi_idx

In [20]:
df_agg["pratilipi_idx"] = df_agg["pratilipi_id"].map(pratilipi_mapping)
df_agg

Unnamed: 0,user_id,pratilipi_id,read_percent,user_idx,pratilipi_idx
0,3257552805995172,1377786216957646,100.000000,0,0
1,3257552805995172,1377786220826675,100.000000,0,1
2,3257552805995172,1377786226782638,100.000000,0,2
3,3257552805995172,1377786227056508,100.000000,0,3
4,3257552805995172,1377786227250750,10.000000,0,4
...,...,...,...,...,...
2499995,5506791996685224,1377786215645840,43.552190,243601,36485
2499996,5506791996685251,1377786216362064,100.000000,243602,24617
2499997,5506791996685282,1377786222782765,100.000000,243603,85982
2499998,5506791996685286,1377786216009820,100.000000,243604,12530


Create a CSR sparse matrix with "read_percent" as values, indexed by "user_idx" and "pratilipi_idx"

In [21]:
sparse_matrix = csr_matrix((df_agg["read_percent"], (df_agg["user_idx"], df_agg["pratilipi_idx"])))

Split the data into train and test data in 75-25 percent

In [22]:

train, test = train_test_split(df, test_size=0.25, random_state=42)

Train an ALS model with 50 factors, 0.1 regularization, and 20 iterations on the sparse matrix

In [23]:
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
model.fit(sparse_matrix)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

Create user & pratilipi mappings for matrix indexing

In [24]:

user_mapping = {user: idx for idx, user in enumerate(df_agg["user_id"].unique())}






In [25]:
reverse_user_mapping = {idx: user for user, idx in user_mapping.items()}


In [26]:
pratilipi_mapping = {pratilipi: idx for idx, pratilipi in enumerate(df_agg["pratilipi_id"].unique())}


In [27]:
reverse_pratilipi_mapping = {idx: pratilipi for pratilipi, idx in pratilipi_mapping.items()}
reverse_pratilipi_mapping

{0: np.int64(1377786216957646),
 1: np.int64(1377786220826675),
 2: np.int64(1377786226782638),
 3: np.int64(1377786227056508),
 4: np.int64(1377786227250750),
 5: np.int64(1377786228209398),
 6: np.int64(1377786224603341),
 7: np.int64(1377786225023927),
 8: np.int64(1377786221978573),
 9: np.int64(1377786216736833),
 10: np.int64(1377786225146500),
 11: np.int64(1377786225224696),
 12: np.int64(1377786225289397),
 13: np.int64(1377786225314175),
 14: np.int64(1377786225328747),
 15: np.int64(1377786225517713),
 16: np.int64(1377786225572728),
 17: np.int64(1377786226415513),
 18: np.int64(1377786226463441),
 19: np.int64(1377786226673951),
 20: np.int64(1377786226703010),
 21: np.int64(1377786226755498),
 22: np.int64(1377786226790484),
 23: np.int64(1377786227180759),
 24: np.int64(1377786227541370),
 25: np.int64(1377786227978382),
 26: np.int64(1377786227993851),
 27: np.int64(1377786228007383),
 28: np.int64(1377786228059012),
 29: np.int64(1377786228121197),
 30: np.int64(137778

Add index columns to DataFrame

In [28]:

df_agg["user_idx"] = df_agg["user_id"].map(user_mapping)
df_agg["pratilipi_idx"] = df_agg["pratilipi_id"].map(pratilipi_mapping)

Convert to sparse matrix (User-Item Matrix)


In [29]:

sparse_user_item = csr_matrix((df_agg["read_percent"], (df_agg["user_idx"], df_agg["pratilipi_idx"])))
sparse_user_item


<243606x241405 sparse matrix of type '<class 'numpy.float64'>'
	with 2500000 stored elements in Compressed Sparse Row format>

Initialize ALS model with 50 latent factors, 0.1 regularization, and 20 iterations and train it using transposed user-item matrix

In [30]:

model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
model.fit(sparse_user_item.T) 





  0%|          | 0/20 [00:00<?, ?it/s]

ALS-based recommendation function  

In [31]:
def recommend_als(user_id, n=5):
    if user_id not in user_mapping:
        print(f"User {user_id} not found in training data.")
        return []

    user_idx = user_mapping[user_id]  # Convert user_id to matrix index
    user_row = sparse_user_item[user_idx]  # Extract the specific user vector

   
    scores = model.recommend(user_idx, user_row, N=n)

    #Convert float index to int before lookup
    recommended_pratilipis = [reverse_pratilipi_mapping[int(item[0])] for item in scores]

    return recommended_pratilipis


Get a valid user ID from dataset and generate, print ALS-based recommendations for the selected user.

In [32]:

user_id = df_agg["user_id"].sample(n=1, random_state=45).iloc[0]
print("ALS Recommendations:", recommend_als(user_id))

ALS Recommendations: [np.int64(1377786223862879), np.int64(1377786216957646)]


extract text-based features from the dataset and converts them into a numerical format using TF-IDF.

In [33]:
df["combined_features"] = df["category_name"].fillna("") + " " + df["author_id"].astype(str)

Converts text features into a numerical TF-IDF matrix for content-based analysis.

In [34]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["combined_features"])
tfidf_matrix

<4966621x9738 sparse matrix of type '<class 'numpy.float64'>'
	with 9605664 stored elements in Compressed Sparse Row format>

Nearest Neighbors for efficient similarity search


In [35]:

nn = NearestNeighbors(metric="cosine", algorithm="brute")
nn.fit(tfidf_matrix)
nn


Recommends top-N pratilipis similar to the given pratilipi using content-based filtering (TF-IDF + Nearest Neighbors).

In [36]:
def recommend_content_based(pratilipi_id, n=5):
    if pratilipi_id not in df["pratilipi_id"].values:
        return []
    
    idx = df[df["pratilipi_id"] == pratilipi_id].index[0]
    
    # Find top `n` similar pratilipis
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    
    recommended_pratilipis = df.iloc[indices.flatten()[1:]]["pratilipi_id"].tolist()
    
    return recommended_pratilipis

Combines ALS-based and content-based recommendations to provide a hybrid recommendation, balancing user preferences and similar pratilipis.

In [37]:
def hybrid_recommend(user_id, n=5):
    als_recs = recommend_als(user_id, n=3)
    
    content_recs = []
    for pratilipi in als_recs:
        content_recs.extend(recommend_content_based(pratilipi, n=5))
    
    final_recs = list(set(als_recs + content_recs))[:n]
    return final_recs

In [38]:
print("Hybrid Recommendations:", [int(pratilipi) for pratilipi in hybrid_recommend(user_id)])


Hybrid Recommendations: [1377786227377664, 1377786227831557, 1377786224270140, 1377786224236773, 1377786216957646]
