In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
# Load the raw ratings table from Google Drive.
# ISBN is an identifier, not a number: values carry leading zeros and may end
# in 'X', so it is read explicitly as a string instead of relying on pandas'
# per-chunk type inference (which can otherwise yield mixed int/str values).
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Book Recommendation/Ratings.csv',
    dtype={'ISBN': str},
)

## Remove the entries with zero rating

In [3]:
# Keep only the rows whose 'Book-Rating' is non-zero and renumber them from 0.
df1 = df.loc[df['Book-Rating'].ne(0)].reset_index(drop=True)

In [4]:
# Display the filtered ratings (non-zero 'Book-Rating' rows only).
df1

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276726,0155061224,5
1,276729,052165615X,3
2,276729,0521795028,6
3,276736,3257224281,8
4,276737,0600570967,6
...,...,...,...
433666,276704,0806917695,5
433667,276704,1563526298,9
433668,276709,0515107662,10
433669,276721,0590442449,10


## Sample 20% of the data

In [5]:
import random  # NOTE(review): not used in the cells visible here; confirm before removing

# Draw a reproducible (random_state=42) 20% row sample of df1 and renumber it from 0.
df_choice = df1.sample(frac=0.2, random_state=42, axis=0).reset_index(drop=True)

In [6]:
# Display the 20% sample drawn from df1.
df_choice

Unnamed: 0,User-ID,ISBN,Book-Rating
0,32440,0380018179,8
1,49144,8422641127,5
2,156424,3442453844,7
3,7125,0060586125,8
4,251606,0316789844,8
...,...,...,...
86729,250770,03991468570,10
86730,25673,0393028747,10
86731,19573,0446679593,8
86732,157252,0440184053,10


## We cannot predict ratings for users who have no ratings in the training data. So we filter the sampled dataframe above to keep only those users who still have at least one rating in the original dataset once the sampled rows are excluded

In [7]:
# Per-user number of ratings inside the sample: one row per 'User-ID',
# with the count stored in 'Num_ratings'.
df_choice_valuecounts = (
    df_choice.groupby('User-ID')['Book-Rating']
    .count()
    .reset_index(name='Num_ratings')
)
df_choice_valuecounts

Unnamed: 0,User-ID,Num_ratings
0,17,1
1,32,1
2,42,1
3,44,1
4,56,1
...,...,...
29423,278828,1
29424,278843,6
29425,278844,2
29426,278851,5


**The following dataframe counts the number of ratings per user in the full dataset (zero ratings already removed)**

In [8]:
# Per-user number of ratings in df1: one row per 'User-ID',
# with the count stored in 'Num_ratings'.
df1_valuecounts = (
    df1.groupby('User-ID')['Book-Rating']
    .count()
    .reset_index(name='Num_ratings')
)
df1_valuecounts

Unnamed: 0,User-ID,Num_ratings
0,8,7
1,9,1
2,10,1
3,12,1
4,14,3
...,...,...
77800,278846,1
77801,278849,1
77802,278851,14
77803,278852,1


**The following restricts the per-user counts from the full dataset to the users that appear in the sampled data**

In [9]:
# Restrict the df1 per-user counts to the users that also occur in the sample,
# then renumber the rows from 0.
df1_valuecounts_subset = (
    df1_valuecounts
    .loc[df1_valuecounts['User-ID'].isin(df_choice_valuecounts['User-ID'])]
    .reset_index(drop=True)
)

In [10]:
# Display the df1 rating counts for the users present in the sample.
df1_valuecounts_subset

Unnamed: 0,User-ID,Num_ratings
0,17,4
1,32,1
2,42,1
3,44,1
4,56,2
...,...,...
29423,278828,1
29424,278843,20
29425,278844,3
29426,278851,14


**Now we compute, for each user, the difference between the original count and the count in the sampled data, i.e. the number of that user's ratings that remain outside the sample**

In [11]:
# 'Difference' = ratings the user has in df1 minus ratings the user has in the
# sample, i.e. how many of the user's ratings remain outside the sample.
# The sampled count is looked up by 'User-ID' rather than subtracted by row
# position, so the result stays correct even if the two frames are ever
# ordered or indexed differently.
df1_valuecounts_subset['Difference'] = (
    df1_valuecounts_subset['Num_ratings']
    - df1_valuecounts_subset['User-ID'].map(
        df_choice_valuecounts.set_index('User-ID')['Num_ratings']
    )
)

In [12]:
# Display the counts again, now including the 'Difference' column.
df1_valuecounts_subset

Unnamed: 0,User-ID,Num_ratings,Difference
0,17,4,3
1,32,1,0
2,42,1,0
3,44,1,0
4,56,2,1
...,...,...,...
29423,278828,1,0
29424,278843,20,14
29425,278844,3,1
29426,278851,14,9


**We want to exclude the entries that have difference 0, because all of those users' ratings fell into the sample and none would be left for training**

In [13]:
# Discard the users whose 'Difference' is 0 and renumber the remaining rows.
df1_valuecounts_subset_diff = (
    df1_valuecounts_subset
    .loc[df1_valuecounts_subset['Difference'].ne(0)]
    .reset_index(drop=True)
)
df1_valuecounts_subset_diff

Unnamed: 0,User-ID,Num_ratings,Difference
0,17,4,3
1,56,2,1
2,114,8,7
3,160,6,4
4,183,91,70
...,...,...,...
19930,278818,2,1
19931,278843,20,14
19932,278844,3,1
19933,278851,14,9


**The following is the data that we want to predict.**

In [14]:
# Keep the sampled rows whose user is listed in df1_valuecounts_subset_diff.
topredict = df_choice.loc[
    df_choice['User-ID'].isin(df1_valuecounts_subset_diff['User-ID'])
]

In [15]:
# Display the sampled rows kept for prediction (original sample order and index).
topredict

Unnamed: 0,User-ID,ISBN,Book-Rating
0,32440,0380018179,8
1,49144,8422641127,5
2,156424,3442453844,7
3,7125,0060586125,8
4,251606,0316789844,8
...,...,...,...
86728,177458,1585671169,7
86730,25673,0393028747,10
86731,19573,0446679593,8
86732,157252,0440184053,10


In [16]:
# Order the rows to predict by user and renumber them from 0.
topredict = topredict.sort_values(by='User-ID').reset_index(drop=True)

In [17]:
# Display the rows to predict, now sorted by 'User-ID'.
topredict

Unnamed: 0,User-ID,ISBN,Book-Rating
0,17,0891075275,6
1,56,0679865691,9
2,114,0312953453,7
3,160,9727110843,8
4,160,9728579225,8
...,...,...,...
76772,278851,0843106743,7
76773,278851,067161746X,7
76774,278851,0439050006,5
76775,278851,1558531025,8


In [None]:
# Persist the rows to predict as CSV on Google Drive, without the index column.
topredict.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/Book Recommendation/topredict.csv',
    index=False,
)

**Now we delete the values of the test dataset from the train dataset. In particular we replace those ratings by 0.**

In [18]:
# Start the training frame from all non-zero ratings, ordered by user
# and renumbered from 0.
train_df = df1.sort_values(by='User-ID').reset_index(drop=True)
train_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,8,0002005018,5
1,8,1881320189,7
2,8,1575663937,6
3,8,074322678X,5
4,8,1552041778,5
...,...,...,...
433666,278854,042516098X,7
433667,278854,0553579606,8
433668,278854,0316184152,7
433669,278854,0375703063,7


In [19]:
# Same rows as topredict, but with every 'Book-Rating' overwritten by 0.
# assign() returns a new frame, so topredict itself is left unchanged.
topredzeros = topredict.assign(**{'Book-Rating': 0})
topredzeros

Unnamed: 0,User-ID,ISBN,Book-Rating
0,17,0891075275,0
1,56,0679865691,0
2,114,0312953453,0
3,160,9727110843,0
4,160,9728579225,0
...,...,...,...
76772,278851,0843106743,0
76773,278851,067161746X,0
76774,278851,0439050006,0
76775,278851,1558531025,0


In [20]:
# Stack the zeroed test rows after the training rows. For each
# ('User-ID', 'ISBN') pair, keep='last' retains the row that comes later in
# the concatenation, so a zeroed row replaces its rated counterpart.
dp = (
    pd.concat([train_df, topredzeros])
    .drop_duplicates(subset=['User-ID', 'ISBN'], keep='last')
    .sort_values(by='User-ID')
    .reset_index(drop=True)
)

**`dp` is the training dataset in which the ratings of the test-dataset entries are set to 0.**

In [22]:
# Display the combined frame built from train_df and topredzeros.
dp

Unnamed: 0,User-ID,ISBN,Book-Rating
0,8,0002005018,5
1,8,1881320189,7
2,8,1575663937,6
3,8,074322678X,5
4,8,1552041778,5
...,...,...,...
433666,278854,042516098X,7
433667,278854,0553579606,8
433668,278854,0375703063,7
433669,278854,0425163393,7


In [None]:
# Persist the combined frame as CSV on Google Drive, without the index column.
dp.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/Book Recommendation/trainwithzerostopredict.csv',
    index=False,
)