In [59]:
from src.libs.utils.tensorboard import create_tensorboard_log_dir
from typing import Any
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import re

In [60]:
!pip install tensorflow numpy tqdm pandas



**Creating DataFrames**

In [61]:
# Read the train and test splits from CSV files under the relative
# `kaggle/input/` directory.
train_csv_path = 'kaggle/input/goodreads_train.csv'
test_csv_path = 'kaggle/input/goodreads_test.csv'

goodreads_train = pd.read_csv(train_csv_path)
goodreads_test = pd.read_csv(test_csv_path)

In [62]:
# (rows, columns) of the training frame.
goodreads_train.shape

(900000, 11)

In [63]:
# (rows, columns) of the test frame.
goodreads_test.shape

(478033, 10)

In [64]:
# Peek at five randomly drawn training rows. No `random_state` is passed, so a
# different set of rows is shown on every run.
goodreads_train.sample(5)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
383398,a3b55cc2b5218d73a529ca797884f443,1295102,49f658271e41c829b3f98f4c3b920d8f,5,What an adorable book! Even if he is not the m...,Fri Feb 13 13:06:18 -0800 2015,Sat May 30 12:51:44 -0700 2015,Fri Apr 03 00:00:00 -0700 2015,Fri Feb 13 00:00:00 -0800 2015,0,0
899459,694e53b0b1a1b2fa52e661aec6092d02,25041504,de5e75cec0eb2dc0853f43b839b53e0f,3,three and a half \n not as good as the first b...,Thu Oct 06 19:09:55 -0700 2016,Mon Oct 10 19:42:32 -0700 2016,Mon Oct 10 00:00:00 -0700 2016,Thu Oct 06 00:00:00 -0700 2016,0,0
376179,025f6535878e5bb486ac3869e3c6e35a,2429135,83e9b0631349543bd5fc60b80352c5cc,3,Not as bad as I was expecting. And everyone ha...,Wed Feb 02 15:37:52 -0800 2011,Wed Feb 02 15:40:33 -0800 2011,Wed Jan 26 00:00:00 -0800 2011,,1,0
116073,f986c213af3348660c6143a42a77f0fc,7904453,730bf1da17dcfe7c2f85955fc4a0130b,4,An excellent followup to one of the most amazi...,Mon Jan 06 12:49:09 -0800 2014,Fri Jan 17 14:28:32 -0800 2014,Sat Jan 11 00:00:00 -0800 2014,Mon Jan 06 00:00:00 -0800 2014,0,1
127542,801ace581f75fa295064205aca31d326,22522808,cc6a566ebf0dfe46e6c10fa043ad9ccf,3,"2.5-3 stars \n Out of 25 stories and poems, I ...",Mon Mar 23 12:23:41 -0700 2015,Tue Mar 31 02:03:16 -0700 2015,Fri Mar 27 00:00:00 -0700 2015,Mon Mar 23 00:00:00 -0700 2015,0,0


**Cleaning DataFrames**

In [80]:
# Number of training reviews per rating value, most frequent first.
goodreads_train['rating'].value_counts()

4    313688
5    265007
3    188972
2     72627
0     30988
1     28718
Name: rating, dtype: int64

In [66]:
# Use the `review_id` column as the row label of both frames (modified in place).
for frame in (goodreads_train, goodreads_test):
    frame.set_index("review_id", inplace=True)

In [67]:
# Keep the training ratings on their own, then stack the training rows (with the
# `rating` column removed) on top of the test rows into a single frame.
books_rates = goodreads_train['rating']
train_without_rating = goodreads_train.drop(columns='rating')
books = pd.concat([train_without_rating, goodreads_test], axis=0)

In [68]:
# (rows, columns) of the combined train + test frame.
books.shape

(1378033, 9)

In [69]:
# Peek at five randomly drawn rows of the combined frame. No `random_state` is
# passed, so a different set of rows is shown on every run.
books.sample(5)

Unnamed: 0_level_0,user_id,book_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
a5a58fd0d70aa55fe78292d1016d9f45,341e9338ee8427188e35ae9cdc5f0e91,26258306,Tis the season for adorable Christmas stories!...,Sat Sep 24 07:39:43 -0700 2016,Wed Jan 18 12:56:05 -0800 2017,Mon Dec 05 00:00:00 -0800 2016,Sun Dec 04 00:00:00 -0800 2016,0,0
d655d8584ff4016e86c6b5c47ffc8d86,a257bfb251a46d82ae1424d101984228,18126966,Not a boring moment to be found in the story.,Wed Mar 08 11:14:01 -0800 2017,Thu Mar 09 21:48:35 -0800 2017,Thu Mar 09 21:47:39 -0800 2017,Wed Mar 08 11:14:01 -0800 2017,2,0
b43d4faa1052a370c93a67ae751572e0,0e21b3f5aaf75a386e5f2c6a6c38d206,13644052,"That ending, oh my!!! Can't wait for Oppositio...",Mon Jul 08 09:18:22 -0700 2013,Sat Aug 02 19:43:04 -0700 2014,Sat Aug 02 00:00:00 -0700 2014,Fri Jul 18 00:00:00 -0700 2014,0,2
e51e586878394383f13bd6cb539cb14a,1aa34c596c1835eece458de9638dd974,126609,Great book about a woman trying to make it on ...,Mon Jul 07 21:28:13 -0700 2008,Thu Jul 10 14:55:23 -0700 2008,,,0,0
b0b6d1737566ecc042a3a0d91840016c,1ceef4796fb36190e72714895806835b,25785993,"I really liked this comic, but I also felt pre...",Sun Jan 17 19:31:03 -0800 2016,Sun Jan 17 19:33:30 -0800 2016,Fri Jan 15 00:00:00 -0800 2016,,0,0


In [70]:
# Discard three of the four date columns; only `date_added` is kept.
dropped_date_columns = ['date_updated', 'read_at', 'started_at']
books = books.drop(columns=dropped_date_columns)

In [71]:
# Keep only the reviews whose vote count and comment count are both non-negative;
# rows where either counter is below zero are removed.
votes_ok = books['n_votes'].ge(0)
comments_ok = books['n_comments'].ge(0)
books = books[votes_ok & comments_ok]

In [72]:
# Checking duplicates: number of rows that repeat an earlier row in every column.
# The vectorised `.sum()` avoids iterating over the whole mask in Python; `int()`
# keeps the displayed result a plain integer.
int(books.duplicated().sum())

0

In [73]:
# Number of rows whose `review_text` repeats that of an earlier row.
# The vectorised `.sum()` avoids iterating over the whole mask in Python; `int()`
# keeps the displayed result a plain integer.
int(books.duplicated(subset=['review_text']).sum())

16130

In [74]:
# Keep only the first row for each distinct `review_text`.
books = books.drop_duplicates(subset=['review_text'], keep='first')

In [75]:
# Reviews whose average token is longer than this many characters are flagged
# as unusable (e.g. text typed without spaces or long runs of repeated characters).
MAX_MEAN_WORD_LENGTH = 25


def mean_word_length(text: str) -> float:
    """Return the mean length of the whitespace-separated tokens in ``text``.

    A text with no tokens (empty or whitespace-only) yields 0.0, so it is never
    flagged and no NumPy "mean of empty slice" warning is emitted.
    """
    token_lengths = [len(token) for token in text.split()]
    if not token_lengths:
        return 0.0
    return sum(token_lengths) / len(token_lengths)


# Boolean mask aligned with `books`: True where the review exceeds the threshold.
unapp_texts = books['review_text'].apply(mean_word_length) > MAX_MEAN_WORD_LENGTH
unapp_texts.shape

(1361800,)

In [76]:
# Preview the first rows selected by the `unapp_texts` mask.
books[unapp_texts].head()

Unnamed: 0_level_0,user_id,book_id,review_text,date_added,n_votes,n_comments
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ba7e149d70cb858d87d4113dfe0e4091,8092e33e9c50d3e269a8164177d2abdc,6339664,IhavetoadmitihavefalleninlovewithPatcicannotwa...,Sun Sep 30 12:29:48 -0700 2012,1,1
90529a603506931b91502b3c65b0564c,308c545182f8b5d332ff71c667fd9b61,12127810,the feels for NICO DI ANGELO.....................,Thu May 16 23:11:00 -0700 2013,0,0
7105506d7aa373be892e6d7d29750741,302eb3debfc02e0bdd0fd617d34a2713,10644930,"""The past is obdurate."" \n rtcrtcrtcrtcrtcrtcr...",Fri Mar 18 07:59:13 -0700 2016,0,0
ba4e577524730630bef88cb4c76c6dd7,de107a99cb0ed2ec5d90d744cc48c310,23723788,2.5 it's-a-good-book-as-in-even-after-months-I...,Sun Aug 30 18:18:44 -0700 2015,0,0
d9d104943954cc9d975a07d1a724be90,de107a99cb0ed2ec5d90d744cc48c310,23496782,2.5 It-Was-a-Very-Okay-Book-With-the-Mystery-a...,Thu Apr 09 04:41:42 -0700 2015,3,0


In [77]:
# Collect the index labels of the rows selected by `unapp_texts`, then remove
# the rows carrying those labels from `books` in place.
flagged_review_ids = books.loc[unapp_texts].index
books.drop(index=flagged_review_ids, inplace=True)

In [78]:
spoil_statement = 'spoiler alert'
# Flag the reviews that contain the spoiler marker (literal, case-sensitive match).
# `regex=False` keeps this a plain substring test, and `na=False` maps a missing
# review text to False instead of raising.
books['spoil'] = books['review_text'].str.contains(spoil_statement, regex=False, na=False)

In [79]:
# Show the reviews flagged as spoilers; `spoil` already holds booleans, so it is
# used directly as the row mask (no `== True` comparison needed).
books[books['spoil']]

Unnamed: 0_level_0,user_id,book_id,review_text,date_added,n_votes,n_comments,spoil
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
e23965adda7ce7e7e415a1339e169065,8842281e1d1347389f2ab93d60773d4d,62291,** spoiler alert ** \n Loved it. The epic saga...,Tue Jul 12 12:57:07 -0700 2011,5,1,True
0258b3c9a85cecb95f240e43a7642f60,8842281e1d1347389f2ab93d60773d4d,7112495,** spoiler alert ** \n An interesting book in ...,Thu Jun 10 14:41:13 -0700 2010,7,3,True
d44b6d28f456dbcc5b1537b4c5572400,8842281e1d1347389f2ab93d60773d4d,6411961,"** spoiler alert ** \n Critics aside, Dan Brow...",Wed Sep 16 11:09:03 -0700 2009,3,3,True
d49eb786ef1d8557b9e6e86bcafd4570,8842281e1d1347389f2ab93d60773d4d,77432,"** spoiler alert ** \n I equally loved book 2,...",Tue May 05 18:08:22 -0700 2009,1,0,True
b8e08587ce1592d69548d0f644cb9893,8842281e1d1347389f2ab93d60773d4d,14497,** spoiler alert ** \n Just finished this and ...,Thu Mar 27 17:53:06 -0700 2008,6,1,True
...,...,...,...,...,...,...,...
a989c818917a7463089454b356fb6671,5ff0927d425ab1a70d2b2d41fe6b96b5,13239950,** spoiler alert ** \n Hills like White Elepha...,Wed Oct 31 18:22:50 -0700 2012,25,7,True
3736d5ee5904b4b5651912dd28b7ce01,a16fd6b48c6e877042acbe2bda8a0007,115076,** spoiler alert ** \n I was completely engros...,Mon Jun 04 07:18:34 -0700 2012,0,0,True
a7777a1c18ab582586a61ad453d91949,692ce8eb4d5490980f6e74f6cb8671e2,12974372,** spoiler alert ** \n Hmm...what to say about...,Wed Nov 04 12:33:22 -0800 2015,1,0,True
b1e1134e7e1e90725ac4237b66657de9,692ce8eb4d5490980f6e74f6cb8671e2,29044,** spoiler alert ** \n The first two-thirds of...,Mon Nov 24 16:03:39 -0800 2014,1,3,True
