In [10]:
import re
import nltk #Natural Language Toolkit
import mgzip
import pickle
import pandas as pd

from tqdm import tqdm

from os.path import exists

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
################################## PATHS ##################################

# Train dataset from Kaggle (CSV that is read when no cleaned cache exists yet)
ds_path = './data/goodreads_train.csv'

# Gzip-compressed pickle where the DataFrame with the cleaned reviews is saved,
# so the cleaning step does not have to be repeated on every run
zip_path = './data/cleaned_ds.gz'

In [3]:
# Stemmer instance shared by every cleanText call.
ps = PorterStemmer()

# Fetch the NLTK stopword corpus (reports "already up-to-date" when present).
nltk.download('stopwords')

# English stopwords, held in a set: cleanText tests every token of every review
# for membership, and a set lookup is O(1) where the former list was a linear scan.
all_stopwords = set(stopwords.words('english'))

# Keep 'not' as a real token so negations survive the cleaning step.
# discard() (unlike list.remove) does not raise if the corpus ever lacks the word.
all_stopwords.discard('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dalex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def cleanText(sentence):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: replace every character outside a-z / A-Z with a space, lowercase,
    split on whitespace, drop the tokens found in ``all_stopwords`` and stem
    the remaining ones with the module-level stemmer ``ps``.

    Args:
        sentence: Raw review text. ``None`` and float NaN (how pandas
            represents a missing text cell) are treated as an empty review.

    Returns:
        The cleaned text, or ``''`` when no token survives the cleaning.
    """
    # A missing value would make re.sub raise TypeError and abort the whole
    # apply over the DataFrame; NaN is the only float that differs from itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''

    # ATT: this also strips digits and any non-ASCII (e.g. accented) letters.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    stems = [ps.stem(token) for token in tokens if token not in all_stopwords]
    return ' '.join(stems)

In [11]:
# Load the cleaned reviews from the cache when it exists; otherwise build them
# from the raw CSV and write the cache. Both branches bind `reviews_df`, the
# name the following cells read.
if exists(zip_path):
    print('Opening cleaned dataset')
    # Read the same file whose existence was just checked (was a hard-coded
    # 'mgzip_test.gz' loaded into an unused `mgzip_df`).
    # NOTE: pickle.load can execute arbitrary code; only open cache files that
    # this notebook wrote itself.
    with mgzip.open(zip_path, 'rb') as f:
        reviews_df = pickle.load(f)
else:
    print('Opening original dataset')
    reviews_df = pd.read_csv(ds_path)
    # Registers DataFrame/Series.progress_apply so the cleaning shows a progress bar.
    tqdm.pandas()
    print('Cleaning rewiews')
    reviews_df['cleaned_review_text'] = reviews_df['review_text'].progress_apply(cleanText)
    with mgzip.open(zip_path, 'wb') as f:
        pickle.dump(reviews_df, f)

Opening original dataset
Cleaning rewiews


  0%|          | 1657/900000 [00:03<30:31, 490.54it/s]


KeyboardInterrupt: 

In [7]:
# Sanity check: print the number of rows, then preview the first ones.
print(reviews_df.shape[0])
reviews_df.head()

900000


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments,cleaned_review_text
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1,special book start slow first third middl thir...
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0,recommend katz avail free decemb http www audi...
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0,fun fast pace scienc fiction thriller read nig...
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1,recommend read understand go middl america pos...
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1,realli enjoy book lot recommend drag littl end...


In [27]:
# Raw text of the first review, shown for comparison with its cleaned version.
reviews_df['review_text'].loc[0]

'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. (view spoiler)[Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would happen if our SETI stations received a message - if we found someone was out there - and the person monitoring and answering the signal o

In [13]:
# Cleaned (letters only, lowercased, stopword-free, stemmed) text of the first review.
reviews_df['cleaned_review_text'].loc[0]

'special book start slow first third middl third start get interest last third blew mind love good scienc fiction push think thing go hugo winner translat origin chines made interest differ way thing read instanc intermix chines revolutionari histori kept accus peopl reactionari etc book scienc alien scienc describ book impress book ground physic pretti accur far could tell view spoiler though got fold proton dimens think make stuff interest think though would happen seti station receiv messag found someon person monitor answer signal side disillus part book bit dark would like think human reaction discov alien civil hostil would like ender game would band togeth like book unveil trisolaran cultur game smart way build empathi also understand gone across mani centuri know bodi problem unsolv math problem still get made game mayb come next book love quot long histori scientif progress mani proton smash apart acceler physicist mani neutron electron probabl fewer hundr million everi collis