In [19]:
from pathlib import Path
import re
import dill
import pandas as pd

import unimib_snowit_project.utils as u

In [20]:
# Base Params

# Raw CSV inputs: folder name followed by the two file names.
DATA_IN_DIR = "data_input"
REVIEWS_IN_FILENAME = "reviews.csv"
REVIEWS_LABELLED_IN_FILENAME = "reviews_labelled.csv"

# Pickled outputs: folder name followed by the two file names.
DATA_PKL_DIR = "data_loaded"
REVIEWS_PKL_FILENAME = "reviews.pkl"
REVIEWS_LABELLED_PKL_FILENAME = "reviews_labelled.pkl"

# Tokens handed to `pd.read_csv(na_values=...)` by the typed loads below.
# NOTE: the bare token 'NA' is not part of this list.
NA_VALUES = [
    # empty / blank cells
    "", " ", '""',
    # spreadsheet-style "not available" markers
    "#N/A", "#N/A N/A", "#NA", "N/A", "<NA>", "n/a",
    # indeterminate / NaN spellings
    "-1.#IND", "1.#IND",
    "-1.#QNAN", "-NaN", "-nan", "-NAN", "1.#QNAN", "NaN", "nan", "NAN",
    # null spellings
    "NULL", "Null", "null",
    # none spellings
    "NONE", "None", "none",
]

In [21]:
# Base paths

# Everything is resolved from the directory returned by `u.get_root_dir()`.
root_dir_path = u.get_root_dir()

# Raw CSV inputs
data_in_dir_path = root_dir_path / DATA_IN_DIR
reviews_in_path = data_in_dir_path / REVIEWS_IN_FILENAME
reviews_labelled_in_path = data_in_dir_path / REVIEWS_LABELLED_IN_FILENAME

# Pickle outputs
data_pkl_dir_path = root_dir_path / DATA_PKL_DIR
reviews_pkl_path = data_pkl_dir_path / REVIEWS_PKL_FILENAME
reviews_labelled_pkl_path = data_pkl_dir_path / REVIEWS_LABELLED_PKL_FILENAME

## Load Reviews

In [22]:
# "Safe" read of the reviews file: every column is loaded as a string and no
# token is converted to NA, so the raw content can be inspected unchanged.
safeload_reviews_df = pd.read_csv(
    reviews_in_path,
    dtype="string",
    keep_default_na=False,
    na_values=[],
)

In [23]:
# Show the column names found in the raw reviews file.
safeload_reviews_df.columns

Index(['review.uid', 'user.uid', 'text'], dtype='object')

In [24]:
# Typed load of the reviews file. With `keep_default_na=False` only the tokens
# listed in NA_VALUES are read as missing. The free text is then trimmed and
# every run of whitespace is collapsed to a single space.
reviews_df = pd.read_csv(
    reviews_in_path,
    dtype={
        "review.uid": "string",
        "user.uid": "string",
        "text": "string",
    },
    na_values=NA_VALUES,
    keep_default_na=False,
).assign(
    text=lambda tbl: tbl["text"].str.strip().str.replace(r"\s+", " ", regex=True)
)

In [25]:
# CHECK PK VALIDITY (review.uid)

# 1) Number of rows whose key is missing.
#
# SELECT count(1) as num_rows
# FROM reviews_df
# WHERE review.uid IS NULL

missing_key_mask = reviews_df["review.uid"].isnull()
display(len(reviews_df.loc[missing_key_mask]))

# 2) Keys that appear on more than one row (a missing key is kept as its own
#    group because of dropna=False).
#
# SELECT review.uid, count(1) as num_rows
# FROM reviews_df
# GROUP BY review.uid
# HAVING num_rows > 1

display(
    reviews_df
    .groupby("review.uid", dropna=False)
    .size()
    .to_frame("num_rows")
    .loc[lambda tbl: tbl["num_rows"] > 1]
)

0

Unnamed: 0_level_0,num_rows
review.uid,Unnamed: 1_level_1


## Load Reviews Labelled

In [26]:
# "Safe" read of the labelled reviews file: every column is loaded as a string
# and no token is converted to NA.
safeload_reviews_labelled_df = pd.read_csv(
    reviews_labelled_in_path,
    dtype="string",
    keep_default_na=False,
    na_values=[],
)

# Show the column names found in the raw file.
safeload_reviews_labelled_df.columns

Index(['labelled_review.uid', 'text', 'sentiment_label'], dtype='object')

In [27]:
# Typed load of the labelled reviews. With `keep_default_na=False` only the
# tokens listed in NA_VALUES are read as missing. Two clean-ups follow:
#   * text            -> trimmed, runs of whitespace collapsed to one space
#   * sentiment_label -> trimmed and lower-cased
reviews_labelled_df = pd.read_csv(
    reviews_labelled_in_path,
    dtype={
        "labelled_review.uid": "string",
        "text": "string",
        "sentiment_label": "string",
    },
    na_values=NA_VALUES,
    keep_default_na=False,
).assign(
    text=lambda tbl: tbl["text"].str.strip().str.replace(r"\s+", " ", regex=True),
    sentiment_label=lambda tbl: tbl["sentiment_label"].str.strip().str.lower(),
)

In [28]:
# CHECK PK VALIDITY (labelled_review.uid)

# 1) Number of rows whose key is missing.
#
# SELECT count(1)
# FROM reviews_labelled_df
# WHERE labelled_review.uid IS NULL

missing_labelled_key_mask = reviews_labelled_df["labelled_review.uid"].isnull()
display(len(reviews_labelled_df.loc[missing_labelled_key_mask]))

# 2) Keys that appear on more than one row (a missing key is kept as its own
#    group because of dropna=False).
#
# SELECT labelled_review.uid, count(1)
# FROM reviews_labelled_df
# GROUP BY labelled_review.uid
# HAVING count(1) > 1

display(
    reviews_labelled_df
    .groupby("labelled_review.uid", dropna=False)
    .size()
    .to_frame("num_rows")
    .loc[lambda tbl: tbl["num_rows"] > 1]
)

0

Unnamed: 0_level_0,num_rows
labelled_review.uid,Unnamed: 1_level_1


In [30]:
# Persist both cleaned frames as dill pickles.
#
# The output folder may not exist yet (for example on a fresh checkout), in
# which case opening the file for writing would raise FileNotFoundError.
# Create it first; `exist_ok=True` makes this a no-op when it is already there.
reviews_pkl_path.parent.mkdir(parents=True, exist_ok=True)
with reviews_pkl_path.open('wb') as fh:
    dill.dump(reviews_df, fh)
print(f"Save reviews data in {reviews_pkl_path.as_posix()}")

reviews_labelled_pkl_path.parent.mkdir(parents=True, exist_ok=True)
with reviews_labelled_pkl_path.open('wb') as fh:
    dill.dump(reviews_labelled_df, fh)
print(f"Save reviews labelled data in {reviews_labelled_pkl_path.as_posix()}")

Save reviews data in /Users/farabiisa/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/reviews.pkl
Save reviews labelled data in /Users/farabiisa/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/reviews_labelled.pkl


In [31]:
# Pickle folder and file names (same values as declared near the top of the
# notebook).
DATA_PKL_DIR = "data_loaded"
REVIEWS_PKL_FILENAME = "reviews.pkl"
REVIEWS_LABELLED_PKL_FILENAME = "reviews_labelled.pkl"

# Rebuild the pickle locations from the directory returned by
# `u.get_root_dir()`.
root_dir_path = u.get_root_dir()
data_pkl_dir_path = root_dir_path.joinpath(DATA_PKL_DIR)
reviews_pkl_path = data_pkl_dir_path.joinpath(REVIEWS_PKL_FILENAME)
reviews_labelled_pkl_path = data_pkl_dir_path.joinpath(REVIEWS_LABELLED_PKL_FILENAME)

# Loader
def load_pkl(pkl_path):
    """Load and return the object stored in a dill pickle file.

    Parameters
    ----------
    pkl_path : str or os.PathLike
        Location of the pickle file. Plain strings are accepted as well as
        ``pathlib.Path`` objects.

    Returns
    -------
    object
        Whatever object was serialised into the file.

    Raises
    ------
    FileNotFoundError
        If ``pkl_path`` does not point to an existing file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted source (such as this notebook).
    with Path(pkl_path).open("rb") as fh:
        return dill.load(fh)

# Read both pickles back (reviews first, then labelled reviews) and rebind
# the two frame names to the reloaded objects.
reviews_df, reviews_labelled_df = map(
    load_pkl,
    (reviews_pkl_path, reviews_labelled_pkl_path),
)

In [33]:
# Preview the reloaded reviews frame (long frames are rendered truncated).
reviews_df

Unnamed: 0,review.uid,user.uid,text
0,1,tcdf7enqg9sxqq4ie4qh3xzjbt,I have bought several of the Vitality canned d...
1,2,kszpqbggktbzmtkkkejthwyucw,Product arrived labeled as Jumbo Salted Peanut...
2,3,l01hczdvthqra6hsxbovmqipdc,This is a confection that has been around a fe...
3,4,cmszp7l1j9pt53fvt178v3avxd,If you are looking for the secret ingredient i...
4,5,bhimnczlhxddegbdaytb7gbhx2,Great taffy at a great price. There was a wide...
...,...,...,...
105705,140942,aucpq6co8tbk8qelhzezxa4rkm,"We Love,Love our Keurig coffee maker .My husba..."
105706,140943,bybi8rqc0odzbnbdlohrlqp9jt,This is supposed to be a light roast coffee bu...
105707,140944,lnp0swzvbkuhvqvdx0ctmjwn7v,I have always been happy with my Kcups but thi...
105708,140945,miyvfildj9cxsn27tqt3khp6gt,I ordered light roast coffee and was sent one ...


In [34]:
# Preview the reloaded labelled reviews frame (long frames are rendered truncated).
reviews_labelled_df

Unnamed: 0,labelled_review.uid,text,sentiment_label
0,0,I'm no bitters expert but I bought it as a gif...,neutral
1,1,"these are probably great in the right drinks, ...",neutral
2,2,I sent these to my dad for his bday and he sai...,positive
3,3,I purchased these as a gift for family member ...,positive
4,4,My wife bought me this sauce sampler for Chris...,neutral
...,...,...,...
462739,462739,Great for sesame chicken..this is a good if no...,positive
462740,462740,I'm disappointed with the flavor. The chocolat...,neutral
462741,462741,"These stars are small, so you can give 10-15 o...",positive
462742,462742,These are the BEST treats for training and rew...,positive
