In [1]:
# Enable hardware acceleration: export SCIKIT_LEARN_INTEL=SKLEARN into the kernel's environment.
# NOTE(review): none of the cells shown in this notebook read this variable - confirm which library consumes it.
%env SCIKIT_LEARN_INTEL=SKLEARN

env: SCIKIT_LEARN_INTEL=SKLEARN


# Dataset Preparation
## Dataset Loading
### Load the JSON files into DataFrames

In [None]:
import os
import pandas as pd

# Anchor every data path at the kernel's working directory
current_dir = os.getcwd()

# Build the three raw-data paths in one pass, relative to the working directory
fashion_json, phones_json, grocery_json = (
    os.path.join(current_dir, relative_path)
    for relative_path in (
        '../../data/raw_data/AMAZON_FASHION_5.json',
        '../../data/raw_data/Cell_Phones_and_Accessories_5.json',
        '../../data/raw_data/Grocery_and_Gourmet_Food_5.json',
    )
)

# lines=True: parse each line of the file as a separate JSON record
df_fashion = pd.read_json(path_or_buf=fashion_json, lines=True)
df_phones = pd.read_json(path_or_buf=phones_json, lines=True)
df_grocery = pd.read_json(path_or_buf=grocery_json, lines=True)

### Combine the Datasets

In [None]:
# Stack the three category frames into one; ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat(objs=[df_fashion, df_phones, df_grocery], ignore_index=True)
print("\nTotal number of reviews: ", len(df))
df.head()

## Field Selection

In [None]:
# Keep only the star rating and the review text, renamed to the column
# names the rest of the notebook uses ("Score", "Reviews").
# The explicit .copy() detaches the two-column frame from the combined frame:
# later cells mutate `df` in place (dropna/drop_duplicates with inplace=True,
# adding the 'Feedback' column), which on a plain column subset triggers
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
df = (
    df.rename(columns={"overall": "Score", "reviewText": "Reviews"})
      [["Score", "Reviews"]]
      .copy()
)
df.head()

## Missing and Duplicate Data Checks

In [None]:
def missing_checker(df):
    """Print how many rows have a null 'Reviews' value, then print those rows.

    Args:
        df: DataFrame containing a 'Reviews' column.

    Returns:
        None. The report is written to stdout only; `df` is not modified.
    """
    # Boolean mask: True wherever the review text is null
    null_mask = df['Reviews'].isna()
    null_rows = df.loc[null_mask]

    print("Missing data:\n", null_mask.sum())
    print(null_rows)
    
# Baseline report on the selected columns, before any rows are dropped
missing_checker(df)

In [None]:
# Remove every row that has a null in any column, mutating df in place
df.dropna(axis=0, how='any', inplace=True)

# Re-run the report to confirm nothing null is left in 'Reviews'
missing_checker(df)

In [None]:
def duplicate_checker(df):
    """Print how many rows repeat an earlier 'Reviews' value, then print those rows.

    The first occurrence of each review text is not counted as a duplicate;
    only the later repeats are reported.

    Args:
        df: DataFrame containing a 'Reviews' column.

    Returns:
        None. The report is written to stdout only; `df` is not modified.
    """
    # True for every row whose review text already appeared in an earlier row
    repeat_mask = df.duplicated(subset=['Reviews'], keep='first')
    repeated_rows = df.loc[repeat_mask]

    print("Duplicate Reviews:\n", repeat_mask.sum())
    print(repeated_rows)
    
# Report repeated review texts before any de-duplication is applied
duplicate_checker(df)

In [None]:
# De-duplicate on the review text in place; the earliest row of each repeated text survives
df.drop_duplicates(subset=['Reviews'], keep='first', inplace=True)

# The report should now show zero duplicates
duplicate_checker(df)

## Feedback Mapping of Scores

In [None]:
import numpy as np                       # MD array and Matrices

# Map the numeric score onto three sentiment classes:
#   >= 4 -> Positive, == 3 -> Neutral, <= 2 -> Negative
conditions = [
    (df['Score'] >= 4),
    (df['Score'] == 3),
    (df['Score'] <= 2)
    ]
feedback_values = ['Positive',
                   'Neutral',
                   'Negative']
# Pass an explicit string default. np.select's implicit default is the
# integer 0, which cannot share a dtype with the string choices: older NumPy
# silently labels unmatched rows '0', newer NumPy raises a TypeError.
# Rows that match none of the conditions (e.g. a fractional score such as 3.5)
# are now labelled 'Unknown' so they are easy to spot in the counts below.
df['Feedback'] = np.select(conditions, feedback_values, default='Unknown')


# Class distribution; an 'Unknown' entry here means some scores fell outside the rules above
feedback_counts = df['Feedback'].value_counts()
print(feedback_counts)

In [None]:
import matplotlib.pyplot as plt  # Data Visualization

# Bar chart of class sizes. sort_index() orders the bars by label, so with the
# Negative / Neutral / Positive labels the colours map to red / blue / green.
(
    df['Feedback']
    .value_counts()
    .sort_index()
    .plot(kind='bar', color=['maroon', 'steelblue', 'limegreen'])
)

In [None]:
# Row count after the missing/duplicate clean-up, followed by a five-row preview
print("Total number of reviews:", len(df))
df.head(n=5)

# Data Preprocessing

In [None]:
# Add the parent directory to the import path so that data_preprocess can be imported from it.
# NOTE(review): '..' is resolved against the kernel's working directory - confirm the notebook is launched from its own folder.
import sys
sys.path.append('..')
from data_preprocess import text_cleaner, stop_words

# Smoke-test the imported cleaner on a sample containing a curly apostrophe,
# contractions, a bare URL, newlines and HTML tags, then print whatever it returns.
# NOTE(review): what text_cleaner strips or returns is defined in data_preprocess, not here - inspect the printed output.
text = "I’m never gonna give you up. But I shouldn't say that I can't bear to lose you.\n https://youtube.com <strong> Bold and brash </strong> \n<a href='https://www.w3schools.com'>Visit W3Schools</a>"
cleaned_words = text_cleaner(text)
print(cleaned_words)

In [None]:
# Run text_cleaner on every review, one value at a time, and overwrite the column with the results
df['Reviews'] = df['Reviews'].apply(text_cleaner)

In [None]:
# Re-run the missing and duplicate checks: cleaning may have emptied some reviews
# or made previously distinct reviews identical.
# NOTE(review): missing_checker only flags null values (isnull), so a review that
# text_cleaner reduced to an empty string would not be reported - confirm what
# text_cleaner returns in that case.
missing_checker(df)
duplicate_checker(df)

In [None]:
# Second clean-up pass on the cleaned text: first rows with nulls, then repeated reviews.
# Each in-place removal is followed by its report so the result can be verified.
df.dropna(axis=0, how='any', inplace=True)
missing_checker(df)

df.drop_duplicates(subset=['Reviews'], keep='first', inplace=True)
duplicate_checker(df)

In [None]:
# Preview the first rows of the frame that is about to be saved
df.head()

In [None]:
# Save the DataFrame to the new CSV file (UTF-8, without the index column)
processed_csv = os.path.join(current_dir, '../../data/processed_data/prep_reviews.csv')

# to_csv does not create missing folders, so make sure the output directory
# exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(os.path.dirname(processed_csv), exist_ok=True)

df.to_csv(processed_csv, index=False, encoding='utf-8')

# PROCEED TO MODEL TRAINING