# Objective
- Understand the Dataset & perform the necessary cleanup.
- Build a strong Topic Modelling Algorithm to classify the topics.

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import nltk

In [None]:
# Load the raw product-review export into a DataFrame and preview its first rows
df = pd.read_csv('product_reviews.csv')
df.head()

## From this we can see that the dataset contains a lot of columns. For the purpose of our analysis, we only need a few of them.

## For reference, here is a description of each column 

- **id:** Unique identifier for each product.
- **asins:** ASIN (Amazon Standard Identification Number) associated with the product.
- **brand:** Brand of the product.
- **categories:** Categories to which the product belongs.
- **colors:** Colors available for the product.
- **dateAdded:** Date when the product was added.
- **dateUpdated:** Date when the product information was last updated.
- **dimension:** Dimensions of the product.
- **ean:** EAN (European Article Number) associated with the product.
- **keys:** Unique keys associated with the product.
- **manufacturer:** Manufacturer of the product.
- **manufacturerNumber:** Manufacturer number for the product.
- **name:** Name of the product.
- **prices:** Prices associated with the product, including currency and date information.
- **reviews.date:** Date when the review was posted.
- **reviews.doRecommend:** Indicates whether the reviewer recommends the product.
- **reviews.numHelpful:** Number of users who found the review helpful.
- **reviews.rating:** Rating given by the reviewer.
- **reviews.sourceURLs:** URLs to the source of the reviews.
- **reviews.text:** Text content of the review.
- **reviews.title:** Title of the review.
- **reviews.userCity:** City of the reviewer.
- **reviews.userProvince:** Province of the reviewer.
- **reviews.username:** Username of the reviewer.
- **sizes:** Sizes available for the product.
- **upc:** UPC (Universal Product Code) associated with the product.
- **weight:** Weight of the product.


In [None]:
# To get an easier idea of all the columns we are working with, list every column name in the raw dataset
df.columns

In [None]:
# Build a new DataFrame that keeps only the columns relevant to the analysis.
relevant_columns = [
    'id', 'asins', 'brand', 'categories', 'colors', 'manufacturer',
    'name', 'prices', 'reviews.date',
    'reviews.doRecommend', 'reviews.numHelpful', 'reviews.rating',
    'reviews.text', 'reviews.title',
    'sizes', 'weight',
]
# .copy() makes product_reviews an independent frame. Later cells assign to
# its columns; doing that on a bare slice of df raises SettingWithCopyWarning
# and may not modify the data you expect.
product_reviews = df[relevant_columns].copy()
product_reviews.tail()

# Now that we have a dataset with more of the information we need, we have spotted that a few columns need restructuring
### Specifically the prices column and the reviews.date column.

In [None]:
# Peek at the raw 'prices' value of the row labelled 0
product_reviews['prices'][0]

In [None]:
# Show the review dates in their current form
product_reviews['reviews.date']

In [None]:

# Parse the ISO 8601 date strings into datetime values
product_reviews['reviews.date'] = pd.to_datetime(product_reviews['reviews.date'], format='ISO8601')

# Get rid of milliseconds by re-formatting as 'YYYY-MM-DD HH:MM:SS'.
# NOTE: strftime returns strings, so after this line the column holds text, not datetime values.
product_reviews['reviews.date'] = product_reviews['reviews.date'].dt.strftime('%Y-%m-%d %H:%M:%S')
product_reviews['reviews.date'].dtype # object dtype: the values are formatted date strings

In [None]:
# Display the review dates again to inspect the result of the reformatting
product_reviews['reviews.date']

In [None]:
# Quick test to make sure date filtering works as intended.
# After the strftime step the column holds zero-padded 'YYYY-MM-DD HH:MM:SS'
# strings, so this is a string comparison against '2016-02-01'.
product_reviews['reviews.date'] > '2016-02-01'

## Now that the date is fixed, we will move on to fixing the price column


In [None]:
# Refresher: show one raw value from the prices column together with its Python type
prices_first_row = product_reviews['prices'][0]
print(prices_first_row, type(prices_first_row), sep='\n')

In [None]:
# Look at the raw 'prices' value of another row (label 220) for comparison
product_reviews['prices'][220]

In [None]:
# The raw value is a lot to take in, so parse it and print it in a more readable layout
import json

# The value is currently a JSON string; decode it into a list of dictionaries
prices_1 = json.loads(prices_first_row)
print("before proper formatting; ", type(prices_1))

# Serialise it back to JSON with indentation so the structure is easy to read
prices_1_format = json.dumps(prices_1, indent = 3)
print(prices_1_format)


## For our purposes, we only want prices in USD. With the example shown above we see that there can be multiple prices in USD
- The original price when not on sale and the sale price.

## With this knowledge, we'll add two extra columns to the product reviews table and store those prices in them

In [None]:
# Ensure every row lists at least one price in USD.
# len() of the boolean mask is just the row count, so it cannot reveal rows
# without USD; count the True values and compare them with the total instead.
# na=False treats a missing prices value as "no USD price".
has_usd = product_reviews['prices'].str.contains("USD", na=False)
print(f"{has_usd.sum()} of {len(has_usd)} rows contain a USD price")
has_usd.all()

In [None]:
# Extract the regular (full) USD price and the USD sale price for every row.


def _is_sale(price_info):
    """Return True when a price entry is flagged as a sale.

    Accepts both the string form ('true') and a JSON boolean (True).
    """
    return str(price_info.get('isSale')).lower() == 'true'


def _usd_prices(price_entries):
    """Return ``(full_price, sale_price)`` in USD for one parsed price list.

    price_entries: list of dictionaries decoded from one 'prices' value.
    full_price is the 'amountMax' of the first USD entry that is not a sale,
    falling back to the first USD entry when every USD entry is a sale.
    sale_price is the 'amountMax' of the first USD entry flagged as a sale.
    Either value is NaN when no matching entry exists, so a row never
    inherits a price left over from a previously processed row.
    """
    usd_entries = [p for p in price_entries if p.get('currency') == 'USD']
    sale_entries = [p for p in usd_entries if _is_sale(p)]
    regular_entries = [p for p in usd_entries if not _is_sale(p)]

    full_source = regular_entries or usd_entries
    full_price = float(full_source[0]['amountMax']) if full_source else float('nan')
    sale_price = float(sale_entries[0]['amountMax']) if sale_entries else float('nan')
    return full_price, sale_price


full_prices = []
sale_prices = []

# One entry is appended to each list per row, in row order.
for raw_prices in product_reviews['prices']:
    full_price, sale_price = _usd_prices(json.loads(raw_prices))
    full_prices.append(full_price)
    sale_prices.append(sale_price)

In [None]:
# Sanity check: both lists should hold one entry per row, so the two lengths printed here must match
print(len(sale_prices),len(full_prices))


In [None]:
# Add the two extracted prices as new columns at positions 8 and 9
new_price_columns = [('fullPrice', full_prices), ('salePrice', sale_prices)]
for position, (column_name, values) in enumerate(new_price_columns, start=8):
    product_reviews.insert(position, column_name, values)
product_reviews.head()


In [None]:
# The raw 'prices' column is no longer needed, so remove it
product_reviews = product_reviews.drop(columns=['prices'])


In [None]:
# Display the cleaned DataFrame
product_reviews

## The data is finally clean and we will now move on to utilizing NLP for the following purposes
- elaborating on how positive each review is
    - creating a classification model to then support classifying the level of positivity
- topic of each review


In [None]:
# Introduction to the Natural Language Toolkit and the language packages it offers.
# Called without arguments, this opens the downloader GUI; close it once you have had a good look.
nltk.download()

In [None]:
# Download the VADER lexicon and set up the sentiment intensity analyzer
nltk.download('vader_lexicon') # required by the sentiment intensity analyzer
from nltk.sentiment import SentimentIntensityAnalyzer # for identifying the level of sentiment (neg to pos) of text

# Create the analyzer instance that will score the reviews
sia = SentimentIntensityAnalyzer()


In [None]:
# Quick check that every row has review text: counts the missing values in 'reviews.text' (0 means none are missing)
product_reviews['reviews.text'].isnull().sum()

In [None]:
from langdetect import detect
from googletrans import Translator


translator = Translator()
sia = SentimentIntensityAnalyzer()

# Placeholder stored when a review cannot be scored. It has the same keys as a
# real polarity_scores() result, so scores_data always holds exactly one entry
# per row and stays aligned with product_reviews.
_MISSING_SCORE = {
    'neg': float('nan'),
    'neu': float('nan'),
    'pos': float('nan'),
    'compound': float('nan'),
}

scores_data = []

for review in product_reviews['reviews.text']:
    try:
        # Translate non-English reviews to English before scoring them
        if detect(review) != 'en':
            review = translator.translate(review, dest='en').text

        # Analyze sentiment for the (translated or original) review
        score = sia.polarity_scores(review)
    except Exception as e:
        # Best effort: report the problem but keep the row's slot so that
        # later column inserts do not fail on a length mismatch.
        print(f"Error processing review: {e}")
        score = dict(_MISSING_SCORE)
    scores_data.append(score)

scores_data[:20]


In [None]:
# Store the 'compound' value of every sentiment score in a new column at position 15
compound_scores = [score['compound'] for score in scores_data]
product_reviews.insert(15, 'positivityScore', compound_scores)

In [None]:
# Translate each numeric positivity score into a descriptive label.

# (inclusive lower bound, label), ordered from the most positive band down.
_POSITIVITY_BANDS = [
    (.66, "highly positive"),
    (.33, "positive"),
    (.1, "fairly positive"),
    (-.1, "neutral"),
    (-.33, "fairly negative"),
    (-.66, "negative"),
    (-1, "highly negative"),
]


def _positivity_label(score):
    """Return the label of the band containing ``score``.

    Scores outside [-1, 1], including NaN, return None so that every row
    still receives a value and the label list stays aligned with the frame.
    """
    if not -1 <= score <= 1:  # also True for NaN, which fails every comparison
        return None
    for lower_bound, label in _POSITIVITY_BANDS:
        if score >= lower_bound:
            return label
    return None


positivity_level = [_positivity_label(score) for score in product_reviews['positivityScore']]

product_reviews.insert(16,'positivityLevel',positivity_level)

In [None]:
# Preview the first three rows, now including the positivity columns
product_reviews.head(3)

### Now we'll move on to creating the algorithm for identifying the topic within each review

In [None]:
# If you already know which NLTK package you want, you can download it directly by name
nltk.download('product_reviews_2')

In [None]:
# See how many files this corpus contains
from nltk.corpus import product_reviews_2
len(product_reviews_2.fileids())


In [None]:
# Get a quick look at the file identifiers in the corpus
product_reviews_2.fileids()

In [None]:
# Take a closer look at one of them by printing the raw text of a single file
print(product_reviews_2.raw(fileids='Linksys_Router.txt'))