# **I. INTRODUCTION**

- Batch: RMT-050

- Objective: Compile & clean raw data collected via `SimpleScraper`

# **II. IMPORT LIBRARIES**

In [1]:
import pandas as pd
from langdetect import detect
import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category, so any pandas diagnostics raised by later cells are not displayed.
warnings.filterwarnings('ignore')

In [2]:
# Load the previously filtered product table and keep only the Samsung rows.
# `.copy()` makes `samsung_df` an independent frame, so the per-product writes
# done in later cells land in it directly rather than in a filtered slice of
# `original_df`.
original_df = pd.read_csv('~/Documents/p2-ftds-final-project-ftds-050-rmt-group-001/data_cleaning/filtered.csv')
samsung_df = original_df[original_df['brand'] == 'Samsung'].copy()

# **III. RAW DATA LOADING**

Due to limited time and resources (API credits), product review scraping is limited to a single mobile phone brand, `Samsung`, and to only 20 of its products. Each product contributes up to 10 reviews per rating tier (some products have fewer than 10 reviews in a tier). The breakdown looks like this:

*(E.g.)* *Samsung Z Fold 7*:
- ⭐️⭐️⭐️⭐️⭐️  : 10 Reviews
- ⭐️⭐️⭐️⭐️  : 10 Reviews
- ⭐️⭐️⭐️    : 10 Reviews
- ⭐️⭐️ : 10 Reviews
- ⭐️ : 10 Reviews

## Galaxy Z Fold 7

In [3]:
# Use the row at index 401: show the product URL(s) listed for the Galaxy Z Fold7
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Z Fold7', 'url']

401    https://www.amazon.com/Samsung-Smartphone-Unlo...
Name: url, dtype: object

In [4]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Z Fold7.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '80%', 4: '8%', 3: '4%', 2: '3%', 1: '5%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Fold7/Z-fold7-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [5]:
# Write the combined review texts and star ratings into row 401 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[401, "reviews"] = reviews
samsung_df.at[401, "ratings"] = ratings

In [6]:
# Start the list that collects one finished single-row frame per product
df_finish = []

# Turn row 401 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[401]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.5)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S25 Ultra

In [7]:
# Product URL of the Galaxy S25 Ultra listing stored under index label 385
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S25 Ultra', 'url'].loc[385]

'https://www.amazon.com/SAMSUNG-S25-Ultra-Smartphone-Processor/dp/B0F39CGPQV/ref=sr_1_54?dib=eyJ2IjoiMSJ9.nHV-0jU4w3_sT9KvY_tVrEm1xAio1yPCrFVhT80olTTPmnzzWfo78Yh4calJxQf5eskN_QdZ1RTaXX2AyRyp4SLqTU-ieEBrBURVj3S4qDuO-_U2Brgha-v2zs1DLJZAHd9S9OI1oUaqHxQz2RTyw7lAoZ5VqEsxLjkrV1NoOYDgm5f-71CmfNSIA0RW2UWcLML9B_jyTJlYcvmZV7cXdzzAQv8pXVGDvz3MzH1zEag5ZANh1XWWUEBcIRf9I-Qvq7HpE9D1o7BTgUjBLNTF2_Bhunl5M-t6h5CPB-lrpgU.bgtaWsNhI1o15ZQf6u6uNqHyIiJi4MAX_JOWRUOXLPE&dib_tag=se&qid=1769857430&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-54&xpid=FEes0W8Q3djsz'

In [8]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S25 Ultra.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '70%', 4: '9%', 3: '3%', 2: '2%', 1: '16%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S25-U/S25-Ultra-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [9]:
# Write the combined review texts and star ratings into row 385 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[385, "reviews"] = reviews
samsung_df.at[385, "ratings"] = ratings

In [10]:
# Turn row 385 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[385]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.1)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S25 FE

In [11]:
# Product URL of the Galaxy S25 FE listing stored under index label 343
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S25 FE', 'url'].loc[343]

'https://www.amazon.com/Samsung-Smartphone-Unlocked-Res-Camera-Warranty/dp/B0FG1THCD7/ref=sr_1_5?dib=eyJ2IjoiMSJ9.IZthQETfgZ_94BMyZ4wxCZDSyykL3KflyE0J2qS9upHAUT_kwVjHlm79K9ZxYvKjpgZfvsrYtiiGnsUjuzAB8utTTWaPqFi31VL8gRCeq9VdrMUtIt9FXmShfw7jCOOnDUNa6fVJdPeeRhtPhZojc-CCDGdmcSLITd4dNNnsZcsC2k-ZEk1IjSTwBBATiW2Mj3w2mWUxNlbf9wlUK9-XR_Z7l80cwYoEXS0F_65Lf-iBXi8AETMpkPoxqPKKYuPWz9jrtD2viwzNYc82EdkA9I6SgMPwrfYtvAJrEwJXXRs.hTQzOK4_ECLs3fIneUVQ6FUlrl5YKDlfRhbTRABnc9w&dib_tag=se&qid=1769857411&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-5&xpid=FPUY1SF7P-F1N'

In [12]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S25 FE.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '82%', 4: '8%', 3: '3%', 2: '1%', 1: '6%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S25-FE/S25-FE-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [13]:
# Write the combined review texts and star ratings into row 343 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[343, "reviews"] = reviews
samsung_df.at[343, "ratings"] = ratings

In [14]:
# Turn row 343 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[343]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.6)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Z Fold 6

In [15]:
# Product URL of the Galaxy Fold 6 listing stored under index label 390
samsung_df.loc[samsung_df['model_name'] == 'Samsung Galaxy Fold 6', 'url'].loc[390]

'https://www.amazon.com/SAMSUNG-Galaxy-Version-512GB-Crafted/dp/B0DJTC9CJP/ref=sr_1_61?dib=eyJ2IjoiMSJ9.nHV-0jU4w3_sT9KvY_tVrEm1xAio1yPCrFVhT80olTTPmnzzWfo78Yh4calJxQf5eskN_QdZ1RTaXX2AyRyp4SLqTU-ieEBrBURVj3S4qDuO-_U2Brgha-v2zs1DLJZAHd9S9OI1oUaqHxQz2RTyw7lAoZ5VqEsxLjkrV1NoOYDgm5f-71CmfNSIA0RW2UWcLML9B_jyTJlYcvmZV7cXdzzAQv8pXVGDvz3MzH1zEag5ZANh1XWWUEBcIRf9I-Qvq7HpE9D1o7BTgUjBLNTF2_Bhunl5M-t6h5CPB-lrpgU.bgtaWsNhI1o15ZQf6u6uNqHyIiJi4MAX_JOWRUOXLPE&dib_tag=se&qid=1769857430&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-61&xpid=FEes0W8Q3djsz'

In [16]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Z Fold 6.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '69%', 4: '14%', 3: '8%', 2: '1%', 1: '8%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Fold6/Z-fold6-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [17]:
# Write the combined review texts and star ratings into row 390 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[390, "reviews"] = reviews
samsung_df.at[390, "ratings"] = ratings

In [18]:
# Turn row 390 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[390]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.3)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Z Flip 7 FE

In [19]:
# Product URL of the Galaxy Z Flip7 FE listing stored under index label 411
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Z Flip7 FE', 'url'].loc[411]

'https://www.amazon.com/Samsung-Smartphone-Unlocked-Manufacturer-Warranty/dp/B0F7JZFKMQ/ref=sr_1_94?dib=eyJ2IjoiMSJ9.iBcSzgpkJScsrkfbMhxKA5CsbzbHxNE1s00VSqn7feu6vH1H1uYyS2S0c7osSQlwHMulp1s-poFk5c7ds4rqpivfgGlW_j5aHy7Nmou5vlG6n7DfPRLcSDp3f_p6m3FplWJAvpqA3qbbhtTkf8IgKJ1L_hZQ4I3l1EAvIaLAQ0MvyD_xQgUCS2_ZhX2X16ohJkp9CNQEgkkbwcN4JRD9QefbwdXkd439RVRL2CzglbrWD1uOkk-S1DEXBkE8Nbxu-gkw-RTsSZbZY8UbZ4MZGLWZOf4COecZBoGf5jVXUYw.rO0iqTh-Gar67tBCDR0IxVwBs79S8E397uKGkFyL9gs&dib_tag=se&qid=1769857440&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-94&xpid=FEes0W8Q3djsz'

In [20]:
# Combine the available per-star review exports for the Galaxy Z Flip7 FE.
# The 2-star tier is never loaded for this product (0% of ratings), so it is
# left out of the mapping instead of being handled by a branch the loop could
# never reach.
# Rating % distribution value attached to each loaded star tier, keyed by star count.
tier_share = {5: '78%', 4: '11%', 3: '5%', 1: '6%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Flip7/Flip7-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

This product has no 2-star reviews, so the 2-star tier is skipped when loading.

In [21]:
# Write the combined review texts and star ratings into row 411 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[411, "reviews"] = reviews
samsung_df.at[411, "ratings"] = ratings

In [22]:
# Turn row 411 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[411]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.5)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S24 Ultra

In [23]:
# Product URL of the Galaxy S24 Ultra listing stored under index label 381
samsung_df.loc[samsung_df['model_name'] == 'Samsung Galaxy S24 Ultra 5G', 'url'].loc[381]

'https://www.amazon.com/SAMSUNG-Galaxy-S24-Ultra-Unlocked/dp/B0D364M6TM/ref=sr_1_50?dib=eyJ2IjoiMSJ9.nHV-0jU4w3_sT9KvY_tVrEm1xAio1yPCrFVhT80olTTPmnzzWfo78Yh4calJxQf5eskN_QdZ1RTaXX2AyRyp4SLqTU-ieEBrBURVj3S4qDuO-_U2Brgha-v2zs1DLJZAHd9S9OI1oUaqHxQz2RTyw7lAoZ5VqEsxLjkrV1NoOYDgm5f-71CmfNSIA0RW2UWcLML9B_jyTJlYcvmZV7cXdzzAQv8pXVGDvz3MzH1zEag5ZANh1XWWUEBcIRf9I-Qvq7HpE9D1o7BTgUjBLNTF2_Bhunl5M-t6h5CPB-lrpgU.bgtaWsNhI1o15ZQf6u6uNqHyIiJi4MAX_JOWRUOXLPE&dib_tag=se&qid=1769857430&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-50&xpid=FEes0W8Q3djsz'

In [24]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S24 Ultra.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '73%', 4: '11%', 3: '5%', 2: '2%', 1: '9%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S24-U/S24-Ultra-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [25]:
# Write the combined review texts and star ratings into row 381 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[381, "reviews"] = reviews
samsung_df.at[381, "ratings"] = ratings

In [26]:
# Turn row 381 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[381]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.3)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S23 Ultra

In [27]:
# Product URL of the Galaxy S23 Ultra listing stored under index label 370
samsung_df.loc[samsung_df['model_name'] == 'Samsung Galaxy S23 Ultra 5G', 'url'].loc[370]

'https://www.amazon.com/SAMSUNG-Galaxy-Ultra-Factory-Unlocked/dp/B0C51Q5Z9K/ref=sr_1_36?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-36&xpid=FEes0W8Q3djsz'

In [28]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S23 Ultra.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '64%', 4: '10%', 3: '7%', 2: '5%', 1: '14%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S23-U/S23-Ultra-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [29]:
# Write the combined review texts and star ratings into row 370 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[370, "reviews"] = reviews
samsung_df.at[370, "ratings"] = ratings

In [30]:
# Turn row 370 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[370]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.0)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S25

In [31]:
# Product URL of the Galaxy S25 listing stored under index label 399
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S25', 'url'].loc[399]

'https://www.amazon.com/SAMSUNG-Smartphone-Unlocked-Processor-ProScaler/dp/B0F3DJJR6W/ref=sr_1_72?dib=eyJ2IjoiMSJ9.nHV-0jU4w3_sT9KvY_tVrEm1xAio1yPCrFVhT80olTTPmnzzWfo78Yh4calJxQf5eskN_QdZ1RTaXX2AyRyp4SLqTU-ieEBrBURVj3S4qDuO-_U2Brgha-v2zs1DLJZAHd9S9OI1oUaqHxQz2RTyw7lAoZ5VqEsxLjkrV1NoOYDgm5f-71CmfNSIA0RW2UWcLML9B_jyTJlYcvmZV7cXdzzAQv8pXVGDvz3MzH1zEag5ZANh1XWWUEBcIRf9I-Qvq7HpE9D1o7BTgUjBLNTF2_Bhunl5M-t6h5CPB-lrpgU.bgtaWsNhI1o15ZQf6u6uNqHyIiJi4MAX_JOWRUOXLPE&dib_tag=se&qid=1769857430&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-72&xpid=FEes0W8Q3djsz'

In [32]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S25.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '69%', 4: '8%', 3: '3%', 2: '1%', 1: '19%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S25/S25-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [33]:
# Write the combined review texts and star ratings into row 399 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[399, "reviews"] = reviews
samsung_df.at[399, "ratings"] = ratings

In [34]:
# Turn row 399 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[399]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.0)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S23

In [35]:
# Product URL of the Galaxy S23 listing stored under index label 366
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S23', 'url'].loc[366]

'https://www.amazon.com/SAMSUNG-Galaxy-S23-Version-Phantom/dp/B0C5S83319/ref=sr_1_32?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-32&xpid=FEes0W8Q3djsz'

In [36]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S23.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '67%', 4: '13%', 3: '5%', 2: '3%', 1: '12%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S23/S23-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [37]:
# Write the combined review texts and star ratings into row 366 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[366, "reviews"] = reviews
samsung_df.at[366, "ratings"] = ratings

In [38]:
# Turn row 366 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[366]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.2)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Z Fold 4

In [39]:
# Product URL of the Galaxy Z Fold 4 listing stored under index label 419
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Z Fold 4', 'url'].loc[419]

'https://www.amazon.com/Samsung-Galaxy-Fold-Version-Phantom/dp/B0BKTJH8NZ/ref=sr_1_103?dib=eyJ2IjoiMSJ9.qNIx57btdigk7s-mUMFUobuAO57swX8T0jH6ivWxbg3qxKp2kE--JlgCh6bAA0tjya2MPMWYeWO7sqee5OvY4kEHYw4VuKDKtAk24cat8a40RkhYipj_QQrhVDlBSkhzl_h0GC9HgeLDFJv7xXfzDX53SkXpTvH74mJqAhe1TKsdoyQj3goCDER38gXeh6w42mTu7D5UfC9leoBlhozLbA5OSBAo3gH5he-SeYf_pVNKZ9MlMum8p1HhOue7TGaIXQO6H3XVr0FiZB2HsjXmSaPIUYZZgg34IPmNinDH6SI.XK11Kcl-WiM3o53GKBpm19Sn8CK-QLGWH7R29_FoGuU&dib_tag=se&qid=1769857449&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-103&xpid=FEes0W8Q3djsz'

In [40]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Z Fold 4.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '59%', 4: '11%', 3: '6%', 2: '4%', 1: '20%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Fold4/Z-fold4-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [41]:
# Write the combined review texts and star ratings into row 419 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[419, "reviews"] = reviews
samsung_df.at[419, "ratings"] = ratings

In [42]:
# Turn row 419 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[419]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=3.8)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Z Flip 6

In [43]:
# Product URL of the Galaxy Z Flip 6 listing stored under index label 407
samsung_df.loc[samsung_df['model_name'] == 'Samsung Galaxy Z Flip 6', 'url'].loc[407]

'https://www.amazon.com/Samsung-Unlocked-Smartphone-Camcorder-Interpreter/dp/B0F2BPKXHW/ref=sr_1_89?dib=eyJ2IjoiMSJ9.iBcSzgpkJScsrkfbMhxKA5CsbzbHxNE1s00VSqn7feu6vH1H1uYyS2S0c7osSQlwHMulp1s-poFk5c7ds4rqpivfgGlW_j5aHy7Nmou5vlG6n7DfPRLcSDp3f_p6m3FplWJAvpqA3qbbhtTkf8IgKJ1L_hZQ4I3l1EAvIaLAQ0MvyD_xQgUCS2_ZhX2X16ohJkp9CNQEgkkbwcN4JRD9QefbwdXkd439RVRL2CzglbrWD1uOkk-S1DEXBkE8Nbxu-gkw-RTsSZbZY8UbZ4MZGLWZOf4COecZBoGf5jVXUYw.rO0iqTh-Gar67tBCDR0IxVwBs79S8E397uKGkFyL9gs&dib_tag=se&qid=1769857440&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-89&xpid=FEes0W8Q3djsz'

In [44]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Z Flip 6.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '69%', 4: '9%', 3: '5%', 2: '3%', 1: '14%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Flip6/Flip6-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [45]:
# Write the combined review texts and star ratings into row 407 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[407, "reviews"] = reviews
samsung_df.at[407, "ratings"] = ratings

In [46]:
# Turn row 407 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[407]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.1)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S24

In [47]:
# Product URL of the Galaxy S24 listing stored under index label 368
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S24 5G', 'url'].loc[368]

'https://www.amazon.com/SAMSUNG-Galaxy-S24-Version-128GB/dp/B0D364RMMP/ref=sr_1_34?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-34&xpid=FEes0W8Q3djsz'

In [48]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S24.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '66%', 4: '11%', 3: '5%', 2: '3%', 1: '15%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S24/S24-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [49]:
# Write the combined review texts and star ratings into row 368 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[368, "reviews"] = reviews
samsung_df.at[368, "ratings"] = ratings

In [50]:
# Turn row 368 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[368]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.1)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S23+

In [51]:
# Product URL of the Galaxy S23+ listing stored under index label 363
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S23+ 5G', 'url'].loc[363]

'https://www.amazon.com/SAMSUNG-Galaxy-S23-Plus-Smartphone/dp/B0C4Q796XC/ref=sr_1_27?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-27&xpid=FEes0W8Q3djsz'

In [52]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S23+.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '70%', 4: '12%', 3: '5%', 2: '2%', 1: '11%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S23+/S23+-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [53]:
# Write the combined review texts and star ratings into row 363 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[363, "reviews"] = reviews
samsung_df.at[363, "ratings"] = ratings

In [54]:
# Turn row 363 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[363]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.2)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Z Flip 5

In [55]:
# Product URL of the Galaxy Z Flip5 listing stored under index label 403
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Z Flip5 5G', 'url'].loc[403]

'https://www.amazon.com/SAMSUNG-Galaxy-Unlocked-Smartphone-Graphite/dp/B0CN556RQ4/ref=sr_1_81?dib=eyJ2IjoiMSJ9.iBcSzgpkJScsrkfbMhxKA5CsbzbHxNE1s00VSqn7feu6vH1H1uYyS2S0c7osSQlwHMulp1s-poFk5c7ds4rqpivfgGlW_j5aHy7Nmou5vlG6n7DfPRLcSDp3f_p6m3FplWJAvpqA3qbbhtTkf8IgKJ1L_hZQ4I3l1EAvIaLAQ0MvyD_xQgUCS2_ZhX2X16ohJkp9CNQEgkkbwcN4JRD9QefbwdXkd439RVRL2CzglbrWD1uOkk-S1DEXBkE8Nbxu-gkw-RTsSZbZY8UbZ4MZGLWZOf4COecZBoGf5jVXUYw.rO0iqTh-Gar67tBCDR0IxVwBs79S8E397uKGkFyL9gs&dib_tag=se&qid=1769857440&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-81&xpid=FEes0W8Q3djsz'

In [56]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Z Flip 5.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '67%', 4: '8%', 3: '5%', 2: '5%', 1: '15%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Flip5/Flip5-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [57]:
# Write the combined review texts and star ratings into row 403 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[403, "reviews"] = reviews
samsung_df.at[403, "ratings"] = ratings

In [58]:
# Turn row 403 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[403]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=4.1)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy S22 Ultra

In [59]:
# Product URL of the Galaxy S22 Ultra listing stored under index label 344
samsung_df.loc[samsung_df['model_name'] == 'Galaxy S22 Ultra 5G', 'url'].loc[344]

'https://www.amazon.com/Samsung-Galaxy-S22-5G-Unlocked/dp/B09VD33WHW/ref=sr_1_7?dib=eyJ2IjoiMSJ9.IZthQETfgZ_94BMyZ4wxCZDSyykL3KflyE0J2qS9upHAUT_kwVjHlm79K9ZxYvKjpgZfvsrYtiiGnsUjuzAB8utTTWaPqFi31VL8gRCeq9VdrMUtIt9FXmShfw7jCOOnDUNa6fVJdPeeRhtPhZojc-CCDGdmcSLITd4dNNnsZcsC2k-ZEk1IjSTwBBATiW2Mj3w2mWUxNlbf9wlUK9-XR_Z7l80cwYoEXS0F_65Lf-iBXi8AETMpkPoxqPKKYuPWz9jrtD2viwzNYc82EdkA9I6SgMPwrfYtvAJrEwJXXRs.hTQzOK4_ECLs3fIneUVQ6FUlrl5YKDlfRhbTRABnc9w&dib_tag=se&qid=1769857411&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-7&xpid=FPUY1SF7P-F1N'

In [60]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy S22 Ultra.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '57%', 4: '11%', 3: '5%', 2: '5%', 1: '22%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'S22-U/S22-Ultra-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [61]:
# Write the combined review texts and star ratings into row 344 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[344, "reviews"] = reviews
samsung_df.at[344, "ratings"] = ratings

In [62]:
# Turn row 344 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[344]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=3.7)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Note 20 Ultra

In [63]:
# Product URL of the Galaxy Note 20 Ultra listing stored under index label 376
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Note 20 Ultra 5G', 'url'].loc[376]

'https://www.amazon.com/Samsung-Galaxy-Unlocked-Android-Smartphone/dp/B08HL64JS8/ref=sr_1_44?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-44&xpid=FEes0W8Q3djsz'

In [64]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Note 20 Ultra.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '59%', 4: '11%', 3: '8%', 2: '4%', 1: '18%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Note20-U/Note20-Ultra-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [65]:
# Write the combined review texts and star ratings into row 376 of samsung_df.
# `.at` targets the frame directly; the chained form df[col][row] = value assigns
# into an intermediate Series and is a silent no-op under pandas Copy-on-Write.
samsung_df.at[376, "reviews"] = reviews
samsung_df.at[376, "ratings"] = ratings

In [66]:
# Turn row 376 into a one-row frame, drop the 'Unnamed: 0' column, then attach
# the per-review distribution list and the overall rating
new_df = (
    samsung_df.loc[376]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(rating_distribution=[rating_distribution], overall_rating=3.9)
)

# Collect the finished row
df_finish.append(new_df)

## Galaxy Note 10+

In [67]:
# Product URL of the Galaxy Note 10 Plus listing stored under index label 424
samsung_df.loc[samsung_df['model_name'] == 'Galaxy Note 10 Plus', 'url'].loc[424]

'https://www.amazon.com/Samsung-Galaxy-Note-256GB-Silver/dp/B08KRKFHGV/ref=sr_1_112?dib=eyJ2IjoiMSJ9.qNIx57btdigk7s-mUMFUobuAO57swX8T0jH6ivWxbg3qxKp2kE--JlgCh6bAA0tjya2MPMWYeWO7sqee5OvY4kEHYw4VuKDKtAk24cat8a40RkhYipj_QQrhVDlBSkhzl_h0GC9HgeLDFJv7xXfzDX53SkXpTvH74mJqAhe1TKsdoyQj3goCDER38gXeh6w42mTu7D5UfC9leoBlhozLbA5OSBAo3gH5he-SeYf_pVNKZ9MlMum8p1HhOue7TGaIXQO6H3XVr0FiZB2HsjXmSaPIUYZZgg34IPmNinDH6SI.XK11Kcl-WiM3o53GKBpm19Sn8CK-QLGWH7R29_FoGuU&dib_tag=se&qid=1769857449&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-112&xpid=FEes0W8Q3djsz'

In [68]:
# Combine the per-star review exports (5-star down to 1-star) for the Galaxy Note 10+.
# Rating % distribution value attached to each star tier, keyed by star count.
tier_share = {5: '68%', 4: '13%', 3: '6%', 2: '2%', 1: '11%'}

all_df = []
for i, share in tier_share.items():
    df = pd.read_json(f'Note10+/Note10+-{i}star.json')
    df['rating'] = f'{i}.0'              # same star label on every row of this tier
    df['rating_distribution'] = share
    all_df.append(df)

final_df = pd.concat(all_df, ignore_index=True)

# Pull the combined columns out as plain Python lists
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [69]:
# Store the scraped review texts and star ratings on product row 424.
# Both columns are cast to object first so a single cell can hold a whole
# Python list, and `.at` writes directly into `samsung_df`. The chained form
# `samsung_df[col][idx] = value` assigns through an intermediate Series, which
# pandas does not guarantee will update the frame (and the resulting warning
# is hidden because warnings are globally suppressed in this notebook).
samsung_df = samsung_df.astype({"reviews": object, "ratings": object})
samsung_df.at[424, "reviews"] = reviews
samsung_df.at[424, "ratings"] = ratings

In [70]:
# Turn product row 424 into a one-row frame, drop the leftover CSV index
# column, and attach the rating metadata for this listing.
new_df = (
    samsung_df.loc[424]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(
        rating_distribution=[rating_distribution],  # whole list stored in one cell
        overall_rating=4.2,
    )
)

# Queue the row for the final concat
df_finish.append(new_df)

## Galaxy S22+

In [71]:
# Listing URL for the Samsung S22+ (row label 380 in samsung_df)
is_s22_plus = samsung_df['model_name'] == 'Samsung S22+'
samsung_df.loc[is_s22_plus, 'url'][380]

'https://www.amazon.com/SAMSUNG-Galaxy-S22-Version-Phantom/dp/B0B6D53G2V/ref=sr_1_49?dib=eyJ2IjoiMSJ9.nHV-0jU4w3_sT9KvY_tVrEm1xAio1yPCrFVhT80olTTPmnzzWfo78Yh4calJxQf5eskN_QdZ1RTaXX2AyRyp4SLqTU-ieEBrBURVj3S4qDuO-_U2Brgha-v2zs1DLJZAHd9S9OI1oUaqHxQz2RTyw7lAoZ5VqEsxLjkrV1NoOYDgm5f-71CmfNSIA0RW2UWcLML9B_jyTJlYcvmZV7cXdzzAQv8pXVGDvz3MzH1zEag5ZANh1XWWUEBcIRf9I-Qvq7HpE9D1o7BTgUjBLNTF2_Bhunl5M-t6h5CPB-lrpgU.bgtaWsNhI1o15ZQf6u6uNqHyIiJi4MAX_JOWRUOXLPE&dib_tag=se&qid=1769857430&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-49&xpid=FEes0W8Q3djsz'

In [72]:
# Share of all ratings held by each star tier for this listing
tier_share = {5: '72%', 4: '12%', 3: '7%', 2: '3%', 1: '6%'}

# Read the scraped reviews tier by tier (5 stars down to 1) and tag every row
# with its star rating and that tier's share.
all_df = []
for stars, share in tier_share.items():
    tier_df = pd.read_json(f'S22+/S22+-{stars}star.json')
    n_rows = len(tier_df)
    tier_df['rating'] = [f'{stars}.0'] * n_rows
    tier_df['rating_distribution'] = [share] * n_rows
    all_df.append(tier_df)

final_df = pd.concat(all_df, ignore_index=True)

# Flatten to plain Python lists for the product-level row
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [73]:
# Store the scraped review texts and star ratings on product row 380.
# Both columns are cast to object first so a single cell can hold a whole
# Python list, and `.at` writes directly into `samsung_df`. The chained form
# `samsung_df[col][idx] = value` assigns through an intermediate Series, which
# pandas does not guarantee will update the frame (and the resulting warning
# is hidden because warnings are globally suppressed in this notebook).
samsung_df = samsung_df.astype({"reviews": object, "ratings": object})
samsung_df.at[380, "reviews"] = reviews
samsung_df.at[380, "ratings"] = ratings

In [74]:
# Turn product row 380 into a one-row frame, drop the leftover CSV index
# column, and attach the rating metadata for this listing.
new_df = (
    samsung_df.loc[380]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(
        rating_distribution=[rating_distribution],  # whole list stored in one cell
        overall_rating=4.4,
    )
)

# Queue the row for the final concat
df_finish.append(new_df)

## Galaxy Z Flip 4

In [75]:
# Listing URL for the Galaxy Z Flip 4 (row label 422 in samsung_df)
is_z_flip4 = samsung_df['model_name'] == 'Galaxy Z Flip 4'
samsung_df.loc[is_z_flip4, 'url'][422]

'https://www.amazon.com/Samsung-Galaxy-Flip-Version-Graphite/dp/B0BKTWLR31/ref=sr_1_110?dib=eyJ2IjoiMSJ9.qNIx57btdigk7s-mUMFUobuAO57swX8T0jH6ivWxbg3qxKp2kE--JlgCh6bAA0tjya2MPMWYeWO7sqee5OvY4kEHYw4VuKDKtAk24cat8a40RkhYipj_QQrhVDlBSkhzl_h0GC9HgeLDFJv7xXfzDX53SkXpTvH74mJqAhe1TKsdoyQj3goCDER38gXeh6w42mTu7D5UfC9leoBlhozLbA5OSBAo3gH5he-SeYf_pVNKZ9MlMum8p1HhOue7TGaIXQO6H3XVr0FiZB2HsjXmSaPIUYZZgg34IPmNinDH6SI.XK11Kcl-WiM3o53GKBpm19Sn8CK-QLGWH7R29_FoGuU&dib_tag=se&qid=1769857449&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-110&xpid=FEes0W8Q3djsz'

In [76]:
# Share of all ratings held by each star tier for this listing
tier_share = {5: '57%', 4: '12%', 3: '6%', 2: '4%', 1: '21%'}

# Read the scraped reviews tier by tier (5 stars down to 1) and tag every row
# with its star rating and that tier's share.
all_df = []
for stars, share in tier_share.items():
    tier_df = pd.read_json(f'Flip4/Flip4-{stars}star.json')
    n_rows = len(tier_df)
    tier_df['rating'] = [f'{stars}.0'] * n_rows
    tier_df['rating_distribution'] = [share] * n_rows
    all_df.append(tier_df)

final_df = pd.concat(all_df, ignore_index=True)

# Flatten to plain Python lists for the product-level row
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [77]:
# Store the scraped review texts and star ratings on product row 422.
# Both columns are cast to object first so a single cell can hold a whole
# Python list, and `.at` writes directly into `samsung_df`. The chained form
# `samsung_df[col][idx] = value` assigns through an intermediate Series, which
# pandas does not guarantee will update the frame (and the resulting warning
# is hidden because warnings are globally suppressed in this notebook).
samsung_df = samsung_df.astype({"reviews": object, "ratings": object})
samsung_df.at[422, "reviews"] = reviews
samsung_df.at[422, "ratings"] = ratings

In [78]:
# Turn product row 422 into a one-row frame, drop the leftover CSV index
# column, and attach the rating metadata for this listing.
new_df = (
    samsung_df.loc[422]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(
        rating_distribution=[rating_distribution],  # whole list stored in one cell
        overall_rating=3.8,
    )
)

# Queue the row for the final concat
df_finish.append(new_df)

## Galaxy S23 FE

In [79]:
# Listing URL for the Samsung Galaxy S23 FE (row label 365 in samsung_df)
is_s23_fe = samsung_df['model_name'] == 'Samsung Galaxy S23 FE'
samsung_df.loc[is_s23_fe, 'url'][365]

'https://www.amazon.com/Samsung-Galaxy-Version-128GB-Black/dp/B0CRPQ8W8X/ref=sr_1_31?dib=eyJ2IjoiMSJ9.WK2dYbm7jqrYKzvA_635ffkLZ3yoSlXjjiBx3F0R20HmsmtMFXl5utBxuLtzl5LveVauZfgCiFE7SxRMserK_1mgHMXXPPVUGcdDXZefKQAsIrY-Wm-zLAyyvl1RkJ3o8bA9BQVQYNzoSdchdumoieYo8XyNiGl0Ajq2xITovIuQw9ki618C47OutM78aRW8A65JU5yHaXfWM36FP9oaIgFm2Hfombm727J5m8MN8YN4gyH9BWu8gMnNPt6MyB36UuVFV-wolI2QNcAH0A7Sa2L8RBoPeK0PPOsbJQdJssY.tkVezDMD8UzL1KpR_5lMSbbNxFiCNpZmG59J9O5Yy0A&dib_tag=se&qid=1769857421&refinements=p_123%3A46655&rnid=85457740011&s=wireless&sr=1-31&xpid=FEes0W8Q3djsz'

In [80]:
# Share of all ratings held by each star tier for this listing
tier_share = {5: '72%', 4: '13%', 3: '4%', 2: '2%', 1: '9%'}

# Read the scraped reviews tier by tier (5 stars down to 1) and tag every row
# with its star rating and that tier's share.
all_df = []
for stars, share in tier_share.items():
    tier_df = pd.read_json(f'S23-FE/S23-FE-{stars}star.json')
    n_rows = len(tier_df)
    tier_df['rating'] = [f'{stars}.0'] * n_rows
    tier_df['rating_distribution'] = [share] * n_rows
    all_df.append(tier_df)

final_df = pd.concat(all_df, ignore_index=True)

# Flatten to plain Python lists for the product-level row
reviews = final_df['review-text'].tolist()
ratings = final_df['rating'].astype('float64').tolist()
rating_distribution = final_df['rating_distribution'].tolist()

In [81]:
# Store the scraped review texts and star ratings on product row 365.
# Both columns are cast to object first so a single cell can hold a whole
# Python list, and `.at` writes directly into `samsung_df`. The chained form
# `samsung_df[col][idx] = value` assigns through an intermediate Series, which
# pandas does not guarantee will update the frame (and the resulting warning
# is hidden because warnings are globally suppressed in this notebook).
samsung_df = samsung_df.astype({"reviews": object, "ratings": object})
samsung_df.at[365, "reviews"] = reviews
samsung_df.at[365, "ratings"] = ratings

In [82]:
# Turn product row 365 into a one-row frame, drop the leftover CSV index
# column, and attach the rating metadata for this listing.
new_df = (
    samsung_df.loc[365]
    .to_frame()
    .T
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(
        rating_distribution=[rating_distribution],  # whole list stored in one cell
        overall_rating=4.3,
    )
)

# Queue the row for the final concat
df_finish.append(new_df)

In [83]:
# Stack the queued one-row product frames into a single table with a fresh
# 0..n-1 index, then display it.
df_finish2 = pd.concat(df_finish, axis=0, ignore_index=True)
df_finish2

Unnamed: 0,title,price,battery,weight,description,reviews,ratings,url,img_url,brand,...,cpu_model,cpu_speed,storage,screen_size,resolution,refresh_rate,model_name,weight_gr,rating_distribution,overall_rating
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,7.6 ounces,"Expand what’s possible with Galaxy Z Fold7, th...",[I’ve used Samsung’s Ultra and Note series for...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,Others,4.7,512,8.0,1968 x 2184 pixels,120,Galaxy Z Fold7,215.46,"[80%, 80%, 80%, 80%, 80%, 80%, 80%, 80%, 80%, ...",4.5
1,"SAMSUNG Galaxy S25 Ultra Cell Phone, 256GB AI ...",768.98,5000,7.7 ounces,"Experience the Samsung Galaxy S25 Ultra, a pow...",[Good price for a good phone. This phone looks...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-S25-Ultra-Smart...,https://m.media-amazon.com/images/I/61n0lmxP5-...,Samsung,...,Snapdragon,4.47,256,6.9,1440 x 3120,120,Galaxy S25 Ultra,218.3,"[70%, 70%, 70%, 70%, 70%, 70%, 70%, 70%, 70%, ...",4.1
2,"Samsung Galaxy S25 FE Cell Phone (2025), 256GB...",679.99,4900,6.7 ounces,Powerful where it counts and smart where it ma...,[This is a very nice phone at a reasonable pri...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/51Q0M3d7Qd...,Samsung,...,Exynos 2400 S5E9945,2.2,256,6.7,2340 x 1080,120,Galaxy S25 FE,189.95,"[82%, 82%, 82%, 82%, 82%, 82%, 82%, 82%, 82%, ...",4.6
3,"Samsung Galaxy Z Fold 6 5G US Version, 512GB, ...",674.99,4400,8.3 ounces,The Samsung Galaxy Z Fold6 SM-F956U combines c...,"[This is my first foldable, coming from the No...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-Galaxy-Version-...,https://m.media-amazon.com/images/I/61L8NIa+Gj...,Samsung,...,Snapdragon,3.39,512,7.6,2160 x 1856,120,Samsung Galaxy Fold 6,235.31,"[69%, 69%, 69%, 69%, 69%, 69%, 69%, 69%, 69%, ...",4.3
4,"Samsung Galaxy Z Flip7 FE Cell Phone, 256GB AI...",639.99,4000,11.6 ounces,Convenience is key when it comes to your every...,[The product arrived as it should. It works gr...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/619w3H2V-3...,Samsung,...,Exynos 2400 S5E9945,3.2,256,6.7,1080 x 2400,120,Galaxy Z Flip7 FE,328.86,"[78%, 78%, 78%, 78%, 78%, 78%, 78%, 78%, 78%, ...",4.5
5,SAMSUNG Galaxy S24 Ultra 5G Factory Unlocked 2...,574.99,5000,8.2 ounces,Do more with the most epic Galaxy yet. Wonderi...,[I ordered this phone back in February & back ...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-Galaxy-S24-Ultr...,https://m.media-amazon.com/images/I/51E3rux4Dg...,Samsung,...,Snapdragon,2.9,256,6.8,1440 x 2960,120,Samsung Galaxy S24 Ultra 5G,232.47,"[73%, 73%, 73%, 73%, 73%, 73%, 73%, 73%, 73%, ...",4.3
6,SAMSUNG Galaxy S23 Ultra (Renewed) 5G Factory ...,449.0,5000,8.3 ounces,PRODUCT OVERVIEW The Samsung Galaxy S23 Series...,[I was hesitant to upgrade my trusty S10 but i...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-Galaxy-Ultra-Fa...,https://m.media-amazon.com/images/I/51ZZO2wp8E...,Samsung,...,Snapdragon,3.36,512,6.8,1440 x 3200,120,Samsung Galaxy S23 Ultra 5G,235.31,"[64%, 64%, 64%, 64%, 64%, 64%, 64%, 64%, 64%, ...",4.0
7,"SAMSUNG Galaxy S25 Cell Phone, 128GB AI Smartp...",439.98,4000,5.7 ounces,Make life easier with an epic phone that reall...,[I almost didn’t buy this phone because of som...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-Smartphone-Unlo...,https://m.media-amazon.com/images/I/61C17Al0dh...,Samsung,...,Snapdragon,4.47,128,6.2,2340 x 1080,120,Galaxy S25,161.6,"[69%, 69%, 69%, 69%, 69%, 69%, 69%, 69%, 69%, ...",4.0
8,"Samsung Galaxy S23 5G, US Version, 128GB, Phan...",245.53,3900,5.9 ounces,"Meet Galaxy S23, the phone takes you out of th...","[Let me start by saying, wow! The SAMSUNG Gala...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/SAMSUNG-Galaxy-S23-Vers...,https://m.media-amazon.com/images/I/6133nDu6wc...,Samsung,...,Snapdragon,3.36,128,6.1,1080 x 2340,120,Galaxy S23,167.26,"[67%, 67%, 67%, 67%, 67%, 67%, 67%, 67%, 67%, ...",4.2
9,"Samsung Galaxy Z Fold 4 5G US Version, 512GB, ...",390.4,4400,9.1 ounces,The Samsung Galaxy Z Fold4 is a foldable smart...,[I'm seriously loving this phone. I'd been eye...,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",https://www.amazon.com/Samsung-Galaxy-Fold-Ver...,https://m.media-amazon.com/images/I/61bnnuvIco...,Samsung,...,Snapdragon,2.75,512,7.6,1812 x 2176,120,Galaxy Z Fold 4,257.98,"[59%, 59%, 59%, 59%, 59%, 59%, 59%, 59%, 59%, ...",3.8


# **IV. DATA CLEANING**

In [85]:
# Cast the product-level columns to concrete numeric dtypes in one pass.
df_finish2 = df_finish2.astype({
    'price': 'float64',
    'battery': 'int64',
    'ram': 'int64',
    'cpu_speed': 'float64',
    'screen_size': 'float64',
    'refresh_rate': 'int64',
    'weight_gr': 'float64',
})

# Storage: a recorded value of 12 is rewritten to 128 before the integer cast
# (presumably a truncated "128" from the upstream parsing step; TODO confirm).
df_finish2['storage'] = df_finish2['storage'].replace(12, 128).astype('int64')

# Resolution: remove a trailing "pixels" unit.
# `.str.strip(' pixels')` treats its argument as a *set of characters* to trim
# from both ends, not as a suffix, so it only worked while every value began
# and ended with a digit. Remove the literal suffix (any letter case) instead,
# then trim surrounding whitespace.
df_finish2['resolution'] = (
    df_finish2['resolution']
    .str.replace(r'\s*pixels\s*$', '', case=False, regex=True)
    .str.strip()
)

In [86]:
# Remove the free-text `weight` column; the numeric `weight_gr` column is kept
df_finish2 = df_finish2.drop(columns=['weight'])

In [87]:
# Summary: print each column's dtype and non-null count for df_finish2
df_finish2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                20 non-null     object 
 1   price                20 non-null     float64
 2   battery              20 non-null     int64  
 3   description          20 non-null     object 
 4   reviews              20 non-null     object 
 5   ratings              20 non-null     object 
 6   url                  20 non-null     object 
 7   img_url              20 non-null     object 
 8   brand                20 non-null     object 
 9   os                   20 non-null     object 
 10  ram                  20 non-null     int64  
 11  cpu_model            20 non-null     object 
 12  cpu_speed            20 non-null     float64
 13  storage              20 non-null     int64  
 14  screen_size          20 non-null     float64
 15  resolution           20 non-null     objec

In [88]:
# Expand the three list-valued columns in lockstep so each review gets its own
# row, then store the per-review star rating as float64.
list_columns = ['reviews', 'ratings', 'rating_distribution']
df_finish2 = (
    df_finish2
    .explode(list_columns)
    .astype({'ratings': 'float64'})
)

In [89]:
# Filter English comments only
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect samples randomly internally; fixing the seed makes the detected
# language (and therefore the filtered rows) reproducible across runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Returns 'unknown' when `text` is not a string (e.g. a missing review)
    or when langdetect cannot detect a language for it.
    """
    if not isinstance(text, str):
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        # Only detection failures are treated as 'unknown'; anything else
        # (including a kernel interrupt) is allowed to propagate.
        return 'unknown'


# Tag every review with its detected language
df_finish2['language'] = df_finish2['reviews'].apply(detect_language)

# Keep only the rows detected as English
clean_df = df_finish2[df_finish2['language']=='en']
clean_df

Unnamed: 0,title,price,battery,description,reviews,ratings,url,img_url,brand,os,...,cpu_speed,storage,screen_size,resolution,refresh_rate,model_name,weight_gr,rating_distribution,overall_rating,language
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",I’ve used Samsung’s Ultra and Note series for ...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,"Android 16, OneUI 8",...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",This phone is worth every penny! I am upgradin...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,"Android 16, OneUI 8",...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",I've been a lifelong Samsung user. I've had th...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,"Android 16, OneUI 8",...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...","So far, the item is great and very practical. ...",5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,"Android 16, OneUI 8",...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",Switched from iOS to android with this purchas...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,"Android 16, OneUI 8",...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,"Samsung Galaxy S23 FE 5G, US Version, 128GB, B...",218.75,4500,Introducing the Samsung Galaxy S23 FE SM-S711U...,Very disappointed be the company[seller] and p...,1.0,https://www.amazon.com/Samsung-Galaxy-Version-...,https://m.media-amazon.com/images/I/51l6YHB2x4...,Samsung,Android,...,2.8,128,6.4,2340 x 1080,120,Samsung Galaxy S23 FE,209.79,9%,4.3,en
19,"Samsung Galaxy S23 FE 5G, US Version, 128GB, B...",218.75,4500,Introducing the Samsung Galaxy S23 FE SM-S711U...,"Phone was not unlocked, phone states it is own...",1.0,https://www.amazon.com/Samsung-Galaxy-Version-...,https://m.media-amazon.com/images/I/51l6YHB2x4...,Samsung,Android,...,2.8,128,6.4,2340 x 1080,120,Samsung Galaxy S23 FE,209.79,9%,4.3,en
19,"Samsung Galaxy S23 FE 5G, US Version, 128GB, B...",218.75,4500,Introducing the Samsung Galaxy S23 FE SM-S711U...,Refurbished: the phone is dirty and had stains...,1.0,https://www.amazon.com/Samsung-Galaxy-Version-...,https://m.media-amazon.com/images/I/51l6YHB2x4...,Samsung,Android,...,2.8,128,6.4,2340 x 1080,120,Samsung Galaxy S23 FE,209.79,9%,4.3,en
19,"Samsung Galaxy S23 FE 5G, US Version, 128GB, B...",218.75,4500,Introducing the Samsung Galaxy S23 FE SM-S711U...,this phone was supposed to be a verison phone ...,1.0,https://www.amazon.com/Samsung-Galaxy-Version-...,https://m.media-amazon.com/images/I/51l6YHB2x4...,Samsung,Android,...,2.8,128,6.4,2340 x 1080,120,Samsung Galaxy S23 FE,209.79,9%,4.3,en


# **V. SAVING DATAFRAME**

In [94]:
# Persist the English-only review table; the DataFrame index is written as the
# first, unnamed CSV column (pandas default).
clean_df.to_csv(path_or_buf='clean.csv')