### Importing and cleaning the data

In [1]:
import pandas as pd
# Load the raw Spotify reviews export.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the CSV when running the notebook elsewhere.
df = pd.read_csv('/Users/masa/Desktop/assessment-rag/SPOTIFY_REVIEWS.csv')
# Data cleaning process

# Drop the index column 'Unnamed: 0' as it's not needed
cleaned_data = df.drop(columns=['Unnamed: 0'])

# Basic cleaning of the review_text by:
# - Lowercasing all text for consistency
# - Removing leading/trailing spaces (if any)
normalized_text = cleaned_data['review_text'].str.lower().str.strip()

# A review that is empty after stripping (e.g. whitespace-only) carries no text.
# Mark it as missing so the dropna below removes it instead of letting an empty
# string flow into the text used for embeddings.
cleaned_data['review_text'] = normalized_text.mask(normalized_text.eq(''))

# Remove rows with missing or null values in the critical columns
cleaned_data = cleaned_data.dropna(subset=['review_text', 'review_rating'])

# Show the first rows as a quick sanity check of the cleaned frame
cleaned_data.head()


Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp
0,14a011a8-7544-47b4-8480-c502af0ac26f,152618553977019693742,A Google user,use it every day,5,1,1.1.0.91,2014-05-27 14:21:48
1,bfa8876b-470e-4640-83a7-77427f7f37e8,234382942865437071667,A Google user,"i enjoy the awesome ui of this app, and it has...",5,4,1.1.0.91,2014-05-27 14:36:02
2,70e8252f-058a-47d9-b066-df9e1571c970,174473604608358796368,A Google user,love it! especially the new design!,5,2,1.1.0.91,2014-05-27 14:40:01
3,672a155a-e81e-4d28-bdeb-a74c031bc072,286593453219054880269,A Google user,"awesome ui, best music app out there!",5,1,1.1.0.91,2014-05-27 15:17:20
4,bbc1bf95-ed36-41a1-8b98-0f2e314caea5,167276875678680630145,A Google user,as a professional android developer i'm glad t...,5,10,1.1.0.91,2014-05-27 15:26:48


#### Creating a new column that combines the review text with its metadata (used for generating embeddings)

In [2]:
# Compose one text string per review that carries the review text together with
# its rating, like count, app version and timestamp.
# The columns are iterated in lockstep instead of using DataFrame.apply(axis=1):
# apply builds a Series object for every row, which is needlessly slow on a
# frame with millions of rows. Each value is still formatted by the same
# f-string template, and the list is assigned positionally, one entry per row.
cleaned_data['review_text_with_metadata'] = [
    f"User_review_text: {text} | Rating_by_user: {rating} | Review_likes_received_by_other_users: {likes} | App_Version: {version} | Timestamp_review_posted: {timestamp}"
    for text, rating, likes, version, timestamp in zip(
        cleaned_data['review_text'],
        cleaned_data['review_rating'],
        cleaned_data['review_likes'],
        cleaned_data['author_app_version'],
        cleaned_data['review_timestamp'],
    )
]

In [3]:
# Preview the first five composed strings to check the format
cleaned_data.loc[:, 'review_text_with_metadata'].head(n=5)

0    User_review_text: use it every day | Rating_by...
1    User_review_text: i enjoy the awesome ui of th...
2    User_review_text: love it! especially the new ...
3    User_review_text: awesome ui, best music app o...
4    User_review_text: as a professional android de...
Name: review_text_with_metadata, dtype: object

### Saving the cleaned data to a new CSV file

In [4]:
# Write the cleaned reviews to disk; the DataFrame index is not written.
cleaned_csv_path = '/Users/masa/Desktop/assessment-rag/CLEANED_SPOTIFY_REVIEWS_v2.csv'
cleaned_data.to_csv(cleaned_csv_path, index=False)

In [5]:
# Report how many reviews remain after cleaning
row_count = len(cleaned_data.index)
print(f"Number of rows in the cleaned dataset: {row_count}")


Number of rows in the cleaned dataset: 3376401
