# Reviews combination

This notebook combines the reviews from all the datasets identified as useful into a single dataset of reviews with the following structure:
- id_review
- review
- original_dataset
- original_id
- service_type

### Preprocessing

Preprocessing is necessary for the European hotel reviews and Booking datasets because each review is split into two columns: positive and negative. In our initial approach, we do not need to separate the reviews into two groups, so the content of these two columns is merged into a single column.
To make the pipeline easier to read, the two resulting dataframes are saved in CSV format.

In [None]:
import polars as pl

# Load the Booking dataset, whose reviews are split across two text columns.
df = pl.read_csv("../data/processed/dataset/data_booking.csv")

# Merge the review_positive and review_negative columns into one "review".
# `pl.concat_str(..., ignore_nulls=True)` is used rather than `+` because `+`
# propagates nulls: a row with only one side filled in would end up with a
# null review, and the text that is present would be lost.
df = df.with_columns(
    pl.concat_str(
        [pl.col("review_positive"), pl.col("review_negative")],
        separator=" ",
        ignore_nulls=True,
    ).alias("review")
)

# Remove the original columns
df = df.drop(["review_positive", "review_negative"])

print(df.head())

# Save in CSV; the pipeline loads this file like any other dataset
df.write_csv("../data/processed/data_booking.csv")


In [None]:
import polars as pl

# Load the European hotel reviews dataset, whose reviews are split across
# two text columns.
df = pl.read_csv("../data/processed/dataset/data_european_hotel_reviews.csv")

# Merge the Negative_Review and Positive_Review columns into one "review".
# `pl.concat_str(..., ignore_nulls=True)` is used rather than `+` because `+`
# propagates nulls: a row with only one side filled in would end up with a
# null review, and the text that is present would be lost.
df = df.with_columns(
    pl.concat_str(
        [pl.col("Negative_Review"), pl.col("Positive_Review")],
        separator=" ",
        ignore_nulls=True,
    ).alias("review")
)

# Remove the original columns
df = df.drop(["Negative_Review", "Positive_Review"])

print(df.head())

# Save in CSV; the pipeline loads this file like any other dataset
df.write_csv("../data/processed/data_european_hotel_reviews.csv")


## Pipeline

In [None]:
import polars as pl

def load_dataset(path: str, review_col: str, id_col: str, dataset_name: str, service_type: str) -> pl.DataFrame:
    """
    Read one review CSV and reshape it to the standard review format.

    The file is read as latin1 with `ignore_errors=True`; the review and id
    columns are renamed, and two constant columns describing the source are
    added to every row.

    Args:
        path: location of the CSV file to read.
        review_col: name of the column holding the review text.
        id_col: name of the column holding the dataset's own identifier.
        dataset_name: value stored in `original_dataset` for every row.
        service_type: value stored in `service_type` for every row
            (e.g., "accommodation", "restauration", etc.).

    Returns:
        A DataFrame with the columns `review`, `original_dataset`,
        `original_id` and `service_type`, in that order.
    """
    raw = pl.read_csv(path, encoding="latin1", ignore_errors=True)

    # Keep only the two source columns we need, under their standard names
    standard_names = {review_col: "review", id_col: "original_id"}
    reviews = raw.rename(standard_names).select(["review", "original_id"])

    # Tag every row with the dataset it came from and its service type
    source_tags = [
        pl.lit(dataset_name).alias("original_dataset"),
        pl.lit(service_type).alias("service_type"),
    ]
    tagged = reviews.with_columns(source_tags)

    # Fix the final column order
    return tagged.select(["review", "original_dataset", "original_id", "service_type"])


In [None]:
# Load multiple datasets into memory.
# Each source is described once as
#   (directory, dataset name, review column, service type)
# The CSV file is "<directory>/<dataset name>.csv" and the identifier column
# is "id" for every source.
source_dir = "../data/processed/dataset"
# The two datasets below this directory are the ones written by the
# preprocessing cells.
preprocessed_dir = "../data/processed"

dataset_specs = [
    (source_dir, "data_activities_reviews", "Text", "leisure"),
    (source_dir, "data_airline_reviews_1", "ReviewBody", "transportation"),
    (source_dir, "data_airline_reviews_2", "Reviews", "transportation"),
    (source_dir, "data_european_restaurant_reviews", "Review", "restauration"),
    (source_dir, "data_hotel_reviews_1", "reviews.text", "accommodation"),
    (source_dir, "data_hotel_reviews_2", "reviews.text", "accommodation"),
    (source_dir, "data_hotel_reviews_3", "reviews.text", "accommodation"),
    (source_dir, "data_restaurant_reviews_1", "Review", "restauration"),
    (source_dir, "data_restaurant_reviews_2", "Review", "restauration"),
    (source_dir, "data_tripadvisor_hotel_reviews", "text", "accommodation"),
    (source_dir, "data_twitter", "Text", "undetermined"),
    (preprocessed_dir, "data_booking", "review", "accommodation"),
    (preprocessed_dir, "data_european_hotel_reviews", "review", "accommodation"),
]

dfs = [
    load_dataset(f"{folder}/{name}.csv", review_col, "id", name, service_type)
    for folder, name, review_col, service_type in dataset_specs
]


In [None]:
# Stack all the standardized DataFrames on top of each other
stacked = pl.concat(dfs, rechunk=True)

# Build an artificial 0-based id and place it in front as the first column
id_column = pl.Series("id_review", range(stacked.height))
df_all = pl.concat([id_column.to_frame(), stacked], how="horizontal")

# Yelp is not included here: it is processed on its own in the Yelp cell,
# which writes a separate CSV.

# Save the global dataset
output_path = "../data/processed/all_reviews.csv"
df_all.write_csv(output_path)

print("The global dataset has been created: data/processed/all_reviews.csv")
print(df_all.shape)
print(df_all.head())

In [None]:
# Attempt

# Yelp processing: reshape the raw Yelp reviews to the same column layout as
# the global dataset and write them to their own CSV.
import polars as pl
df = pl.read_ndjson('../data/original/yelp_dataset/yelp_academic_dataset_review.json')
print(df.head())

# NOTE(review): every Yelp review is tagged "accommodation" here - confirm
# that this is the intended service_type for this dataset.
df_yelp = (
    df
    .rename({"text": "review", "review_id": "original_id"}).select(["review", "original_id"])
    .with_columns([
        pl.lit("data_yelp_reviews").alias("original_dataset"),
        pl.lit("accommodation").alias("service_type")
    ])
    .select(["review", "original_dataset", "original_id", "service_type"])
)

# Continue the id_review numbering right after the last id of the global
# dataset. The offset is read from all_reviews.csv instead of being
# hard-coded, so the ids stay unique and contiguous when the set of source
# datasets (and therefore the row count) changes. This requires
# all_reviews.csv to have been written beforehand.
last_id = (
    pl.scan_csv("../data/processed/all_reviews.csv")
    .select(pl.col("id_review").max())
    .collect()
    .item()
)
# max() is null when the global dataset has no rows; start at 0 in that case
first_yelp_id = 0 if last_id is None else last_id + 1

id_column = pl.Series("id_review", range(first_yelp_id, first_yelp_id + df_yelp.height))
df_yelp = pl.concat([id_column.to_frame(), df_yelp], how="horizontal")

print(df_yelp.head())
df_yelp.write_csv("../data/processed/all_reviews_yelp.csv")

### Concatenate the two CSV files in streaming mode to avoid memory overload.

In [None]:
import polars as pl

# Both files must expose original_id with the same type before they can be
# stacked, so the same cast to string is applied to each of them.
original_id_as_text = pl.col("original_id").cast(pl.Utf8)

# Lazy scans: nothing is loaded into memory at this point
df1 = pl.scan_csv("../data/processed/all_reviews.csv").with_columns(original_id_as_text)
df2 = pl.scan_csv("../data/processed/all_reviews_yelp.csv").with_columns(original_id_as_text)

# Lazy concatenation of the two scans
merged = pl.concat([df1, df2])

# Stream the result straight to a new CSV file
merged.sink_csv("../data/processed/all_merge_reviews.csv")
