## Import libraries

In [None]:
import os
import pandas as pd

In [None]:
# Folder that holds the per-source opinion files, and the name of the merged
# CSV written at the end of the run.
input_folder = './opinions'
output_filename = 'consolidated_opinions.csv'

# Keep only regular *.csv files: os.listdir() also returns sub-directories and
# unrelated files, which would otherwise be counted as "CSV files" and fail to
# parse. The names are sorted because os.listdir() returns entries in arbitrary
# order, and the processing order determines the unified_id each row receives.
all_files = sorted(
    name
    for name in os.listdir(input_folder)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(input_folder, name))
)

# Collects one normalised dataframe per successfully processed file.
df_list = []

# Column layout (names and order) of the consolidated output.
final_columns = [
    'unified_id',
    'original_id',
    'product_name',
    'sentiment',
    'opinion',
    'score',
    'date',
    'source_file',
]

# Maps every header spelling seen in the source files (after whitespace is
# stripped) to its unified column name. Keys are matched exactly, so each
# capitalisation or misspelling variant needs its own entry.
column_mapping = {
    'product_name': 'product_name',
    'Product Name': 'product_name',
    'opinion_id': 'original_id',
    'Opinion ID': 'original_id',
    'id': 'original_id',
    'sentiment': 'sentiment',
    'Sentiment': 'sentiment',
    'sntiment': 'sentiment',
    'statement': 'sentiment',
    'opinion': 'opinion',
    'Opinion': 'opinion',
    'opinion(text)': 'opinion',
    'score': 'score',
    'Score': 'score',
    'date': 'date',
    'Date': 'date',
}

print(f"Found {len(all_files)} CSV files. Processing...")

# Normalise each input file to the unified column layout. A failure in one
# file is reported and that file is skipped; the remaining files still run.
for filename in all_files:
    # Build the path from input_folder so the loop follows the configured
    # folder instead of a second, hard-coded copy of it.
    file_path = os.path.join(input_folder, filename)

    try:
        # sep=None asks pandas to detect the delimiter. Only the python engine
        # supports that, so it is requested explicitly; otherwise pandas emits
        # a ParserWarning for every file before falling back to it.
        df = pd.read_csv(file_path, sep=None, engine='python')

        # Headers may carry stray whitespace; strip it before the exact-match
        # rename to the unified names.
        df.columns = df.columns.str.strip()
        df.rename(columns=column_mapping, inplace=True)

        # Two source headers mapping to the same unified name would leave
        # duplicate columns that break the later concatenation. Fail here so
        # the problem is reported against the offending file.
        duplicated = sorted(
            {c for c in df.columns[df.columns.duplicated()] if c in final_columns}
        )
        if duplicated:
            raise ValueError(f"multiple source columns map to {duplicated}")

        df['source_file'] = os.path.basename(filename)

        if 'opinion' in df.columns:
            # Collapse embedded line breaks so each opinion stays on one CSV
            # row. astype(str) would turn missing values into the text 'nan',
            # so rows that were missing are restored to missing afterwards.
            has_text = df['opinion'].notna()
            cleaned = (
                df['opinion']
                .astype(str)
                .str.replace(r'[\r\n]+', ' ', regex=True)
                .str.strip()
            )
            df['opinion'] = cleaned.where(has_text)

        # Adds missing columns with empty values to ensure consistent structure
        for col in final_columns:
            if col not in df.columns:
                df[col] = None

        # Filters and reorders columns to match the final structure
        df = df[final_columns]

        df_list.append(df)
        print(f"Successfully processed: {os.path.basename(filename)}")

    except Exception as e:
        # Deliberately broad: any per-file problem is logged and skipped.
        print(f"Error processing {filename}: {e}")

if not df_list:
    # pd.concat() raises ValueError on an empty list, so stop here instead of
    # falling through to the concatenation.
    print("No data frames to concatenate.")
else:
    # Combines all dataframes into a single dataframe
    unified_df = pd.concat(df_list, ignore_index=True)

    # Creates a new sequential ID (starting at 1) to resolve overlapping
    # original IDs coming from different source files
    unified_df['unified_id'] = range(1, len(unified_df) + 1)

    # Exports the consolidated data to a CSV file
    unified_df.to_csv(output_filename, index=False)
    print(f"\nSuccess! Unified data saved to: {output_filename}")
    print(f"Total records: {len(unified_df)}")