# 📊 Real Estate Data Consolidation and GCS Upload Script

"""
Purpose:
This script automates the process of consolidating large real estate data chunks from GCS,
performing feature engineering, and uploading the processed file back to GCS.

Key Steps:
1. Downloading cleaned data chunks from GCS.
2. Merging chunks into a single consolidated DataFrame.
3. Feature Engineering:
   - Extracting Year, Month, Quarter from Date_of_Transfer.
   - Calculating Region Code from Postcode.
4. Saving the merged data to a local file.
5. Uploading the consolidated file to GCS.

Expected Output:
- A consolidated CSV file containing merged and enhanced data uploaded to GCS.

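Requirements:
- Google Cloud SDK installed and authenticated. As written, the script authenticates
  through google.colab (auth.authenticate_user()), so it expects a Google Colab runtime.
- Python libraries: pandas, google-cloud-storage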

Author: Nguyen Minh Tri
Date: April 2025
"""


In [None]:
# Authenticate the notebook user before any Google Cloud client is created.
# NOTE(review): google.colab is only importable inside a Colab runtime - confirm
# this cell is skipped or replaced when the script runs anywhere else.
from google.colab import auth
auth.authenticate_user()

In [None]:
import pandas as pd
import os
from google.cloud import storage

# Step 1: Download data from GCS
# Source bucket, the object prefix that is listed for chunk files (and that the
# consolidated file is later uploaded under), and the local working directory.
bucket_name = "boothill2001-dataset"
gcs_path = "uk_property_data/processed/"
local_path = "data/processed/"

# Shared GCS handles; `bucket` is used by both the download and upload helpers.
client = storage.Client()
bucket = client.bucket(bucket_name)
# Create the local working directory up front; exist_ok makes re-runs harmless.
os.makedirs(local_path, exist_ok=True)

# Download all chunk files from GCS
def download_chunks(name_prefix='cleaned_real_estate_part_'):
    """Download the chunk CSV files found under ``gcs_path`` into ``local_path``.

    Args:
        name_prefix: Only objects whose base name starts with this prefix (and
            ends with ``.csv``) are downloaded. The default matches the chunk
            naming that the merge step looks for, and it skips the consolidated
            file this script uploads under the same GCS prefix, so a re-run
            does not pull the full merged output back down. Pass ``''`` to
            download every ``.csv`` object under the prefix.

    Returns:
        A list with the local path of every file that was downloaded, in
        listing order.
    """
    downloaded = []
    for blob in bucket.list_blobs(prefix=gcs_path):
        file_name = os.path.basename(blob.name)
        if not (file_name.startswith(name_prefix) and file_name.endswith('.csv')):
            continue
        local_file_path = os.path.join(local_path, file_name)
        blob.download_to_filename(local_file_path)
        print(f"Downloaded {blob.name} to {local_file_path}")
        downloaded.append(local_file_path)
    return downloaded


downloaded_chunks = download_chunks()
if downloaded_chunks:
    print("All data chunks downloaded successfully!")
else:
    # Do not claim success when the listing produced nothing to download.
    print(f"No chunk files found under gs://{bucket_name}/{gcs_path}")

# Step 2: Merge Data Chunks
def _chunk_sort_key(file_name):
    """Sort key that orders chunk files by their numeric suffix.

    A plain string sort places ``part_10`` before ``part_2``; comparing the
    trailing number keeps the chunks in their numbered order. Names without a
    numeric suffix sort after the numbered ones, alphabetically.
    """
    stem = os.path.splitext(file_name)[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), file_name)
    return (1, 0, file_name)


chunk_files = sorted(
    (f for f in os.listdir(local_path) if f.startswith('cleaned_real_estate_part_')),
    key=_chunk_sort_key,
)

print(f"Found {len(chunk_files)} chunks. Merging...")

if not chunk_files:
    # Fail here with a clear message instead of continuing with an empty
    # DataFrame that has none of the columns the later steps need.
    raise FileNotFoundError(
        f"No 'cleaned_real_estate_part_*' files found in {local_path}; nothing to merge."
    )

# Read every chunk first and concatenate once. Concatenating inside the loop
# re-copies the whole accumulated frame on each iteration.
chunk_frames = [
    pd.read_csv(os.path.join(local_path, file), low_memory=False)
    for file in chunk_files
]
combined_df = pd.concat(chunk_frames, ignore_index=True)
del chunk_frames  # release the per-chunk frames once they have been merged

print(f"Data merged successfully! Shape: {combined_df.shape}")

# Step 3: Feature Engineering
# Parse the transfer dates once and reuse the result for all three calendar
# features instead of re-parsing the whole column for each one.
transfer_dates = pd.to_datetime(combined_df['Date_of_Transfer'])
combined_df['Year'] = transfer_dates.dt.year
combined_df['Month'] = transfer_dates.dt.month
combined_df['Quarter'] = transfer_dates.dt.quarter
# Region code = the first run of capital letters found in the postcode; rows
# with no capital letters get NaN. expand=False yields a Series rather than a
# one-column DataFrame, which is the natural shape for a column assignment.
combined_df['Region_Code'] = combined_df['Postcode'].str.extract(r'([A-Z]+)', expand=False)

# Save the consolidated and processed file
# Build the path from local_path (the directory created and used by the earlier
# steps) so the output location cannot drift from it; the resulting string is
# the same 'data/processed/cleaned_real_estate_combined.csv' as before.
output_path = os.path.join(local_path, 'cleaned_real_estate_combined.csv')
combined_df.to_csv(output_path, index=False)
print(f"Consolidated data saved at {output_path}!")

# Step 4: Upload processed file to GCS
def upload_to_gcs(local_file_path, gcs_file_path):
    """Upload a local file to the module-level ``bucket``.

    Args:
        local_file_path: Path of the file on local disk.
        gcs_file_path: Object name to write inside the bucket.

    Returns:
        True when the upload completed, False when it raised. The error is
        printed rather than propagated, so the caller must check the result.
    """
    try:
        blob = bucket.blob(gcs_file_path)
        blob.upload_from_filename(local_file_path)
    except Exception as e:
        # Best-effort boundary: report the failure and signal it to the caller.
        print(f"Error uploading {local_file_path}: {e}")
        return False
    print(f"Uploaded {local_file_path} to GCS as {gcs_file_path}")
    return True


# Only announce success when the upload actually went through.
if upload_to_gcs(output_path, gcs_path + 'cleaned_real_estate_combined.csv'):
    print("All processed files uploaded successfully!")
else:
    print("Upload did not complete; see the error above.")


Downloaded uk_property_data/processed/cleaned_real_estate_part_1.csv to data/processed/cleaned_real_estate_part_1.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_10.csv to data/processed/cleaned_real_estate_part_10.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_11.csv to data/processed/cleaned_real_estate_part_11.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_12.csv to data/processed/cleaned_real_estate_part_12.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_13.csv to data/processed/cleaned_real_estate_part_13.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_14.csv to data/processed/cleaned_real_estate_part_14.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_15.csv to data/processed/cleaned_real_estate_part_15.csv
Downloaded uk_property_data/processed/cleaned_real_estate_part_16.csv to data/processed/cleaned_real_estate_part_16.csv
Downloaded uk_property_data/processed/clea