In [None]:
import io
import os

from google.cloud import storage

In [None]:
print("Current Working Directory:", os.getcwd())
# Point the Google client libraries at the service-account key file, but only
# when the variable is not already set, so credentials configured outside the
# notebook (shell, container, CI) are not overwritten. The default path is
# relative to the working directory printed above.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "../../../secrets/ac215-privasee-datapipeline.json",
)
# Initialize the GCS client used by the helper functions in this notebook.
storage_client = storage.Client()

In [None]:

from typing import Dict


In [None]:
def upload_df_to_gcs(bucket_name, df, destination_blob_name):
    """Serialize a DataFrame to CSV in memory and store it as a GCS object.

    Args:
        bucket_name: Name of the target bucket.
        df: DataFrame to upload; it is written without its index column.
        destination_blob_name: Object name (path) inside the bucket.

    Uses the module-level ``storage_client`` and prints a confirmation line
    once the upload call has returned.
    """
    # Serialize first, so nothing is sent if the conversion fails.
    payload = df.to_csv(index=False)

    # Resolve the destination object and push the CSV text to it.
    target = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_string(payload, content_type='text/csv')

    print(f"Uploaded DataFrame to {destination_blob_name} in bucket {bucket_name}.")

def read_csv_from_gcs(bucket_name, source_blob_name):
    """Fetch a CSV object from GCS and parse it into a DataFrame.

    Args:
        bucket_name: Name of the bucket holding the object.
        source_blob_name: Object name (path) inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the downloaded text.

    Uses the module-level ``storage_client``.
    """
    source = storage_client.bucket(bucket_name).blob(source_blob_name)
    # Pull the whole object into memory as text (presumably UTF-8 -- confirm
    # against the blob's encoding settings).
    csv_text = source.download_as_text()
    # Wrap the text in a file-like buffer so pandas can parse it directly.
    buffer = io.StringIO(csv_text)
    return pd.read_csv(buffer)
def save_weights_to_csv(category_weights: Dict[str, float], filepath: str):
    """Write category weights to a two-column CSV file.

    Args:
        category_weights: Mapping of parent category name to its weight.
        filepath: Destination path of the CSV file.

    The file has the columns ``parent_category`` and ``weight`` and no index
    column; rows follow the mapping's iteration order.
    """
    categories = list(category_weights.keys())
    weights = list(category_weights.values())
    table = pd.DataFrame({'parent_category': categories, 'weight': weights})
    table.to_csv(filepath, index=False)

def load_weights_from_csv(filepath: str) -> Dict[str, float]:
    """Read category weights from a CSV file into a plain dict.

    Args:
        filepath: Path of a CSV file with ``parent_category`` and ``weight``
            columns.

    Returns:
        A dict mapping each parent category to its weight, in file order. If
        a category appears more than once, the last row wins.
    """
    table = pd.read_csv(filepath)
    categories = table['parent_category']
    weights = table['weight']
    return {category: weight for category, weight in zip(categories, weights)}

# Create the reference mapping DataFrame

In [None]:
import pandas as pd
from typing import List, Dict


In [None]:
# List all buckets in your project to confirm the client works
# (a sanity check only; the bucket list itself is not used to load the data).
buckets = list(storage_client.list_buckets())

for bucket in buckets:
    print(bucket.name)

# Bucket and object name of the input CSV.
bucket_name = 'legal-terms-data'
source_blob_name = 'tosdr-data/clean/cleaned_output2.csv'
# Load the CSV into `df`, which the following cells build on.
df = read_csv_from_gcs(bucket_name, source_blob_name)

In [None]:
# Preview the first rows of the data loaded from GCS.
df.head()

## Mapping DataFrame (`mapping_df`)

In [None]:
# Build the lookup of distinct (parent issue, issue) pairs: keep the two
# columns, drop repeated pairs, renumber the rows from 0, and shorten the
# column names.
mapping_df = (
    df[["parent_privacy_issue", "privacy_issue_clean"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={
        'parent_privacy_issue': 'parent_issue',
        'privacy_issue_clean': 'privacy_issue',
    })
)

In [None]:
# Preview the first rows of the issue-to-parent mapping.
mapping_df.head()

### Save the mapping to a local CSV file

In [None]:
# Write the mapping relative to the working directory instead of to a
# user-specific absolute path, so the notebook also runs on other machines.
# This matches the other relative paths in this notebook (the credentials
# file and the weights CSV that is loaded back).
# NOTE(review): with to_csv defaults the DataFrame index is written as an
# unnamed first column; pass index=False if consumers do not expect it.
mapping_df.to_csv("mapping_df.csv")

## category weights

In [None]:
# Create category weights.
# Categories are grouped into importance tiers (1.0 = highest importance);
# every category listed in a tier receives that tier's weight.
category_weights = {
    category: weight
    for weight, categories in [
        # Tier 1 (1.0) - Core Privacy and Data Rights
        (1.0, [
            'Personal Data',
            'Types of Information Collected',
            'Security',
            'Third Parties',
            'Anonymity',
        ]),
        # Tier 2 (0.9) - User Control and Rights
        (0.9, [
            'User Choice',
            'Right to Leave The Service',
            'Ownership',
            'Transparency',
            'Copyright License',
        ]),
        # Tier 3 (0.8) - Data Handling and Tracking
        (0.8, [
            'Trackers',
            'Logs',
            'Business Transfers',
            'Advertising',
        ]),
        # Tier 4 (0.7) - Service Governance
        (0.7, [
            'Law and Government Requests',
            'Jurisdiction and governing laws',
            'Dispute Resolution',
            'Suspension and Censorship',
        ]),
        # Tier 5 (0.6) - Terms and Changes
        (0.6, [
            'Changes',
            'Notice of Changing Terms',
            'User Involvement in Changing Terms',
            'Governance',
        ]),
        # Tier 6 (0.5) - Other Categories
        (0.5, [
            'Content',
            'Payments',
            'Guarantee',
        ]),
        # Tier 7 (0.3) - Misc/Special Cases
        (0.3, [
            'Unclassified',
            '[Deprecated]',
        ]),
    ]
    for category in categories
}



In [None]:
# Validation to ensure all categories are covered
required_categories = [
    'Ownership', 'Governance', 'Guarantee', 'User Involvement in Changing Terms',
    'Changes', 'Unclassified', 'Jurisdiction and governing laws',
    'Notice of Changing Terms', 'Suspension and Censorship', 'Business Transfers',
    'Logs', 'Security', 'Payments', 'Content', 'Copyright License', 'Trackers',
    'Personal Data', '[Deprecated]', 'Anonymity', 'Types of Information Collected',
    'User Choice', 'Third Parties', 'Right to Leave The Service',
    'Law and Government Requests', 'Advertising', 'Transparency', 'Dispute Resolution'
]

# Check that every required category has a weight. The check prints nothing
# when all categories are covered, so it is safe to leave enabled on every run.
missing_categories = set(required_categories) - set(category_weights.keys())
if missing_categories:
    # Sorted so the warning text is the same from run to run.
    print(f"Warning: Missing weights for categories: {sorted(missing_categories)}")

# Optional: uncomment to print the weights by tier for manual verification.
# for weight in sorted(set(category_weights.values()), reverse=True):
#     print(f"\nTier (Weight: {weight}):")
#     categories = [cat for cat, w in category_weights.items() if w == weight]
#     for cat in sorted(categories):
#         print(f"- {cat}")

In [None]:


# Save and reload through one shared path so the file read back is guaranteed
# to be the file just written. The path is relative to the working directory
# (the location the grader load already used), not a user-specific absolute
# path, so the cell also works on other machines.
weights_csv_path = 'category_weights.csv'
save_weights_to_csv(category_weights, weights_csv_path)

# Load weights for grader
category_weights = load_weights_from_csv(weights_csv_path)
