In [6]:
import pandas as pd
from datetime import datetime
import json

In [7]:
# Load the source CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
df = pd.read_csv(
    "/Users/vintuss/practica-1/data_analysis/data _aggregation/students_data.csv"
)

# Distinct client ids, in order of first appearance in the file.
client_ids = df["client_id"].unique()

# One empty placeholder per client, keyed by the id rendered as a string.
clients = {str(raw_id): {} for raw_id in client_ids}

In [8]:
# For every client, aggregate its rows per period and, inside each period, per
# source, then store the result under clients[str(client_id)].
# Uses `df`, `client_ids` and `clients` created by the cells above.
#
# Resulting shape for one client:
#   {<period key>: {"period", "attribution", "customers", "CR", "records": [...]}}
# where "CR" is attribution / customers * 100 for the period, and each record
# carries the source's share of the period's total customers.
for client_id in client_ids:
    # Explicit copy: the column assignment below must not write into a slice
    # of `df` (avoids SettingWithCopyWarning and accidental aliasing).
    client_specific_df = df.loc[
        df["client_id"] == client_id,
        ["period_end", "source", "source_type", "attribution", "customers"],
    ].copy()

    # Convert "period_end" to timezone-aware (UTC) datetimes and order by it.
    # A stable sort keeps the original row order inside each period, so the
    # order of the per-source records is reproducible between runs.
    client_specific_df["period_end"] = pd.to_datetime(
        client_specific_df["period_end"], format="ISO8601", utc=True)
    client_specific_df = client_specific_df.sort_values(
        by="period_end", kind="stable")

    period_dict = {}
    # sort=False: the frame is already ordered, so groups come out ascending.
    # Rows whose period_end is missing (NaT) are skipped by groupby.
    for period, period_df in client_specific_df.groupby("period_end", sort=False):
        attributed_customers = int(period_df["attribution"].sum())
        total_customers = int(period_df["customers"].sum())

        # One record per source, in order of first appearance in the period.
        # dropna=False keeps rows with a missing source (reported as "nan")
        # instead of silently dropping them from the records.
        records = []
        for source, source_df in period_df.groupby(
                "source", sort=False, dropna=False):
            attribution_per_source = int(source_df["attribution"].sum())
            records.append({
                'source': str(source),
                # The first row's source_type stands for the whole source.
                'source_type': str(source_df["source_type"].iloc[0]),
                'attribution': attribution_per_source,
                # A period with zero customers has no meaningful rate;
                # report 0.0 instead of dividing by zero.
                'CR': (attribution_per_source / total_customers * 100
                       if total_customers else 0.0),
            })

        period_data = {
            # Save the date in ISO 8601 format
            "period": period.isoformat(),
            "attribution": attributed_customers,
            "customers": total_customers,
            # Contribution rate for the whole period, guarded against a
            # zero customer total.
            "CR": (attributed_customers / total_customers * 100
                   if total_customers else 0.0),
            "records": records,
        }
        period_dict[period.strftime('%Y-%m-%dT%H:%M:%S%z')] = period_data

    # Add everything to the final dictionary for this client
    clients[str(client_id)] = period_dict

In [None]:
# Destination of the export and the serialized form of the whole mapping.
file_path = "clients_per_day.json"
json_string = json.dumps(clients)

# Spot check: show the aggregated periods of one specific client.
print(clients["2074305340"])

# Persist the per-client aggregation as a single JSON document.
with open(file_path, mode='w') as out_file:
    out_file.write(json_string)

{'2023-01-02T00:00:00': {'period': '2023-01-02T00:00:00', 'ATTRIBUTED': 0, 'UNATTRIBUTED': 29, 'CR': 0.0, 'records': [{'source': 'direct', 'source_type': 'link', 'ATTRIBUTED': 'False', 'customers': 28}, {'source': 'global_footer', 'source_type': 'zone', 'ATTRIBUTED': 'False', 'customers': 1}]}, '2023-01-09T00:00:00': {'period': '2023-01-09T00:00:00', 'ATTRIBUTED': 0, 'UNATTRIBUTED': 34, 'CR': 0.0, 'records': [{'source': 'direct', 'source_type': 'link', 'ATTRIBUTED': 'False', 'customers': 33}, {'source': 'global_footer', 'source_type': 'zone', 'ATTRIBUTED': 'False', 'customers': 1}]}, '2023-01-16T00:00:00': {'period': '2023-01-16T00:00:00', 'ATTRIBUTED': 0, 'UNATTRIBUTED': 31, 'CR': 0.0, 'records': [{'source': 'direct', 'source_type': 'link', 'ATTRIBUTED': 'False', 'customers': 30}, {'source': 'advocate_welcome_email', 'source_type': 'link', 'ATTRIBUTED': 'False', 'customers': 1}]}, '2023-01-23T00:00:00': {'period': '2023-01-23T00:00:00', 'ATTRIBUTED': 0, 'UNATTRIBUTED': 40, 'CR': 0.0, 