In [1]:
!pip install --upgrade pandas pyarrow



In [2]:
import json
import os
import posixpath
import time

import google.cloud.bigquery as bigquery
import pandas as pd
import vertexai
from google.cloud import storage
from google.oauth2 import service_account
from vertexai.tuning import sft

In [3]:
# Location of the service account key file - INPUT KEY HERE
key_path = "credentials/c-synomia-givaudan-140eba0163dc.json"

# Kept for reference; the client below takes its project from the credentials.
project_id = 'c-synomia-givaudan'

# Load the service-account credentials from the key file, then create a
# BigQuery client bound to the project id those credentials expose.
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(
    project=credentials.project_id,
    credentials=credentials,
)

# DATA PREPARATION

Project IDs to keep:
- Codification 1: 485, 466, 489, 503
- Codification 2: 487
- Codification 3: 493

## 1. DATA LOADING AND UNDERSTANDING

### Final_table

In [4]:
# Reuse the service-account `client` created in the authentication cell.
# Re-creating it here with a bare `bigquery.Client()` would discard those
# explicit credentials and fall back to whatever default credentials (and
# default project) the environment happens to provide.
dataset_id = 'c-synomia-givaudan.givaudan_project_tables_processed'
table_id = 'final_table_cleaned'

# Only the "Codification 1" projects (485, 466, 489, 503) are loaded.
query = """
SELECT *
FROM `{}`.`{}`
WHERE ProjectId IN (485, 466, 489, 503)""".format(dataset_id, table_id)

df_verbatim = client.query(query).to_dataframe()

# Columns that are dropped right after loading.
columns_to_drop = [
    "VerbatimId",
    "KeywordId",
    "Codif1",
    "theme_id",
    "avis",
    "age",
    "classement",
    "theme_order"
]

# Drop the specified columns and preview the result
df_verbatim = df_verbatim.drop(columns=columns_to_drop)
df_verbatim.head()

Unnamed: 0,ProjectId,verbatim_content,theme_name,fragrance,emotion,keyword_value_original,keyword_value_cleaned
0,489,Feeling like I am wearing something so elegant...,01.Crush,B10,POSITIVE,love,love
1,489,Feeling like I am wearing something so elegant...,01.Crush,BZA,POSITIVE,love,love
2,489,Feeling like I am wearing something so elegant...,05.Uniqueness,B10,POSITIVE,party,party
3,489,Feeling like I am wearing something so elegant...,06.Quality,B10,POSITIVE,elegant,elegante
4,489,Feeling like I am wearing something so elegant...,06.Quality,BZA,POSITIVE,elegant,elegante


In [5]:
# Number of distinct ProjectIds under which each verbatim text appears.
content_project_counts = df_verbatim.groupby('verbatim_content')['ProjectId'].nunique()

# Verbatim texts that show up under more than one ProjectId.
cross_project_duplicates = content_project_counts.loc[content_project_counts > 1].index

# Row masks: "text is shared across projects" and "row belongs to project 503".
in_several_projects = df_verbatim['verbatim_content'].isin(cross_project_duplicates)
from_project_503 = df_verbatim['ProjectId'] == 503

# For shared texts, only the rows coming from project 503 are kept.
keep_503 = df_verbatim[in_several_projects & from_project_503]

# Rows whose text is unique to one project, followed by the project-503 rows
# of the shared texts.
filtered_df = pd.concat([df_verbatim[~in_several_projects], keep_503])

# Renumber the rows from 0 after the filtering.
df_verbatim = filtered_df.reset_index(drop=True)

In [6]:
# Number of distinct theme names left after deduplication.
df_verbatim['theme_name'].nunique()

38

In [7]:
# Remove every "digits followed by a dot" sequence from the theme names
# (e.g. "01.Crush" becomes "Crush").
df_verbatim['theme_name'] = df_verbatim['theme_name'].str.replace(r'\d+\.', '', regex=True)

# Ground truth: one entry per verbatim text, keyed by the stripped text.
verbatim_json = {}

grouped = df_verbatim.groupby('verbatim_content')

for verbatim_content, group in grouped:
    emotions = group['emotion']
    keywords = group['keyword_value_original']
    themes = group['theme_name']

    # The first row's emotion is the one that gets stored.
    emotion = emotions.iloc[0]

    # Only report when the rows of one verbatim disagree on the emotion.
    if emotions.nunique() > 1:
        print(f"Emotion not consistent for verbatim_content: {verbatim_content}")

    # A single missing keyword or theme empties the whole mapping for this verbatim.
    has_missing_value = keywords.isnull().any() or themes.isnull().any()
    keyword_theme_mapping = {} if has_missing_value else dict(zip(keywords, themes))

    verbatim_json[verbatim_content.strip()] = {
        "emotion": emotion,
        "keyword / theme": keyword_theme_mapping,
    }

# Persist the ground truth as a single JSON document.
with open('verbatim_ground_truth.json', 'w') as ground_truth_file:
    json.dump(verbatim_json, ground_truth_file)

In [8]:
# The first 500 entries of verbatim_json (in insertion order) form the evaluation set.
evaluation_set = dict(list(verbatim_json.items())[:500])

# Fine-tuning


## Prepare data

In [9]:
# Every entry of verbatim_json after the first 500 (in insertion order) is
# available for training and validation.
training_validation_data = dict(list(verbatim_json.items())[500:])
len(training_validation_data)

1819

In [10]:
def build_content_finetuning(verbatim, output_json):
    """Build one fine-tuning example as a two-turn conversation.

    Args:
        verbatim: Text to analyse; appended to the few-shot prompt right
            after the final ``Input:`` marker.
        output_json: JSON-serialisable expected answer; stored in the model
            turn as ``json.dumps(output_json)``.

    Returns:
        A dict ``{'contents': [user_turn, model_turn]}`` where each turn has a
        ``role`` (``'user'`` / ``'model'``) and a single text part.
    """
    # Instructions plus four worked examples, ending on the open "Input:" slot.
    instructions_and_examples = """Analyze the provided text and extract the overall emotion and key themes associated with the main keywords. Output the analysis as a JSON object. The JSON object should have two keys: "emotion" and "keyword / theme".

    The "emotion" value should be one of the following: "POSITIVE", "NEGATIVE", or "NEUTRAL".

    The "keyword / theme" value should be a JSON object where each key is a keyword extracted from the input text and each value is the associated theme for that keyword.

    Examples:

    Input: Ooh! I love this! I think it would be my favorite fragrance. It's sweet, like roses, but it has some spice in there, like cinnamon. It's well-balanced, and not too strong.
    Output: {"emotion": "POSITIVE", "keyword / theme": {"sweet": "Sweet", "spice": "Spicy/Woody/Smoky", "strong": "Intensity", "rose": "Ingredient ID - Flower", "cinnamon": "Ingredient ID - Spicy/Woody/Aro"}}

    Input: Oh wow that is really strong. I don't like this one , it smells to much of chemicals in my opinion. It is too overpowering and I feel like if I wore this that other people would make fun of me or not want to be around me.
    Output: {"emotion": "NEGATIVE", "keyword / theme": {"wow": "Crush", "fun": "Feel Good", "strong": "Intensity", "chemical": "Functional/Chemical"}}

    Input: A old lady, that's in her 60s. I think of those old picture frames my grandma had in her kitchen and they looked like they were just about to fade.
    Output: {"emotion": "NEUTRAL", "keyword / theme": {"lady": "Feminine", "old": "Old-fashioned", "old lady": "Old-fashioned", "grandma": "Old-fashioned"}}

    Input: A perfect sweet like candy, a tropical tang with melons and bright rich golden orange melons.
    Output: {"emotion": "POSITIVE", "keyword / theme": {"bright": "Feel Good", "sweet": "Sweet", "melon": "Ingredient ID - Fruit", "candy": "Ingredient ID - Food"}}

    Input:"""
    # The verbatim is glued directly after "Input:" and followed by the open
    # "Output:" slot that the model turn answers.
    user_text = instructions_and_examples + verbatim + """
    Output:"""

    user_turn = {'role': 'user', 'parts': [{'text': user_text}]}
    model_turn = {'role': 'model', 'parts': [{'text': json.dumps(output_json)}]}
    return {'contents': [user_turn, model_turn]}

In [11]:
from sklearn.model_selection import train_test_split

# Split the (verbatim, label) pairs as a list, then turn each side back
# into a dict.
data_items = list(training_validation_data.items())

# 12% of the pairs go to the held-out side; the seed is fixed so the split
# is the same on every run.
train_items, test_items = train_test_split(
    data_items,
    test_size=0.12,
    random_state=42,
)

train_dict, test_dict = dict(train_items), dict(test_items)

In [12]:
def create_jsonl_file(data_dict,output_path):
    """Write one fine-tuning example per line (JSONL) to ``output_path``.

    Args:
        data_dict: Mapping of verbatim text to its expected JSON answer; each
            pair is converted with ``build_content_finetuning``.
        output_path: Destination file. Its parent directory is created when it
            does not exist yet, so a fresh checkout does not fail on a missing
            folder.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        for verbatim, json_answer in data_dict.items():
            json_data = build_content_finetuning(verbatim, json_answer)
            json.dump(json_data, f)
            f.write('\n')


# Write the training split and the validation split side by side.
for split_dict, split_path in (
    (train_dict, 'finetuning data/train_data_finetuning_synomia2.jsonl'),
    (test_dict, 'finetuning data/validation_data_finetuning_synomia2.jsonl'),
):
    create_jsonl_file(split_dict, split_path)

In [None]:
# Upload to GCS
from google.cloud import storage
import os

# Region name; interpolated into the tuning-job resource path in cancel_job.
location = "us-central1"
# Project passed to vertexai.init and used in tuning-job resource paths.
# NOTE(review): differs from the BigQuery project 'c-synomia-givaudan' — confirm this is intended.
PROJECT_ID = "c-test-gen-ai-synomia"
# Bucket name that upload_blob is called with.
bucket_name = "projet_givaudan"
# Prefix that upload_blob prepends to every uploaded object name.
folder_output_name = 'finetuning_data'

def upload_blob(bucket_name, file_path):
    """Upload a local file to the bucket under ``folder_output_name``.

    Args:
        bucket_name: Name of the destination bucket.
        file_path: Path of the local file to upload. Only its base name is
            used for the object name; any local directory part is dropped.
    """
    # basename handles the local platform's separators, not just '/'.
    file_name = os.path.basename(file_path)
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    # Object names are joined with '/' regardless of the local OS separator
    # (os.path.join would insert a backslash on Windows).
    destination_blob_name = posixpath.join(folder_output_name, file_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(file_path)

    print(
        f"File {file_name} uploaded to the bucket {destination_blob_name}."
    )

# Upload from the folder the JSONL files were written to ('finetuning data/').
# upload_blob keeps only the base name, so the object names are unchanged.
upload_blob(bucket_name, 'finetuning data/train_data_finetuning_synomia2.jsonl')
upload_blob(bucket_name, 'finetuning data/validation_data_finetuning_synomia2.jsonl')

File train_data_finetuning_synomia2.jsonl uploaded to the bucket finetuning_data/train_data_finetuning_synomia2.jsonl.
File validation_data_finetuning_synomia2.jsonl uploaded to the bucket finetuning_data/validation_data_finetuning_synomia2.jsonl.


## Fine-tune the model

In [None]:
def get_model_finetuned(model_name,training_file,validation_file, display_name,
                        epochs=4, adapter_size=4, learning_rate_multiplier=1.0,
                        poll_interval_seconds=60):
    """Start a supervised tuning job and block until it has ended.

    Args:
        model_name: Source model passed to ``sft.train``.
        training_file: File name of the training JSONL under
            ``gs://projet_givaudan/finetuning_data/``.
        validation_file: File name of the validation JSONL under the same
            prefix.
        display_name: Display name given to the tuned model.
        epochs: Value forwarded as ``epochs`` (default 4).
        adapter_size: Value forwarded as ``adapter_size`` (default 4).
        learning_rate_multiplier: Value forwarded as
            ``learning_rate_multiplier`` (default 1.0).
        poll_interval_seconds: Seconds to sleep between two status refreshes
            (default 60).

    Returns:
        The tuning job object returned by ``sft.train``, after it has ended,
        so the caller can read the tuned model / endpoint names from it
        instead of copying them from the printed output.
    """
    sft_tuning_job = sft.train(
        source_model=model_name,
        train_dataset=f'gs://projet_givaudan/finetuning_data/{training_file}',
        validation_dataset=f"gs://projet_givaudan/finetuning_data/{validation_file}",
        epochs=epochs,
        adapter_size=adapter_size,
        learning_rate_multiplier=learning_rate_multiplier,
        tuned_model_display_name=display_name,
    )

    # Poll until the job reports that it has ended, refreshing its state
    # after each wait.
    while not sft_tuning_job.has_ended:
        time.sleep(poll_interval_seconds)
        sft_tuning_job.refresh()

    print(sft_tuning_job.tuned_model_name)
    print(sft_tuning_job.tuned_model_endpoint_name)
    print(sft_tuning_job.experiment)

    return sft_tuning_job

# Identifier of a tuning job; can be passed to cancel_job below.
tuning_job_id = "2554971831303929856"

def cancel_job(tuning_id):
    """Cancel the supervised tuning job identified by ``tuning_id``.

    Args:
        tuning_id: Tuning job identifier, interpolated into
            ``projects/{PROJECT_ID}/locations/{location}/tuningJobs/{tuning_id}``.
    """
    # Use the argument, not the module-level tuning_job_id, so the job that
    # gets cancelled is the one the caller asked for.
    job = sft.SupervisedTuningJob(
        f"projects/{PROJECT_ID}/locations/{location}/tuningJobs/{tuning_id}"
    )
    job.cancel()

In [None]:
# Initialise the Vertex AI SDK with the shared `location` constant, so the
# region used here cannot drift from the one used in tuning-job resource paths.
vertexai.init(project=PROJECT_ID, location=location)

model_name = "gemini-1.5-flash-002"
training_file = "train_data_finetuning_synomia2.jsonl"
validation_file = "validation_data_finetuning_synomia2.jsonl"
display_name = "tuned_gemini_1_5_flash"

# Launch the tuning job; this call blocks until the job has ended.
get_model_finetuned(model_name,
                    training_file,
                    validation_file,
                    display_name)

INFO:vertexai.tuning._tuning:Creating SupervisedTuningJob


BadRequest: 400 POST https://us-central1-aiplatform.googleapis.com/v1beta1/projects/c-test-gen-ai-synomia/locations/us-central1/tuningJobs?%24alt=json%3Benum-encoding%3Dint: Base model gemini-2.0-flash-exp is not supported.

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part, HarmCategory, HarmBlockThreshold, SafetySetting
from vertexai.tuning import sft

# Re-attach to an existing supervised tuning job through its full resource
# name, then wrap the endpoint of its tuned model in a GenerativeModel.
sft_tuning_job_gemini_flash = sft.SupervisedTuningJob(
    'projects/120594540559/locations/us-central1/tuningJobs/8802810179260252160'
)
tuned_endpoint_name = sft_tuning_job_gemini_flash.tuned_model_endpoint_name
model = GenerativeModel(tuned_endpoint_name)