In [177]:
import yaml

# reading the input file
input_file = './input.yaml'
with open(input_file, 'r') as file:
    data = yaml.safe_load(file)
print(data)

{'brand_website': 'https://www.myprotein.com/', 'brand_competition_website': 'https://www.optimumnutrition.com/en-us', 'service_locations': ['United States'], 'ad_budgets': [{'shopping_ads': None}, {'performance_max_ads': None}, {'search_ads': None}]}


In [178]:
import pandas as pd

# get the keywords file, save them as csv and return the dataframe
def get_keywords_dataframe(brand_website, service_locations ):
    # currently manually scraped them and returning the DataFrame

    if brand_website == 'https://www.myprotein.com/':
        return pd.read_csv('./myprotein.csv')
    elif brand_website == 'https://www.optimumnutrition.com/en-us':
        return pd.read_csv('./optimum_nutrition.csv')

# querying to get keywords, and returning them as Pandas DataFrames
brand_keywords_df = get_keywords_dataframe(data['brand_website'], data['service_locations'])
brand_competitors_keywords_df = get_keywords_dataframe(data['brand_competition_website'], data['service_locations'])



In [179]:
print("Brand")
print(brand_keywords_df.describe())
print(brand_competitors_keywords_df.head())

Brand
       Competition Index  Search Volume  Low Top Page Bid  High Top Page Bid
count         500.000000   5.000000e+02        482.000000         482.000000
mean           21.752000   1.634705e+05          0.859938           5.177116
std            17.815985   2.496472e+06          3.263679           9.074338
min             0.000000   1.100000e+02          0.010000           0.170000
25%             6.000000   2.100000e+02          0.090000           1.437500
50%            20.000000   7.200000e+02          0.380000           2.440000
75%            31.000000   4.400000e+03          0.940000           4.877500
max            75.000000   5.560000e+07         67.200000          73.400000
                   Keyword Text Competition  Competition Index  Search Volume  \
0                   frontier ai         LOW                 21        2240000   
1  open artificial intelligence         LOW                 15        1220000   
2       artificial intelligence         LOW               

In [180]:
print("Brand Competition")
print(brand_competitors_keywords_df.describe())
print(brand_keywords_df.head())

Brand Competition
       Competition Index  Search Volume  Low Top Page Bid  High Top Page Bid
count         500.000000   5.000000e+02        497.000000         497.000000
mean           32.780000   2.205524e+04          1.590221           8.778531
std            19.263483   1.347368e+05          1.315620           7.314264
min             0.000000   2.100000e+02          0.030000           0.300000
25%            18.000000   4.800000e+02          0.660000           4.270000
50%            32.000000   1.300000e+03          1.220000           7.070000
75%            46.000000   4.400000e+03          2.160000          10.510000
max            76.000000   2.240000e+06          9.960000          57.330000
                   Keyword Text Competition  Competition Index  Search Volume  \
0                       chatgpt         LOW                 28       55600000   
1                            ai      MEDIUM                 56        2240000   
2                          ai's      MEDIUM   

In [181]:
# merge the DataFrames
combined_df = pd.concat([brand_keywords_df, brand_competitors_keywords_df], ignore_index=True)
# drop duplicates
combined_df.drop_duplicates(inplace=True)
print(combined_df.describe())
print(combined_df.head())

       Competition Index  Search Volume  Low Top Page Bid  High Top Page Bid
count         975.000000   9.750000e+02        954.000000          954.00000
mean           26.731282   9.345890e+04          1.238711            7.04499
std            19.130364   1.790515e+06          2.530045            8.50956
min             0.000000   1.100000e+02          0.010000            0.17000
25%            10.000000   3.200000e+02          0.250000            1.98250
50%            24.000000   8.800000e+02          0.770000            4.44500
75%            39.000000   4.400000e+03          1.630000            8.83750
max            76.000000   5.560000e+07         67.200000           73.40000
                   Keyword Text Competition  Competition Index  Search Volume  \
0                       chatgpt         LOW                 28       55600000   
1                            ai      MEDIUM                 56        2240000   
2                          ai's      MEDIUM                 56  

In [182]:
# filter out less than search volume threshold
search_volume_threshold = 500
filtered_df = combined_df[combined_df['Search Volume'] > search_volume_threshold]
print(filtered_df.describe())
print(filtered_df.head())

       Competition Index  Search Volume  Low Top Page Bid  High Top Page Bid
count         610.000000   6.100000e+02        604.000000         604.000000
mean           27.632787   1.492182e+05          1.167401           6.529967
std            19.040263   2.262540e+06          2.912297           7.582402
min             0.000000   5.900000e+02          0.010000           0.280000
25%            14.000000   1.000000e+03          0.210000           1.720000
50%            25.000000   2.400000e+03          0.725000           4.265000
75%            39.000000   9.900000e+03          1.520000           8.710000
max            75.000000   5.560000e+07         67.200000          67.200000
                   Keyword Text Competition  Competition Index  Search Volume  \
0                       chatgpt         LOW                 28       55600000   
1                            ai      MEDIUM                 56        2240000   
2                          ai's      MEDIUM                 56  

In [183]:
# setting up the model
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configure the API key from the .env file
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# Model setup using the correct class
model = genai.GenerativeModel("gemini-1.5-flash")

### Search Themes for Performance Max Campaign

In [184]:
def get_prompt(keyword_list_str):
    """
    Generates a prompt for the Gemini model to classify keywords into asset group themes.

    Args:
        keyword_list_str (str): A formatted string of keywords with their attributes.

    Returns:
        str: The complete prompt for the Gemini model.
    """
    prompt = """
<CONTEXT>
You are an expert Marketing Strategist and Ad Campaign Manager. Your primary skill is analyzing raw lists of keywords and their performance metrics to discern underlying patterns and organize them into highly effective, prioritized, and thematic ad groups.
</CONTEXT>

<INSTRUCTIONS>
Your task is to analyze the provided list of keywords and their associated data. You will then classify them into broad, logical themes for an ad campaign, prioritizing based on performance potential. Your entire output MUST be a single, valid JSON object enclosed in a ```json ... ``` code block.

---

### **Guiding Principles**

1.  **Theme Categories:** Themes can be based on product types, use cases, demographics, or events. These are just **suggestions**. You have the creative freedom to **invent new categories** that better fit the keywords.
2.  **Rationale is Key:** For each theme, provide a one-sentence explanation for your choice, referencing the data to support your logic.
3.  **Data-Driven Prioritization:** Use the metrics to find the **best value**. The ideal "sweet spot" is a combination of **high search volume with manageable (low to medium) competition and bid costs**. Prioritize themes built around these high-potential keywords.
4.  **Keyword Curation:** For each theme, you must select the **top keywords** that best represent the entire category. Your selection should not exceed **10 keywords**. Aim for a mix that includes both high-volume "head" terms and specific, high-intent "long-tail" terms to capture the theme's full scope.

---

### **Input & Output Format**

**Input Data Format:**
The keyword data will be provided as a multi-line string. Each line follows this exact format:
`- Keyword: [Keyword Text], Search Volume: [Volume], Competition: [Level], Bid Amount: [Amount]`

**JSON Output Structure:**
You must provide your response as a single JSON object. This object must have two top-level keys: `analysis` and `themed_keywords`.

1.  **`analysis` key:** The value must be an object containing your strategic thought process with the following three keys:
    * `high_value_keyword_identification`: A string explaining which keywords stand out and why, based on their balance of search volume, competition, and cost.
    * `pattern_recognition_and_theme_ideation`: A string describing the patterns and connections you found and the potential themes you brainstormed.
    * `final_strategy_and_grouping_logic`: A string defining the final, solidified logic you will use to group the keywords into themes.

2.  **`themed_keywords` key:** The value must be an array `[]` of theme objects. Each theme object must have the following three keys:
    * `theme_name`: A string for the name of the theme.
    * `rationale`: A string containing the one-sentence rationale for the theme.
    * **`keywords`**: An array `[]` of strings, where each string is the `keyword_text` of a keyword belonging to that theme.


</INSTRUCTIONS>

<EXAMPLES>
Here are three examples showing how to apply the principles and structure the final JSON output correctly.

**--- EXAMPLE 1: RUNNING SHOES ---**

**Keywords to Classify:**
- Keyword: best running shoes for men, Search Volume: 22000, Competition: High, Bid Amount: 2.1
- Keyword: women's trail running shoes, Search Volume: 8500, Competition: Medium, Bid Amount: 1.8
- Keyword: lightweight marathon sneakers, Search Volume: 9000, Competition: Medium, Bid Amount: 2.5
- Keyword: waterproof hiking sneakers, Search Volume: 4500, Competition: Low, Bid Amount: 1.2
- Keyword: daily trainer running shoe, Search Volume: 15000, Competition: High, Bid Amount: 1.9
- Keyword: cushioned running shoes for flat feet, Search Volume: 5000, Competition: Low, Bid Amount: 2.3
- Keyword: men's trail runners, Search Volume: 7000, Competition: Medium, Bid Amount: 1.75
- Keyword: long distance running shoes, Search Volume: 11000, Competition: Medium, Bid Amount: 2.4

**Required JSON Output:**
```json
{{
  "analysis": {{
    "high_value_keyword_identification": "The most promising keywords are 'long distance running shoes' and 'lightweight marathon sneakers' because they have strong search volume with manageable Medium competition. 'waterproof hiking sneakers' and 'cushioned running shoes for flat feet' are also valuable as low-competition niches.",
    "pattern_recognition_and_theme_ideation": "I see a clear pattern around use cases. 'Marathon,' 'long distance,' and 'trail' keywords point to specific activities, suggesting themes like 'Performance & Racing' and 'Trail & Outdoor.' The keyword 'cushioned running shoes for flat feet' points to a support-based theme.",
    "final_strategy_and_grouping_logic": "My final strategy is to create three themes. 'Performance & Racing' will group keywords for competitive running. 'Trail & Outdoor' will group keywords for off-road activities. 'General & Support Focused' will be a broader theme to capture high-volume generic terms and specific support-related searches."
  }},
  "themed_keywords": [
    {{
      "theme_name": "Performance & Racing",
      "rationale": "This theme groups high-intent keywords for competitive runners, anchored by strong search volume and medium competition.",
      "keywords": [
        "lightweight marathon sneakers",
        "long distance running shoes"
      ]
    }},
    {{
      "theme_name": "Trail & Outdoor",
      "rationale": "This theme consolidates keywords for off-road use, representing a solid opportunity with moderate volume and competition.",
      "keywords": [
       "women's trail running shoes",
        "men's trail runners",
        "waterproof hiking sneakers"
      ]
    }},
    {{
      "theme_name": "General & Support Focused",
      "rationale": "This theme includes broad, high-volume terms alongside a specific, low-competition keyword to capture a wide audience.",
      "keywords": [
        "best running shoes for men", 
        "daily trainer running shoe",
        "cushioned running shoes for flat feet"
      ]
    }}
  ]
}}
--- EXAMPLE 2: COFFEE ---

**Keywords to Classify:**
- Keyword: gourmet coffee subscription box, Search Volume: 9500, Competition: Medium, Bid Amount: 3.1
- Keyword: dark roast coffee beans, Search Volume: 18000, Competition: High, Bid Amount: 1.5
- Keyword: single origin ethiopian coffee, Search Volume: 4000, Competition: Low, Bid Amount: 2.2
- Keyword: coffee gift set for christmas, Search Volume: 8000, Competition: Low, Bid Amount: 2.5
- Keyword: monthly coffee delivery, Search Volume: 12000, Competition: Medium, Bid Amount: 2.9
- Keyword: espresso whole beans, Search Volume: 14000, Competition: Medium, Bid Amount: 1.8
- Keyword: coffee lovers gift basket, Search Volume: 6500, Competition: Low, Bid Amount: 2.4

**Required JSON Output:**
```json
{{
  "analysis": {{
    "high_value_keyword_identification": "The keywords that immediately stand out are 'monthly coffee delivery' (high volume, medium competition) and the two gifting keywords: 'coffee gift set for christmas' and 'coffee lovers gift basket,' both of which have excellent volume for Low competition terms. 'single origin ethiopian coffee' is another gem due to its low competition and specific intent.",
    "pattern_recognition_and_theme_ideation": "The user intent falls into three buckets: users looking for a recurring service ('subscription', 'monthly delivery'), users looking for gifts ('gift set', 'gift basket'), and users looking for specific coffee products ('dark roast', 'espresso'). This leads to theme ideas like 'Subscription Service', 'Gifting', and 'Bean & Roast Types'.",
    "final_strategy_and_grouping_logic": "The strategy is to create three themes based on these user intents. 'Subscription Service' will capture users with high lifetime value. 'Gifting & Special Occasions' will target seasonal shoppers. 'Specific Bean & Roast Types' will group all product-specific queries."
  }},
  "themed_keywords": [
    {{
      "theme_name": "Subscription Service",
      "rationale": "This theme targets high-intent users looking for recurring delivery, representing a great balance of volume and competition.",
      "keywords": [
        "monthly coffee delivery",
        "gourmet coffee subscription box"
      ]
    }},
    {{
      "theme_name": "Gifting & Special Occasions",
      "rationale": "This theme focuses on valuable, low-competition keywords with clear transactional intent for seasonal and gift-giving moments.",
      "keywords": [
        "coffee gift set for christmas",
        "coffee lovers gift basket"
      ]
    }},
    {{
      "theme_name": "Specific Bean & Roast Types",
      "rationale": "This theme groups product-specific searches, from broad, high-volume terms to niche, low-competition keywords.",
      "keywords": [
        "dark roast coffee beans",
        "espresso whole beans",
        "single origin ethiopian coffee"
      ]
    }}
  ]
}}

--- EXAMPLE 3: PROTEIN POWDER ---

**Keywords to Classify:**
- Keyword: vegan protein powder, Search Volume: 35000, Competition: Medium, Bid Amount: 2.8
- Keyword: best tasting whey protein, Search Volume: 45000, Competition: High, Bid Amount: 3.5
- Keyword: post-workout recovery shake, Search Volume: 25000, Competition: High, Bid Amount: 2.5
- Keyword: protein for women's fitness, Search Volume: 18000, Competition: Medium, Bid Amount: 3.1
- Keyword: meal replacement shake for weight loss, Search Volume: 22000, Competition: Medium, Bid Amount: 2.9
- Keyword: casein protein before bed, Search Volume: 9000, Competition: Low, Bid Amount: 2.2
- Keyword: organic plant-based protein, Search Volume: 15000, Competition: Low, Bid Amount: 3.0

**Required JSON Output:**
```json
{{
  "analysis": {{
    "high_value_keyword_identification": "The clear winners are 'organic plant-based protein' and 'casein protein before bed.' Both have fantastic search volume (15k and 9k) for Low competition keywords, making them highly efficient targets. 'vegan protein powder' and 'meal replacement shake' are also very strong, with high volume and manageable Medium competition.",
    "pattern_recognition_and_theme_ideation": "Patterns emerge based on both product type and user goal. 'Vegan' and 'plant-based' form a clear dietary group. 'Whey' and 'casein' are specific protein types. 'Post-workout,' 'weight loss,' and 'women's fitness' are all goal-oriented. This suggests themes like 'Plant-Based Protein,' 'Dairy-Based Proteins,' and 'Goal-Oriented Nutrition.'",
    "final_strategy_and_grouping_logic": "My final plan is to create three themes. 'Plant-Based & Organic' will group keywords for that specific dietary preference. 'Dairy Proteins & Recovery' will combine the different types of dairy-based protein. 'Fitness & Weight Management Goals' will target users searching for solutions to specific health objectives."
  }},
  "themed_keywords": [
    {{
      "theme_name": "Plant-Based & Organic",
      "rationale": "This theme targets a large and growing demographic, anchored by a high-volume, low-competition keyword.",
      "keywords": [
        "organic plant-based protein",
       "vegan protein powder"
       ]
    }},
    {{
      "theme_name": "Dairy Proteins & Recovery",
      "rationale": "This theme groups keywords by protein type (whey, casein) and captures the high-volume post-workout use case.",
      "keywords": [
        "best tasting whey protein",
        "post-workout recovery shake",
        "casein protein before bed"
      ]
    }},
    {{
      "theme_name": "Fitness & Weight Management Goals",
      "rationale": "This theme targets users with specific, high-intent fitness goals, representing a motivated audience segment.",
      "keywords": [
        "meal replacement shake for weight loss",
        "protein for women's fitness"
      ]
    }}
  ]
}}

</EXAMPLES>

<TASK>
Now, apply this process to the following keywords. Remember to provide the entire response as a single, valid JSON object and nothing else.

**Keywords to classify:**
{}
</TASK>
""".format(keyword_list_str)
    return prompt


In [185]:
import numpy as np

import re

from yarg import get # Recommended for parsing the model's JSON output

def safe_float(val):
    """Safely convert a value to a float, returning 0.0 on failure."""
    try:
        return float(val)
    except (ValueError, TypeError):
        return 0.0
    

def classify_keywords_to_asset_group_themes(keyword_data, model):
    """
    Classifies a list of keywords into broad asset group themes using the Gemini API.

    Args:
        keyword_data (list of dict): A list where each dict contains keyword, search_volume,
                                      competition, and bid_amount data.
        model: An initialized Gemini model instance.

    Returns:
        str: The Gemini model's raw text response containing the classified keywords in JSON format.
    """

    # Format the keyword data into a readable string for the model
    formatted_lines = []
    for item in keyword_data:
        keyword = item.get('Keyword Text', '')
        volume = int(item.get('Search Volume', 0))
        competition = str(item.get('Competition', '')).upper()
        low_bid = safe_float(item.get('Low Top Page Bid'))
        high_bid = safe_float(item.get('High Top Page Bid'))
        avg_bid = round(np.mean([low_bid, high_bid]), 2)
        
        line = f"- Keyword: {keyword}, Search Volume: {volume}, Competition: {competition}, Bid Amount: {avg_bid}"
        formatted_lines.append(line)

    keyword_list_str = "\n".join(formatted_lines)
    
 
    prompt = get_prompt(keyword_list_str)
    generation_config = {
  "max_output_tokens": 8192,  # Set to the model's maximum
  "temperature": 0.2,         # Lower temperature for more predictable JSON
  "top_p": 1.0,
  "top_k": 32,
  "response_mime_type": "application/json" # Explicitly request JSON output
}


    response_chunks = model.generate_content(prompt, generation_config=generation_config,stream=True)

    # Assemble the full response from the streamed chunks
    full_response = "".join(chunk.text for chunk in response_chunks if hasattr(chunk, 'text'))

    return full_response

In [186]:
import time
start_time = time.time()
asset_group_results = classify_keywords_to_asset_group_themes(filtered_df.to_dict(orient='records'), model)
end_time = time.time()
print(f"Time taken for classification: {end_time - start_time} seconds")
print(asset_group_results)

Time taken for classification: 6.496326446533203 seconds
{"analysis": {"high_value_keyword_identification": "Keywords like \"chatgpt\" (despite high competition), \"chatgpt login in\", and those containing \"openai\" show high search volume with low competition.  The API-related keywords (\"openai api\", \"openai api key\") also present high-value opportunities, although with higher bid costs.", "pattern_recognition_and_theme_ideation": "The keywords fall into several categories:  direct searches for ChatGPT and OpenAI, login and access-related queries,  misspellings and variations of OpenAI, and broader AI-related searches.  This suggests themes like \"ChatGPT Access & Login\", \"OpenAI Products & Services\", \"General AI Information & Tools\", and \"AI-Specific Niches\".", "final_strategy_and_grouping_logic": "The final strategy involves four main themes.  \"ChatGPT Access\" focuses on direct searches and login issues.  \"OpenAI Ecosystem\" groups keywords related to OpenAI's product

In [187]:
# creating report

import json
import os
import re
from jinja2 import Environment

def parse_model_output(raw_output: str) -> dict:
    """
    Extracts and parses the JSON content from the model's raw output string.
    It robustly finds the JSON within the ```json ... ``` block.
    """
    # Regex to find the JSON block, handling potential variations
    match = re.search(r"```json\s*([\s\S]+?)\s*```", raw_output)
    if not match:
        # As a fallback, try to parse the whole string if no markers are found
        try:
            return json.loads(raw_output)
        except json.JSONDecodeError:
            raise ValueError("Could not find a JSON code block and the full text is not valid JSON.")
    
    json_string = match.group(1)
    
    try:
        return json.loads(json_string)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to decode JSON from the code block: {e}\nContent was: {json_string}")


import os
from jinja2 import Environment

def create_report(data: dict, filename: str = "asset_group_themes.html"):
    """
    Generates a self-contained HTML report from the parsed data (simple version).
    """
    # This template is now simplified to handle a list of keyword strings.
    html_template_string = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Keyword Asset Group Strategy Report</title>
        <style>
            body { 
                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; 
                background-color: #f9fafb; 
                color: #1f2937; 
                margin: 0; 
                padding: 2rem; 
            }
            .container { 
                max-width: 900px; 
                margin: auto; 
                background: #ffffff; 
                padding: 2rem; 
                border-radius: 0.5rem; 
                box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1), 0 2px 4px -2px rgba(0,0,0,0.1); 
            }
            h1 { 
                color: #1e40af; 
                border-bottom: 2px solid #3b82f6; 
                padding-bottom: 0.5rem; 
                margin-bottom: 1.5rem; 
                font-size: 2.25rem; 
            }
            h2 { 
                color: #1d4ed8; 
                margin-top: 2.5rem; 
                font-size: 1.875rem; 
            }
            h3 { 
                color: #1f2937; 
                margin-bottom: 1rem;
                font-size: 1.5rem;
                border-bottom: 1px solid #d1d5db;
                padding-bottom: 0.5rem;
            }
            .rationale { 
                font-style: italic; 
                color: #4b5563; 
                margin-bottom: 1.5rem; 
                padding: 1rem;
                border-left: 4px solid #60a5fa; 
                background-color: #eff6ff; 
                border-radius: 0.25rem;
            }
            .analysis-section { 
                background-color: #f3f4f6; 
                padding: 1.5rem; 
                border-radius: 0.5rem; 
                margin-bottom: 2rem; 
                border: 1px solid #e5e7eb; 
            }
            .theme-section {
                margin-bottom: 3rem;
            }
            ul {
                list-style-type: none;
                padding: 0;
            }
            li {
                background-color: #ffffff;
                border: 1px solid #e5e7eb;
                border-radius: 0.375rem;
                padding: 0.75rem 1rem;
                margin-bottom: 0.5rem;
                font-weight: 500;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Keyword Strategy & Asset Group Report</h1>
            
            <div class="analysis-section">
                <h2>📝 Strategic Analysis</h2>
                <h3>High-Value Keyword Identification</h3>
                <p>{{ analysis.high_value_keyword_identification }}</p>
                <h3>Pattern Recognition & Theme Ideation</h3>
                <p>{{ analysis.pattern_recognition_and_theme_ideation }}</p>
                <h3>Final Strategy & Grouping Logic</h3>
                <p>{{ analysis.final_strategy_and_grouping_logic }}</p>
            </div>

            <h2>📂 Thematic Keyword Groups</h2>
            {% for theme in themed_keywords %}
            <div class="theme-section">
                <h3>{{ theme.theme_name }}</h3>
                <p class="rationale">{{ theme.rationale }}</p>
                
                <h4>Representative Keywords:</h4>
                <ul>
                    {% for keyword in theme.keywords %}
                    <li>{{ keyword }}</li>
                    {% endfor %}
                </ul>
            </div>
            {% endfor %}
        </div>
    </body>
    </html>
    """
    
    env = Environment()
    template = env.from_string(html_template_string)
    html_content = template.render(
        analysis=data.get('analysis', {}),
        themed_keywords=data.get('themed_keywords', [])
    )

    with open(filename, "w", encoding="utf-8") as f:
        f.write(html_content)
    
    print(f"✅ Report successfully generated: {os.path.abspath(filename)}")


In [188]:
# Parse the raw output to get the dictionary
report_data = parse_model_output(asset_group_results)

In [189]:
#  Generate the HTML report with the specified filename
create_report(report_data, filename="asset_group_themes1.html")

✅ Report successfully generated: /home/ameychoudhary4/Internship/Cubehq/asset_group_themes1.html
