In [3]:
import os
from dotenv import load_dotenv  
from Utilites import *
from runbook import *

load_dotenv()  

ANTHROPIC_API_KEY=os.getenv('ANTHROPIC_API_KEY')  
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  
PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY')   
FAL_KEY = os.getenv('FAL_KEY')


In [7]:
!pip install python-docx 

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
language="German"

In [None]:
course_name,course_link='Holistic Health Practitioner'
# course_link='https://scholistico.com/product/holistic-health-practitioner-online-training-course/'
course_link='https://de.scholistico.com/produkt/ausbildung-zum-ganzheitlichen-gesundheitspraktiker/'
get_topics_for_blog(course_name,course_link)

In [2]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse

def extract_links(url):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'xml')
    links = {}

    # Define excluded extensions
    excluded_extensions = ('.png', '.jpg', '.jpeg', '.webp', '.svg','.gif')

    for loc in soup.find_all('loc'):
        full_url = loc.text
        # Skip if URL ends with any of the excluded extensions
        if not full_url.lower().endswith(excluded_extensions):
            # Get the last part of the URL path as the key
            path = urlparse(full_url).path
            name = path.strip('/').split('/')[-1]
            if name:  # Only add if name is not empty
                links[name] = full_url

    return links

In [6]:
blog=[
'https://scholistico.com/post-sitemap.xml',
'https://de.scholistico.com/post-sitemap.xml',
'https://it.scholistico.com/post-sitemap.xml',
'https://nl.scholistico.com/post-sitemap.xml',
'https://es.scholistico.com/post-sitemap.xml',]
product=[
    'https://scholistico.com/product-sitemap.xml',
    'https://de.scholistico.com/product-sitemap.xml',
    'https://it.scholistico.com/product-sitemap.xml',
    'https://nl.scholistico.com/product-sitemap.xml',
    'https://es.scholistico.com/product-sitemap.xml'
]
lang=['en','de','it','nl','es']
for i in range(5):
    with open(f'details/{lang[i]}_blog.txt','w') as f:
        f.write(f'{extract_links(blog[i])}')
    with open(f'details/{lang[i]}_product.txt','w') as f:
        f.write(f'{extract_links(product[i])}')  

In [23]:
import json
import anthropic
import os
from pathlib import Path

def process_file(file_path, lang_code, client):
    # Define buckets
    buckets = {
        "Art Therapy": {},
        "Naturopathy": {},
        "Nutrition": {},
        "Holistic Health": {},
        "Sound Therapy": {},
        "Movement Therapy": {},
        "Animal Wellness": {},
        "Learning": {},
        "Marketing": {},
        "Business": {},
        "Financial Wellness": {},
        "Uncategorized": {}
    }
    
    # Read the input file
    with open(file_path, 'r', encoding='utf-8') as f:
        articles_dict = eval(f.read())
    
    # Process each article
    for title, url in articles_dict.items():
        if title == 'blog':
            continue
            
        # First get the category
        category_prompt = f"""
        Analyze this article title and categorize it into exactly one of these categories:
        {', '.join(buckets.keys())[:-14]}  # Excluding "Uncategorized"
        
        Article title (in {lang_code}): {title}
        
        Return only the category name, nothing else.
        """
        
        try:
            category_response = client.messages.create(
                model="claude-3-5-sonnet-20240620",
                max_tokens=100,
                temperature=0,
                messages=[
                    {"role": "user", "content": category_prompt}
                ]
            )
            
            bucket = category_response.content[0].text.strip()
            
            # If language is not English, get the translation
            if lang_code != 'en':
                translation_prompt = f"""
                Translate this title from {lang_code} to English:
                {title}
                
                Return only the English translation, nothing else.
                """
                
                translation_response = client.messages.create(
                    model="claude-3-5-sonnet-20240620",
                    max_tokens=200,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": translation_prompt}
                    ]
                )
                
                translation = translation_response.content[0].text.strip()
                
                # Add the URL and translation as a comment
                if bucket in buckets:
                    buckets[bucket][title] = url + f'\n   #"{translation}"'
                else:
                    buckets["Uncategorized"][title] = url + f'\n   #"{translation}"'
            else:
                # For English content, just add the URL
                if bucket in buckets:
                    buckets[bucket][title] = url
                else:
                    buckets["Uncategorized"][title] = url
                
        except Exception as e:
            buckets["Uncategorized"][title] = url
            print(f"Error processing {title}: {str(e)}")
    
    # Create output file
    output_filename = f"{file_path.stem}_categorized.txt"
    with open(output_filename, 'w', encoding='utf-8') as f:
        for bucket, articles in buckets.items():
            if articles:  # Only write non-empty buckets
                f.write(f"\n#{bucket}\n")
                f.write(json.dumps(articles, indent=2, ensure_ascii=False))
                f.write("\n")

def main():
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    details_path = Path("details")
    
    # Process all txt files in details folder
    for file_path in details_path.glob("*.txt"):
        lang_code = file_path.stem.split('_')[0]
        print(f"Processing {file_path}...")
        process_file(file_path, lang_code, client)

if __name__ == "__main__":
    main()

Processing details/it_product.txt...
Processing details/nl_blog.txt...
Processing details/es_blog.txt...
Processing details/it_blog.txt...
Processing details/en_product.txt...
Processing details/en_blog.txt...
Processing details/de_product.txt...
Processing details/es_product.txt...
Processing details/de_blog.txt...
Processing details/nl_product.txt...


In [21]:
import json
import os
from pathlib import Path
import anthropic

def translate_and_verify(input_file_path):
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    
    # Create verification directory
    verification_dir = Path("verification")
    verification_dir.mkdir(exist_ok=True)
    
    # Read the input file
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Split content by sections (buckets)
    sections = content.split('\n#')[1:]  # Skip the first empty split
    
    translated_content = []
    
    for section in sections:
        if not section.strip():
            continue
            
        # Split into bucket name and content
        bucket_name, json_content = section.split('\n', 1)
        bucket_name = bucket_name.strip()
        
        # Parse the JSON content
        try:
            articles = json.loads(json_content)
            
            # Translate each title
            translated_articles = {}
            for title, url in articles.items():
                prompt = f"""
                Translate this title from {input_file_path.stem.split('_')[0]} to English:
                {title}
                
                Return only the English translation, nothing else.
                """
                
                try:
                    response = client.messages.create(
                        model="claude-3-5-sonnet-20240620",
                        max_tokens=200,
                        temperature=0,
                        messages=[
                            {"role": "user", "content": prompt}
                        ]
                    )
                    
                    translated_title = response.content[0].text.strip()
                    translated_articles[title] = {
                        "url": url,
                        "english_title": translated_title
                    }
                    
                except Exception as e:
                    print(f"Error translating title '{title}': {str(e)}")
                    translated_articles[title] = {
                        "url": url,
                        "english_title": "Translation failed"
                    }
            
            if translated_articles:
                translated_content.append({
                    "bucket": bucket_name,
                    "articles": translated_articles
                })
                
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON in {bucket_name}: {str(e)}")
    
    # Write translated content to verification file
    output_file = verification_dir / f"{input_file_path.stem}_verified.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        for section in translated_content:
            f.write(f"\n#{section['bucket']}\n")
            f.write(json.dumps(section['articles'], indent=2, ensure_ascii=False))
            f.write("\n")

def main():
    # Process all categorized files
    for file_path in Path().glob("*_categorized.txt"):
        print(f"Processing {file_path}...")
        translate_and_verify(file_path)

if __name__ == "__main__":
    main()

Processing it_product_categorized.txt...
Processing en_product_categorized.txt...
Processing en_blog_categorized.txt...
Processing nl_blog_categorized.txt...
Processing nl_product_categorized.txt...
Processing es_product_categorized.txt...
Processing it_blog_categorized.txt...
Processing de_product_categorized.txt...
Processing de_blog_categorized.txt...
Processing es_blog_categorized.txt...


In [39]:
import pandas as pd
import json
from pathlib import Path
import anthropic
import os
from typing import Dict, Optional

# Constants
COURSE_MAPPING = {
    "Art Therapy": "AT",
    "Naturopathy": "NAT",
    "Nutrition": "HNC",
    "Holistic Health": "HHP",
    "Sound Therapy": "ST",
    "Movement Therapy": "MT",
    "Animal Communication": "ACS"
}

COURSE_NAME_MAPPING = {
    "Art Therapy": "Art Therapy Practitioner",
    "Naturopathy": "Naturopathy",
    "Nutrition": "Holistic Nutrition Consultant",
    "Holistic Health": "Holistic Health Practioner",
    "Sound Therapy": "Sound Therapy",
    "Movement Therapy": "Movement Therapy",
    "Animal Communication": "Animal Communication Specialist"
}

LANGUAGE_MAPPING = {
    'en': 'English',
    'es': 'Spanish',
    'de': 'German',
    'it': 'Italian',
    'nl': 'Dutch'
}

def read_file_safely(file_path: Path) -> Optional[str]:
    """Safely read file content"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {str(e)}")
        return None

def parse_content(content: str) -> Dict:
    """Parse content and extract articles by bucket"""
    buckets = {}
    current_bucket = None
    current_content = []
    
    try:
        lines = content.split('\n')
        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                if current_bucket and current_content:
                    try:
                        content_str = ''.join(current_content).strip()
                        buckets[current_bucket] = json.loads(content_str)
                    except json.JSONDecodeError:
                        print(f"Error parsing JSON for bucket {current_bucket}")
                
                current_bucket = line[1:].strip()
                current_content = []
            elif line:
                current_content.append(line)
        
        # Process last bucket
        if current_bucket and current_content:
            try:
                content_str = ''.join(current_content).strip()
                buckets[current_bucket] = json.loads(content_str)
            except json.JSONDecodeError:
                print(f"Error parsing JSON for bucket {current_bucket}")
    
    except Exception as e:
        print(f"Error parsing content: {str(e)}")
    
    return buckets

def create_course_dataframe():
    data = []
    
    # Get all unique language codes
    files = list(Path().glob("*_categorized.txt"))
    lang_codes = set(file.stem.split('_')[0] for file in files)
    
    print(f"Found files: {[f.name for f in files]}")
    print(f"Processing languages: {lang_codes}")
    
    for lang_code in lang_codes:
        print(f"\nProcessing {lang_code}...")
        
        # Read blog and product files
        blog_file = Path(f"{lang_code}_blog_categorized.txt")
        product_file = Path(f"{lang_code}_product_categorized.txt")
        
        blog_content = read_file_safely(blog_file)
        product_content = read_file_safely(product_file)
        
        if not blog_content and not product_content:
            print(f"No content found for {lang_code}")
            continue
        
        # Parse content
        blog_buckets = parse_content(blog_content) if blog_content else {}
        product_buckets = parse_content(product_content) if product_content else {}

        # Process each course type
        for course_type, code in COURSE_MAPPING.items():
            blog_articles = blog_buckets.get(course_type, {})
            product_articles = product_buckets.get(course_type, {})
            
            # Only create entry if we have either blog or product content
            if blog_articles or product_articles:
                # Find course link from product articles
                course_link = ''
                for url in product_articles.values():
                    if isinstance(url, str) and ('product' in url or 'produkt' in url or 'prodotto' in url or 'producto' in url):
                        course_link = url.split('\n')[0]  # Take only the URL part
                        break
                
                row = {
                    'course_code': code,
                    'language': LANGUAGE_MAPPING.get(lang_code, 'Unknown'),
                    'country_code': lang_code,
                    'website': f"{lang_code}.scholistico",
                    'course_name': COURSE_NAME_MAPPING.get(course_type, ''),
                    'course_link': course_link,
                    'result_blog_dict': str(blog_articles),
                    'results_product_dict': str(product_articles),
                    'inspiration_blog_dict': str(blog_articles),
                    'course_document_path': f"courses/{code}"
                }
                data.append(row)
                print(f"Added {course_type} for {lang_code}")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Sort DataFrame
    if not df.empty:
        df = df.sort_values(['course_code', 'country_code'])
    
    return df

def save_dataframe(df: pd.DataFrame, output_path: str = 'course_data.csv'):
    """Save DataFrame to CSV with both full and readable versions"""
    if df.empty:
        print("Warning: DataFrame is empty, no files will be saved.")
        return
    
    # Create output directory if it doesn't exist
    output_dir = Path(output_path).parent
    output_dir.mkdir(exist_ok=True)
    
    # Clean up the dictionary strings
    df_clean = df.copy()
    for col in ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']:
        df_clean[col] = df_clean[col].apply(lambda x: x.replace('\n', '').replace('   #', ' #'))
    
    # Save full version
    df_clean.to_csv(output_path, index=False)
    print(f"Saved full data to: {output_path}")
    
    # Save readable version
    readable_path = str(Path(output_path).with_suffix('')) + '_readable.csv'
    df_readable = df_clean.copy()
    for col in ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']:
        df_readable[col] = df_readable[col].apply(lambda x: x[:100] + '...' if len(x) > 100 else x)
    df_readable.to_csv(readable_path, index=False)
    print(f"Saved readable version to: {readable_path}")

def verify_data_quality(df: pd.DataFrame):
    """Verify the quality of the generated DataFrame"""
    if df.empty:
        print("\nData Quality Report: DataFrame is empty")
        return
    
    print("\nData Quality Report:")
    print("-" * 50)
    
    # Check for missing values
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    # Check course distribution
    print("\nCourse Distribution:")
    print(df['course_code'].value_counts())
    
    # Check language distribution
    print("\nLanguage Distribution:")
    print(df['language'].value_counts())
    
    # Verify dictionary fields


    # Verify dictionary fields
    dict_fields = ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']
    print("\nDictionary Fields Verification:")
    for field in dict_fields:
        empty_dicts = df[df[field] == '{}'].shape[0]
        print(f"{field}: {empty_dicts} empty dictionaries")
        
    # Verify course links
    print("\nCourse Links Verification:")
    missing_links = df[df['course_link'] == ''].shape[0]
    print(f"Entries without course links: {missing_links}")

def main():
    try:
        # Create output directory if it doesn't exist
        Path('output').mkdir(exist_ok=True)
        
        # Create DataFrame
        print("Creating DataFrame from categorized files...")
        df = create_course_dataframe()
        
        # Verify data quality
        verify_data_quality(df)
        
        # Save to CSV
        print("\nSaving DataFrame to CSV...")
        save_dataframe(df, 'output/course_data.csv')
        
        print("\nProcess completed successfully!")
        print(f"Total courses processed: {len(df)}")
        
        # Print sample of the data
        print("\nSample of processed data:")
        if not df.empty:
            sample_df = df.head()
            for col in ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']:
                sample_df[col] = sample_df[col].apply(lambda x: x[:50] + '...' if len(x) > 50 else x)
            print(sample_df)
        
    except Exception as e:
        print(f"\nAn error occurred during processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    # Set pandas display options for better output
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    
    # Run main function
    main()

Creating DataFrame from categorized files...
Found files: ['it_product_categorized.txt', 'en_product_categorized.txt', 'en_blog_categorized.txt', 'nl_blog_categorized.txt', 'nl_product_categorized.txt', 'es_product_categorized.txt', 'it_blog_categorized.txt', 'de_product_categorized.txt', 'de_blog_categorized.txt', 'es_blog_categorized.txt']
Processing languages: {'en', 'nl', 'it', 'de', 'es'}

Processing en...
Error parsing JSON for bucket Art Therapy
Error parsing JSON for bucket Naturopathy
Error parsing JSON for bucket Nutrition
Error parsing JSON for bucket Holistic Health
Error parsing JSON for bucket Sound Therapy
Error parsing JSON for bucket Movement Therapy
Error parsing JSON for bucket Financial Wellness
Added Art Therapy for en
Added Naturopathy for en
Added Nutrition for en
Added Holistic Health for en

Processing nl...
Error parsing JSON for bucket Art Therapy
Error parsing JSON for bucket Naturopathy
Error parsing JSON for bucket Nutrition
Error parsing JSON for bucket H

Starting processing...

Processing it_product_categorized.txt...
Processing Art Therapy...
Error processing blog content for Art Therapy: Expecting value: line 1 column 1 (char 0)
Error processing product content for Art Therapy: Expecting value: line 1 column 1 (char 0)
Processing Naturopathy...
Error processing blog content for Naturopathy: Expecting value: line 1 column 1 (char 0)
Error processing product content for Naturopathy: Expecting value: line 1 column 1 (char 0)
Processing Nutrition...


KeyboardInterrupt: 

In [51]:
def process_blog_content(client, bucket_content, bucket_name):
    prompt = f"""
    Given this content bucket, extract only the blog articles and format them as a JSON dictionary.

    Content Bucket: {bucket_name}
    Content:
    {bucket_content}

    Instructions:
    1. Only include URLs that do NOT contain '/product/', '/produkt/', '/prodotto/', or '/producto/'
    2. Format as a JSON dictionary with title as key and URL as value
    3. Return only the JSON dictionary, nothing else

    Example format:
    {{"blog-title-1": "https://example.com/blog/article-1", "blog-title-2": "https://example.com/blog/article-2"}}
    """
    
    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=2000,
            temperature=0,
            messages=[{"role": "user", "content": prompt}]
        )
        
        response_text = response.content[0].text.strip()
        print(f"\nBlog Response for {bucket_name}:")
        print(response_text)
        
        try:
            return ast.literal_eval(response_text)
        except:
            return json.loads(response_text)
            
    except Exception as e:
        print(f"Error processing blog content for {bucket_name}: {str(e)}")
        return {}

def process_product_content(client, bucket_content, bucket_name):
    prompt = f"""
    Given this content bucket, extract only the product/course URLs and format them as a JSON dictionary.

    Content Bucket: {bucket_name}
    Content:
    {bucket_content}

    Instructions:
    1. Only include URLs that contain '/product/', '/produkt/', '/prodotto/', or '/producto/'
    2. Format as a JSON dictionary with title as key and URL as value
    3. Return only the JSON dictionary, nothing else

    Example format:
    {{"product-title-1": "https://example.com/product/course-1", "product-title-2": "https://example.com/product/course-2"}}
    """
    
    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=2000,
            temperature=0,
            messages=[{"role": "user", "content": prompt}]
        )
        
        response_text = response.content[0].text.strip()
        print(f"\nProduct Response for {bucket_name}:")
        print(response_text)
        
        try:
            return ast.literal_eval(response_text)
        except:
            return json.loads(response_text)
            
    except Exception as e:
        print(f"Error processing product content for {bucket_name}: {str(e)}")
        return {}

def create_df():
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    data = []
    
    for file in Path().glob("*_categorized.txt"):
        lang_code = file.stem.split('_')[0]
        file_type = file.stem.split('_')[1]
        print(f"\nProcessing {file.name}...")
        
        with open(file, 'r') as f:
            content = f.read()
        
        sections = parse_section(content)
        
        for bucket, bucket_content in sections.items():
            if bucket in COURSE_MAPPING:
                print(f"\nProcessing {bucket}...")
                print(f"Content:\n{bucket_content[:200]}...")
                
                blog_dict = process_blog_content(client, bucket_content, bucket)
                product_dict = process_product_content(client, bucket_content, bucket)
                
                row = {
                    'course_code': COURSE_MAPPING[bucket],
                    'language': LANGUAGE_MAPPING[lang_code],
                    'country_code': lang_code,
                    'website': f"{lang_code}.scholistico",
                    'course_name': COURSE_NAME_MAPPING[bucket],
                    'course_link': '',
                    'result_blog_dict': str(blog_dict),
                    'results_product_dict': str(product_dict),
                    'inspiration_blog_dict': str(blog_dict),
                    'course_document_path': f"courses/{COURSE_MAPPING[bucket]}"
                }
                
                if product_dict:
                    row['course_link'] = list(product_dict.values())[0]
                
                data.append(row)
                print(f"Added row for {bucket} - {lang_code}")
    
    df = pd.DataFrame(data)
    if not df.empty:
        df = df.groupby(['course_code', 'language', 'country_code', 'website', 'course_name', 'course_document_path']).agg({
            'course_link': 'first',
            'result_blog_dict': 'first',
            'results_product_dict': 'first',
            'inspiration_blog_dict': 'first'
        }).reset_index()
    
    return df

if __name__ == "__main__":
    print("Starting processing...")
    df = create_df()
    
    # Save full version
    df.to_csv('course_data_ai.csv', index=False)
    
    # Save readable version
    df_readable = df.copy()
    for col in ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']:
        df_readable[col] = df_readable[col].apply(lambda x: x[:100] + '...' if len(x) > 100 else x)
    df_readable.to_csv('course_data_ai_readable.csv', index=False)
    
    print("\nProcessing complete!")
    print(f"Total rows: {len(df)}")
    print("\nSample of processed data:")
    print(df[['course_code', 'language', 'course_name']].head())

Starting processing...

Processing it_product_categorized.txt...

Processing Art Therapy...
Content:
{
  "corso-di-formazione-in-arteterapia": "https://it.scholistico.com/prodotto/corso-di-formazione-in-arteterapia/",
  "oltre-50-esercizi-di-arteterapia-per-bambini-adulti-e-anziani": "https://it.scho...

Blog Response for Art Therapy:
{}

Product Response for Art Therapy:
{
  "corso-di-formazione-in-arteterapia": "https://it.scholistico.com/prodotto/corso-di-formazione-in-arteterapia/",
  "oltre-50-esercizi-di-arteterapia-per-bambini-adulti-e-anziani": "https://it.scholistico.com/prodotto/oltre-50-esercizi-di-arteterapia-per-bambini-adulti-e-anziani/",
  "guadagnare-attraverso-larte": "https://it.scholistico.com/prodotto/guadagnare-attraverso-larte/"
}
Added row for Art Therapy - it

Processing Naturopathy...
Content:
{
  "corso-di-formazione-per-operatori-di-naturopatia": "https://it.scholistico.com/prodotto/corso-di-formazione-per-operatori-di-naturopatia/",
  "oltre-50-rimedi-natura

In [52]:
def get_section_dict(filename, section_name):
    with open(filename, 'r') as file:
        content = file.read()
    
    sections = content.split('#')
    
    for section in sections:
        if section.strip():
            parts = section.strip().split('\n', 1)
            current_section_name = parts[0].strip()
            if current_section_name == section_name:
                return parts[1].strip()
    
    return "{}"

# Example usage:
result = get_section_dict('de_blog_categorized.txt', 'Art Therapy')
print(result)

{
  "7-kunsttherapieuebungen-fuer-menschen-mit-behinderungen": "https://de.scholistico.com/7-kunsttherapieuebungen-fuer-menschen-mit-behinderungen/",
  "7-kunsttherapieuebungen-fuer-anfaenger": "https://de.scholistico.com/7-kunsttherapieuebungen-fuer-anfaenger/",
  "die-10-besten-kunsttherapie-aktivitaeten-fuer-paare-zur-staerkung-ihrer-bindung": "https://de.scholistico.com/die-10-besten-kunsttherapie-aktivitaeten-fuer-paare-zur-staerkung-ihrer-bindung/",
  "10-kunsttherapie-uebungen-fuer-die-traumaheilung": "https://de.scholistico.com/10-kunsttherapie-uebungen-fuer-die-traumaheilung/",
  "10-beste-kunsttherapie-aktivitaeten-zum-stressabbau": "https://de.scholistico.com/10-beste-kunsttherapie-aktivitaeten-zum-stressabbau/",
  "der-ultimative-guide-zu-kunsttherapeutischen-aktivitaeten-fuer-schlaf-und-erholsame-naechte": "https://de.scholistico.com/der-ultimative-guide-zu-kunsttherapeutischen-aktivitaeten-fuer-schlaf-und-erholsame-naechte/",
  "die-schnittstelle-zwischen-kunsttherapie-pe

In [None]:
        {"AT":"Art Therapy",
        "NAT":"Naturopathy",
        "HNC":"Nutrition",
        "HHP":"Holistic Health",
        "ST":"Sound Therapy",
        "MT":"Movement Therapy",
        "ACS":"Animal Wellness"
        }


In [53]:
import pandas as pd
from pathlib import Path

COURSE_MAPPING = {
    "AT": "Art Therapy",
    "NAT": "Naturopathy",
    "HNC": "Nutrition",
    "HHP": "Holistic Health",
    "ST": "Sound Therapy",
    "MT": "Movement Therapy",
    "ACS": "Animal Wellness"
}

LANGUAGE_MAPPING = {
    'en': 'English',
    'es': 'Spanish', 
    'de': 'German',
    'it': 'Italian',
    'nl': 'Dutch'
}

def get_section_dict(filename, section_name):
    with open(filename, 'r') as file:
        content = file.read()
    
    sections = content.split('#')
    
    for section in sections:
        if section.strip():
            parts = section.strip().split('\n', 1)
            current_section_name = parts[0].strip()
            if current_section_name == section_name:
                return parts[1].strip()
    
    return "{}"

def create_df():
    data = []
    
    # Reverse COURSE_MAPPING for lookup
    REVERSE_MAPPING = {v: k for k, v in COURSE_MAPPING.items()}
    
    for file in Path().glob("*_categorized.txt"):
        lang_code = file.stem.split('_')[0]
        file_type = file.stem.split('_')[1]
        
        for section_name in COURSE_MAPPING.values():
            section_dict = get_section_dict(str(file), section_name)
            
            row = {
                'course_code': REVERSE_MAPPING[section_name],
                'language': LANGUAGE_MAPPING[lang_code],
                'country_code': lang_code,
                'website': f"{lang_code}.scholistico",
                'course_name': section_name,
                'course_link': '',
                'result_blog_dict': '{}',
                'results_product_dict': '{}',
                'inspiration_blog_dict': '{}',
                'course_document_path': f"courses/{REVERSE_MAPPING[section_name]}"
            }
            
            if file_type == 'blog':
                row['result_blog_dict'] = section_dict
                row['inspiration_blog_dict'] = section_dict
            else:
                row['results_product_dict'] = section_dict
                if section_dict != "{}":
                    # Extract first URL as course link
                    try:
                        import ast
                        dict_content = ast.literal_eval(section_dict)
                        if dict_content:
                            row['course_link'] = list(dict_content.values())[0]
                    except:
                        pass
            
            data.append(row)
    
    df = pd.DataFrame(data)
    df = df.groupby(['course_code', 'language', 'country_code', 'website', 'course_name', 'course_document_path']).agg({
        'course_link': 'first',
        'result_blog_dict': 'first',
        'results_product_dict': 'first',
        'inspiration_blog_dict': 'first'
    }).reset_index()
    
    return df

if __name__ == "__main__":
    df = create_df()
    df.to_csv('course_data.csv', index=False)
    print(f"\nProcessed {len(df)} rows")
    print("\nSample data:")
    print(df[['course_code', 'language', 'course_name']].head())


Processed 35 rows

Sample data:
  course_code language      course_name
0         ACS    Dutch  Animal Wellness
1         ACS  English  Animal Wellness
2         ACS   German  Animal Wellness
3         ACS  Italian  Animal Wellness
4         ACS  Spanish  Animal Wellness


In [54]:

import pandas as pd
from pathlib import Path
import json
import ast

COURSE_MAPPING = {
    "AT": "Art Therapy",
    "NAT": "Naturopathy",
    "HNC": "Nutrition",
    "HHP": "Holistic Health",
    "ST": "Sound Therapy",
    "MT": "Movement Therapy",
    "ACS": "Animal Wellness"
}

LANGUAGE_MAPPING = {
    'en': 'English',
    'es': 'Spanish', 
    'de': 'German',
    'it': 'Italian',
    'nl': 'Dutch'
}

def get_section_dict(filename, section_name):
    with open(filename, 'r') as file:
        content = file.read()
    
    sections = content.split('#')
    
    for section in sections:
        if section.strip():
            parts = section.strip().split('\n', 1)
            current_section_name = parts[0].strip()
            if current_section_name == section_name:
                return parts[1].strip()
    
    return "{}"

def process_files():
    data = []
    REVERSE_MAPPING = {v: k for k, v in COURSE_MAPPING.items()}
    
    # First, get all language codes
    lang_codes = set()
    for file in Path().glob("*_categorized.txt"):
        lang_code = file.stem.split('_')[0]
        lang_codes.add(lang_code)
    
    # Process each course type for each language
    for section_name in COURSE_MAPPING.values():
        for lang_code in lang_codes:
            blog_file = f"{lang_code}_blog_categorized.txt"
            product_file = f"{lang_code}_product_categorized.txt"
            
            # Get blog and product content for this section
            blog_content = get_section_dict(blog_file, section_name) if Path(blog_file).exists() else "{}"
            product_content = get_section_dict(product_file, section_name) if Path(product_file).exists() else "{}"
            
            # Create row only if we have content
            if blog_content != "{}" or product_content != "{}":
                course_code = REVERSE_MAPPING[section_name]
                
                # Try to get course link from product content
                course_link = ""
                if product_content != "{}":
                    try:
                        product_dict = ast.literal_eval(product_content)
                        if product_dict:
                            course_link = list(product_dict.values())[0]
                    except:
                        pass
                
                row = {
                    'course_code': course_code,
                    'language': LANGUAGE_MAPPING[lang_code],
                    'country_code': lang_code,
                    'website': f"{lang_code}.scholistico",
                    'course_name': section_name,
                    'course_link': course_link,
                    'result_blog_dict': blog_content,
                    'results_product_dict': product_content,
                    'inspiration_blog_dict': blog_content,
                    'course_document_path': f"courses/{course_code}"
                }
                data.append(row)
                print(f"Processed {section_name} for {lang_code}")
    
    return pd.DataFrame(data)

if __name__ == "__main__":
    print("Processing files...")
    df = process_files()
    
    # Save both full and readable versions
    df.to_csv('course_data_full.csv', index=False)
    

# Create readable version with truncated dictionaries
    df_readable = df.copy()
    for col in ['result_blog_dict', 'results_product_dict', 'inspiration_blog_dict']:
        df_readable[col] = df_readable[col].apply(lambda x: x[:100] + '...' if len(x) > 100 else x)
    df_readable.to_csv('course_data_readable.csv', index=False)
    
    # Print summary
    print("\nProcessing complete!")
    print(f"Total rows: {len(df)}")
    print("\nCourse distribution:")
    print(df['course_code'].value_counts())
    print("\nLanguage distribution:")
    print(df['language'].value_counts())
    
    # Print sample
    print("\nSample data (first 2 rows):")
    sample_cols = ['course_code', 'language', 'course_name', 'course_link']
    print(df[sample_cols].head(2))
    
    # Verify data quality
    print("\nVerifying data quality...")
    print("Empty dictionaries:")
    for col in ['result_blog_dict', 'results_product_dict']:
        empty_count = df[df[col] == '{}'].shape[0]
        print(f"{col}: {empty_count}")
    
    print("\nMissing course links:", df[df['course_link'] == ''].shape[0])

Processing files...
Processed Art Therapy for en
Processed Art Therapy for nl
Processed Art Therapy for it
Processed Art Therapy for de
Processed Art Therapy for es
Processed Naturopathy for en
Processed Naturopathy for nl
Processed Naturopathy for it
Processed Naturopathy for de
Processed Naturopathy for es
Processed Nutrition for en
Processed Nutrition for nl
Processed Nutrition for it
Processed Nutrition for de
Processed Nutrition for es
Processed Holistic Health for en
Processed Holistic Health for nl
Processed Holistic Health for it
Processed Holistic Health for de
Processed Holistic Health for es
Processed Sound Therapy for en
Processed Movement Therapy for en
Processed Movement Therapy for nl
Processed Animal Wellness for en
Processed Animal Wellness for nl
Processed Animal Wellness for it
Processed Animal Wellness for de
Processed Animal Wellness for es

Processing complete!
Total rows: 28

Course distribution:
course_code
AT     5
NAT    5
HNC    5
HHP    5
ACS    5
MT     2
S

In [5]:
import streamlit as st
import pandas as pd
import json
import zipfile
import os
import ast 
from io import BytesIO
from datetime import datetime
import os
from dotenv import load_dotenv  
from Utilites import *
from runbook import *
import pandas as pd
load_dotenv()  
import requests
from requests.auth import HTTPBasicAuth

ANTHROPIC_API_KEY=os.getenv('ANTHROPIC_API_KEY')  
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  
PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY')   
FAL_KEY = os.getenv('FAL_KEY')


import random
import ast

def read_input_file(file_path: str) -> Dict:
    """Read and parse the input JSON file."""
    with open(file_path, 'r') as file:
        return json.load(file)

def get_random_values_from_string(dictionary_string, num_values):
  try:
    dictionary = ast.literal_eval(dictionary_string)
  except (SyntaxError, ValueError):
    print("Invalid dictionary string format.")
    return {}  # Return an empty dictionary in case of invalid input

  num_values_to_extract = min(num_values, len(dictionary))

  # Create a list of randomly selected keys
  random_keys = random.sample(list(dictionary.keys()), num_values_to_extract)

  # Create a new dictionary with the selected key-value pairs
  random_dict = {key: dictionary[key] for key in random_keys}
  new_keys=map(lambda x: x.replace('-',' ').title(),list(random_dict.keys()))
  random_dict1 = dict(zip(new_keys, random_dict.values()))

  return random_dict1

def GenerateBlogStructure(topic,
                          course_name,
                          course_link,
                          results_blog_dict,
                          results_product_dict,
                          num_of_words_in_blog,
                          num_of_images_in_blog,
                          num_of_headings,
                          blog_percent,
                          promotion_percent,
                          num_of_infographics,
                          number_of_case_studies,
                          inpiration_blog_dict,
                          number_of_tables,
                          num_of_outbound_links,
                          language
                          ):
    blog_content = generate_blog_post(topic,
                                      course_name,
                                      course_link,
                                      results_blog_dict,
                                      results_product_dict,
                                      num_of_words_in_blog,
                                      num_of_images_in_blog,
                                      num_of_headings,
                                      blog_percent,
                                      promotion_percent,
                                      num_of_infographics,
                                      number_of_case_studies,
                                      inpiration_blog_dict.values(),
                                      number_of_tables,
                                      num_of_outbound_links,
                                      language
                                      )


    blog_content=add_content_id(blog_content)

    structure_path='Blogs/Organization/html/blog_structure/'



    blog_identifier=save_html(blog_content,structure_path)


    return blog_identifier

# def SaveImages(blog_identifier):
#     structure_path='Blogs/Organization/html/blog_structure/'
#     with open(f"{structure_path}{blog_identifier}.html") as f:
#           blog_content=f.read()

#     prompts = extract_dalle_prompts(blog_content)
#     images= generate_images(prompts, "Create Ultra Realistic images, No cartoonish images")


#     image_path=f'Blogs/Organization/images/{blog_identifier}/'
#     json_path=f'Blogs/Organization/json/{blog_identifier}.json'

#     image_dict = create_image_dict(blog_content, images)
#     image_path_dict = save_images(image_dict, image_path)
#     save_image_paths(image_path_dict, json_path)

# def ProduceFinalOutput(blog_identifier,website):
#     json_path=f'Blogs/Organization/json/{blog_identifier}.json'
#     structure_path='Blogs/Organization/html/blog_structure/'
#     with open(f"{structure_path}{blog_identifier}.html") as f:
#           blog_content=f.read()
#     updated_html = update_html_with_images(blog_content, json_path,website)
#     print("Add styling")
#     # final_html = add_styling(updated_html)
#     # final_html=updated_html.replace('<img','<figure')
#     updated_html=create_formatted_html(updated_html)
#     output_path='Blogs/Organization/html/final_output/'
#     save_html(updated_html,output_path,blog_identifier)

def SaveImages(blog_identifier):
    structure_path='Blogs/Organization/html/blog_structure/'
    with open(f"{structure_path}{blog_identifier}.html") as f:
          blog_content=f.read()

    prompts = extract_dalle_prompts(blog_content)
    images= generate_images(prompts, "Create Ultra Realistic images, No cartoonish images")

    image_titles=generate_titles_for_prompts(prompts,client)

    image_path=f'Blogs/Organization/images/{blog_identifier}/'
    json_path=f'Blogs/Organization/json/{blog_identifier}.json'

    title_dict,image_dict = create_image_dict(blog_content, images, image_titles)
    image_path_dict = save_images(image_dict, image_path,title_dict)
    save_image_paths(image_path_dict, json_path)

def ProduceFinalOutput(blog_identifier,website):
    json_path=f'Blogs/Organization/json/{blog_identifier}.json'
    structure_path='Blogs/Organization/html/blog_structure/'
    with open(f"{structure_path}{blog_identifier}.html") as f:
          blog_content=f.read()
    updated_html = update_html_with_images(blog_content, json_path,website)
    print("Add styling")
    # final_html = add_styling(updated_html)
    # final_html=updated_html.replace('<img','<figure')
    updated_html=create_formatted_html(updated_html)
    output_path='Blogs/Organization/html/final_output/'
    save_html(updated_html,output_path,blog_identifier)

def DeleteImage(blog_identifier, Content_id):
    image_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'

    width = height = None  # Initialize width and height

    try:
        if os.path.exists(image_path):
            # Get image size before deleting
            with Image.open(image_path) as img:
                width, height = img.size

            os.remove(image_path)
            print(f"Image deleted successfully: {image_path}")
        else:
            print(f"Image not found: {image_path}")
    except Exception as e:
        print(f"Error deleting image: {e}")

    return width, height


cloudinary.config(
    cloud_name = "ddzaqiihn",
    api_key = "626813739546286",
    api_secret = "MXztcr6KdWsM7XnYGLdBeXegnIw"
)


def AddImage(source_path, blog_identifier, Content_id, width=None, height=None):
    destination_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'

    try:
        # Ensure the destination directory exists
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)

        # Upload the image to Cloudinary
        upload_result = cloudinary.uploader.upload(source_path)

        # Generate the Cloudinary URL with transformations
        transformation = []
        if width and height:
            transformation = [
                {'width': width, 'height': height, 'crop': 'fill'}
            ]

        resized_url = cloudinary.utils.cloudinary_url(
            upload_result['public_id'],
            transformation=transformation
        )[0]

        # Download the image from Cloudinary
        response = requests.get(resized_url)
        if response.status_code == 200:
            # Save the image content to the destination path
            with open(destination_path, 'wb') as f:
                f.write(response.content)

            print(f"Image added successfully: {destination_path}")
            return resized_url
        else:
            print(f"Failed to download image from Cloudinary. Status code: {response.status_code}")

    except FileNotFoundError:
        print(f"Source file not found: {source_path}")
    except PermissionError:
        print(f"Permission denied. Unable to upload the file.")
    except Exception as e:
        print(f"Error adding image: {e}")

    return None


def UpdateBlogStructure(html_identifier, content_id, updated_text):
    html_file_path=f'Blogs/Organization/html/blog_structure/{html_identifier}.html'
    # Read HTML from the identifier (file path)
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the element with the specified content_id
    element = soup.find(content_id=content_id)

    if element:
        # Update the text content of the element
        element.string = updated_text

        # Write the updated HTML back to the file
        with open(html_file_path, 'w', encoding='utf-8') as file:
            file.write(str(soup))

        print(f"HTML updated successfully. Content with ID '{content_id}' has been updated.")
    else:
        print(f"Element with ID '{content_id}' not found in the HTML.")

from PIL import Image, ImageDraw, ImageFont
import textwrap
from typing import Tuple
import numpy as np
import PIL

def get_multiline_text_size(text, font, draw):
    """Calculate the width and height of multi-line text."""
    lines = text.split("\n")
    max_width = 0
    total_height = 0

    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        line_width = bbox[2] - bbox[0]
        line_height = bbox[3] - bbox[1]
        max_width = max(max_width, line_width)
        total_height += line_height

    return max_width, total_height

def create_simple_overlay(image_path, output_path, title, subtitle,
                          margin_ratio=0.1,
                          box_fill_color=(93, 108, 50, 153),
                          border_color="white",
                          border_thickness=2,
                          inner_margin_ratio=0.05):
    # Load the image
    background = Image.open(image_path).convert("RGBA")
    img_width, img_height = background.size

    # Calculate the margins
    margin_x = int(img_width * margin_ratio)
    margin_y = int(img_height * margin_ratio)

    # Calculate square size based on available height
    square_size = img_height - (2 * margin_y)

    # Coordinates for the square box
    left = margin_x
    top = margin_y
    right = left + square_size
    bottom = top + square_size

    # Calculate inner border coordinates (5% inside)
    border_offset = int(square_size * inner_margin_ratio)
    inner_left = left + border_offset
    inner_top = top + border_offset
    inner_right = right - border_offset
    inner_bottom = bottom - border_offset

    # Draw the overlay
    overlay = Image.new("RGBA", background.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    # Draw main box
    draw.rectangle([left, top, right, bottom], fill=box_fill_color)

    # Draw inner border
    draw.rectangle([inner_left, inner_top, inner_right, inner_bottom],
                  outline=border_color, width=border_thickness)

    # Set initial font size
    font_path = "Merriweather-Bold.ttf"
    title_font_size = 20
    subtitle_font_size = 20
    title_font = ImageFont.truetype(font_path, title_font_size)
    subtitle_font = ImageFont.truetype(font_path, subtitle_font_size)

    # Adjust the text area to fit within the inner border
    text_area_width = inner_right - inner_left
    text_area_height = inner_bottom - inner_top

    # Dynamically adjust the title font size for the new text area
    while True:
        title_width, title_height = get_multiline_text_size(title, title_font, draw)
        if title_width <= text_area_width * 0.9 and title_height <= text_area_height * 0.6:
            title_font_size += 2
            title_font = ImageFont.truetype(font_path, title_font_size)
        else:
            title_font_size -= 2
            title_font = ImageFont.truetype(font_path, title_font_size)
            break

    # Dynamically adjust the subtitle font size
    while True:
        subtitle_width, subtitle_height = get_multiline_text_size(subtitle, subtitle_font, draw)
        if subtitle_width <= square_size * 0.9 and subtitle_height <= square_size * 0.3:
            subtitle_font_size += 2
            subtitle_font = ImageFont.truetype(font_path, subtitle_font_size)
        else:
            subtitle_font_size -= 2
            subtitle_font = ImageFont.truetype(font_path, subtitle_font_size)
            break
    # Center text within the square
    title_width, title_height = get_multiline_text_size(title, title_font, draw)
    subtitle_width, subtitle_height = get_multiline_text_size(subtitle, subtitle_font, draw)

    # Define spacing between title and subtitle (adjust this value as needed)
    title_subtitle_spacing = square_size * 0.1  # 10% of square size for spacing

    # Adjust font sizes
    # Make subtitle smaller relative to title
    while True:
        subtitle_width, subtitle_height = get_multiline_text_size(subtitle, subtitle_font, draw)
        if subtitle_width <= square_size * 0.8:  # Making subtitle width smaller than title
            break
        subtitle_font_size -= 2
        subtitle_font = ImageFont.truetype(font_path, subtitle_font_size)

    # Ensure subtitle is proportionally smaller than title
    if subtitle_font_size > title_font_size * 0.6:  # Subtitle will be 60% of title size
        subtitle_font_size = int(title_font_size * 0.6)
        subtitle_font = ImageFont.truetype(font_path, subtitle_font_size)
        subtitle_width, subtitle_height = get_multiline_text_size(subtitle, subtitle_font, draw)

    # Calculate text positions with new spacing
    total_content_height = title_height + title_subtitle_spacing + subtitle_height
    start_y = top + (square_size - total_content_height) // 2

    # Position for title
    title_x = left + (square_size - title_width) // 2
    title_y = start_y

    # Position for subtitle
    subtitle_x = left + (square_size - subtitle_width) // 2
    subtitle_y = title_y + title_height + title_subtitle_spacing

    # Draw the text
    draw.multiline_text(
        (title_x, title_y),
        title,
        font=title_font,
        fill="white",
        align="center"
    )

    draw.multiline_text(
        (subtitle_x, subtitle_y),
        subtitle,
        font=subtitle_font,
        fill="white",
        align="center"
    )

    # Merge overlay with background and save
    final_image = Image.alpha_composite(background, overlay)
    final_image.save(output_path)


# from anthropic import Anthropic

# def translate_text(input_string, target_language):
#     """Translate text using Anthropic's Claude."""
#     try:
#         # Initialize the Anthropic client
#         anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)  # Replace with your Anthropic API key
        
#         # Prepare the prompt
#         prompt = f"\n\nHuman: Translate the following text to {target_language}: {input_string}\n\nAssistant:"
        
#         # Call the Anthropic API for translation
#         response = anthropic.messages.create(
#             model="claude-3-opus-20240229",  # or "claude-2.1" depending on your needs
#             max_tokens=1000,
#             messages=[{
#                 "role": "user",
#                 "content": f"Translate the following text to {target_language}: {input_string}. if already in {target_language}, return {input_string} as it is. No Preamble"
#             }]
#         )
        
#         # Extract the translated text from the response
#         translated_text = response.content[0].text.strip()
#         return translated_text
        
#     except Exception as e:
#         print(f"Translation error: {str(e)}")
#         return input_string  # Return original string if translation fails
def translate_text(input_string: str, target_language: str, max_length=30) -> str:
    """
    Use Perplexity to detect language and translate if needed.
    Returns original text if source and target languages match.
    """
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # First, detect the language
    detect_payload = {
        "model": "llama-3.1-sonar-small-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "You are a language detector. Respond with only the language . one word output. No additional text. without any special charaters including fullstop."
            },
            {
                "role": "user",
                "content": f"What language is this text in: {input_string}"
            }
        ]
    }

    try:
        detect_response = requests.post(url, json=detect_payload, headers=headers)
        detect_response.raise_for_status()
        source_language = detect_response.json()['choices'][0]['message']['content'].strip().lower()

        # If source and target languages match, return original
        if source_language == target_language.lower():
            return input_string

        # If languages differ, translate
        translate_payload = {
            "model": "llama-3.1-sonar-small-128k-online",
            "messages": [
                {
                    "role": "system",
                    "content": (
                        f"You are a translator, a very lazy translator, who just translates exact given input strings, not additional responses. Translate the following text to {target_language}. "
                        "Provide ONLY the direct translation of the exact text. "
                        "Do not add any explanations or additional content. "
                        "Do not expand or elaborate on the original text."
                    )
                },
                {
                    "role": "user",
                    "content": f"Translate this text exactly as is: {input_string}. Don't use explain the text. respond strictly in {target_language}, keep your response strictly under {max_length} characters."
                }
            ]
        }

        translate_response = requests.post(url, json=translate_payload, headers=headers)
        translate_response.raise_for_status()
        return translate_response.json()['choices'][0]['message']['content'].strip()

    except Exception as e:
        print(f"Error: {e}")
        return input_string

def beautify(input_string, language ,max_length=15):

    input_string=translate_text(input_string,language)
    words = input_string.split()  # Split the string into words
    lines = []  # To hold the final lines
    current_line = []  # To construct the current line
    
    for word in words:
        # If adding the next word exceeds the max_length, finalize the current line
        if sum(len(w) for w in current_line) + len(current_line) + len(word) > max_length:
            lines.append(" ".join(current_line).title())  # Capitalize each word in the line
            current_line = [word]  # Start a new line with the current word
        else:
            current_line.append(word)
    
    # Append the last line if it's not empty
    if current_line:
        lines.append(" ".join(current_line).title())  # Capitalize each word in the line
    
    return "\n".join(lines)  # Join the lines with \n

def extract_and_format_number(input_string):
    import re

    # Extract the leading number using regex
    match = re.match(r"(\d+)", input_string)
    if match:
        num = int(match.group(1))  # Extract the number and convert to integer
        formatted = f"{num}-{num + 1}"
        return formatted, num
    else:
        raise ValueError("The input string does not start with a number.")
    
from bs4 import BeautifulSoup, Comment

def extract_post_details(html_file_path):
    """
    Extract title, meta, and HTML content from a file.
    Returns a dictionary containing these details along with a generated slug.
    """
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract title
        title_tag = soup.title
        title = title_tag.string.strip() if title_tag else "No title found"

        # Generate slug from title
        slug = title.lower().replace(' ', '-')
        slug = ''.join(c for c in slug if c.isalnum() or c == '-')

        # Extract meta content
        meta_contents = " ".join([
            tag.get("content", "")
            for tag in soup.find_all("meta")
            if tag.get("content") and "width=device-width" not in tag.get("content", "")
        ])

        # Extract HTML content using the original method
        start_index = html_content.find("<!-- wp:heading -->")
        if start_index == -1:
            raise ValueError("The starting tag '<!-- wp:heading -->' was not found.")

        end_index = html_content.find("</html>", start_index)
        if end_index == -1:
            end_index = len(html_content)

        html_snippet = html_content[start_index:end_index]

        return {
            "title": title,
            "slug": slug,
            "meta": meta_contents,
            "content": html_snippet
        }

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return {
            "title": "",
            "slug": "",
            "meta": "",
            "content": ""
        }

def create_zip_file(directory):
    memory_file = BytesIO()
    with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, directory)
                zipf.write(file_path, arcname)
    memory_file.seek(0)
    return memory_file
def save_html_details_to_file(details: Dict, language: str, title: str, output_file: str):
    """Save HTML details to a text file with proper formatting."""
    with open(output_file, 'a', encoding='utf-8') as file:
        file.write(f"\n{'='*80}\n")
        file.write(f"Language: {language}\n")
        file.write(f"Title: {title}\n")
        file.write(f"{'='*80}\n\n")
        
        file.write("META CONTENTS:\n")
        file.write(f"{'-'*50}\n")
        file.write(f"{details['meta_contents']}\n\n")
        
        file.write("HTML SNIPPET:\n")
        file.write(f"{'-'*50}\n")
        file.write(f"{details['html_snippet']}\n\n")
        
        file.write(f"{'#'*80}\n\n")

# def main():
#     course_df = pd.read_csv('course_data_full.csv')
#     #### get title,subtitle, course_code from 


#     title = beautify(title,language)

#     subtitle = beautify(subtitle,language)
#     course_df=pd.read_csv('course_data_full.csv')
#     course_dict=course_df.loc[(course_df.course_code==course_code) & (course_df.language==language)].iloc[0].to_dict()

#     website=course_dict['website']
#     course_name=course_dict['course_name']
#     course_link=course_dict['course_link']
#     results_blog_dict=get_random_values_from_string(course_dict['result_blog_dict'],4)
#     results_product_dict=get_random_values_from_string(course_dict['results_product_dict'],4)
#     inpiration_blog_dict=get_random_values_from_string(course_dict['inspiration_blog_dict'],4)

#     num_of_words_in_blog,num_of_images_in_blog,num_of_headings,blog_percent,promotion_percent,num_of_infographics,user_notes_for_image,number_of_case_studies,number_of_tables,num_of_outbound_links=fetch_blog_specifications()
#     num_of_images_in_blog,num_of_headings=extract_and_format_number(title)

#     blog_identifier=GenerateBlogStructure(title,
#                                         course_name,
#                                         course_link,
#                                         results_blog_dict,
#                                         results_product_dict,
#                                         num_of_words_in_blog,
#                                         num_of_images_in_blog,
#                                         num_of_headings,
#                                         blog_percent,
#                                         promotion_percent,
#                                         num_of_infographics,
#                                         number_of_case_studies,
#                                         inpiration_blog_dict,
#                                         number_of_tables,
#                                         num_of_outbound_links,language)

#     SaveImages(blog_identifier)
#     ProduceFinalOutput(blog_identifier,website)

#     with open(f'Blogs/Organization/json/{blog_identifier}.json', "r") as file:
#         data = json.load(file)

#     Content_id=list(data.keys())[0]


#     input_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'
#     output_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}-header.png'



#     try:
#         result_image = create_simple_overlay(
#             image_path=input_path,
#             output_path=output_path,
#             title=title,
#             subtitle=subtitle  # Added subtitle parameter
#         )
#         print("Image created successfully!")
#     except Exception as e:
#         print(f"Error creating image: {str(e)}")

#     html_file_path = f"Blogs/Organization/html/final_output/{blog_identifier}.html"
#     details = extract_html_details(html_file_path)
#     print("Title:", details["title"])
#     print("Meta Contents:", details["meta_contents"])
#     print("HTML Snippet:", details["html_snippet"])

# if __name__ == "__main__":
#     main()

import requests
import os
import random

def find_relevant_tags(blog_title: str, tag_list: list) -> list:
    """Find 2-3 most relevant tags from a randomly sorted list for a given blog title."""
    
    # Create a copy of the tag list and shuffle it
    shuffled_tags = tag_list.copy()
    random.shuffle(shuffled_tags)
    
    # Get API key from environment variable
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    # API endpoint
    url = "https://api.perplexity.ai/chat/completions"

    # Headers with API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Create the prompt with shuffled tags
    prompt = f"""Given the blog title: "{blog_title}"
    And this list of tags: {', '.join(shuffled_tags)}
    Return only 2-3 most relevant tags that best match the blog title's topic.
    Provide the response as a comma-separated list without explanations."""

    # Payload for the API
    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "You are a content categorization expert. Provide direct, concise responses with only the most relevant tags."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    try:
        # Make the API request
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Extract the suggested tags
        result = response.json()
        suggested_tags = result['choices'][0]['message']['content'].strip()
        
        # Convert the comma-separated string to list and clean up
        tag_recommendations = [tag.strip() for tag in suggested_tags.split(',')]
        
        # Ensure we return only tags that exist in the original tag_list
        final_tags = [tag for tag in tag_recommendations if tag in tag_list][:3]
        
        return final_tags

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the Perplexity API: {e}")
        return []
    except KeyError as e:
        print(f"Unexpected response structure: {e}")
        return []

import requests
from requests.auth import HTTPBasicAuth
import os


def upload_image(image_path):
    headers = {
        "Authorization": f"Basic {username}:{password}"
    }
    with open(image_path, "rb") as img_file:
        files = {"file": img_file}
        response = requests.post(
            f"{site_url}/media",
            auth=HTTPBasicAuth(username, password),
            files=files
        )
    if response.status_code == 201:
        return response.json().get("id")
    else:
        print("Failed to upload image:", response.json())
        return None

def get_or_create_tags(tags_list):
    tag_ids = []
    for tag in tags_list:
        response = requests.get(
            f"{site_url}/tags?search={tag}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            tag_ids.append(response.json()[0]["id"])
        else:
            response = requests.post(
                f"{site_url}/tags",
                auth=HTTPBasicAuth(username, password),
                json={"name": tag}
            )
            if response.status_code == 201:
                tag_ids.append(response.json()["id"])
    return tag_ids

def get_or_create_categories(categories_list):
    category_ids = []
    for category in categories_list:
        response = requests.get(
            f"{site_url}/categories?search={category}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            category_ids.append(response.json()[0]["id"])
        else:
            response = requests.post(
                f"{site_url}/categories",
                auth=HTTPBasicAuth(username, password),
                json={"name": category}
            )
            if response.status_code == 201:
                category_ids.append(response.json()["id"])
    return category_ids

def get_user_id_by_username(username):
    response = requests.get(
        f"{site_url}/users?search={username}",
        auth=HTTPBasicAuth(username, password)
    )
    if response.status_code == 200:
        users = response.json()
        if users:
            return users[0]['id']
    return 1  # Default to user ID 1 if not found

def create_wordpress_post(details, blog_identifier, Tags, Category):
    print("creating word press post")
    print(details)
    # Get post details from the details dictionary
    post_title = details['title']
    html_content = details['content']
    post_slug = details['slug']
    excerpt = details['meta']

    print("details extracted")

    # Prepare Rank Math metadata
    rank_math_meta = {
        "rank_math_title": post_title,
        "rank_math_focus_keyword": ", ".join(Tags),  # Using tags as keywords
        "rank_math_permalink": post_slug
    }

    print("rank extracted")

    # Upload images and find header image
    image_ids = []
    featured_image_id = None
    image_dir = f'Blogs/Organization/images/{blog_identifier}'
    
    if os.path.exists(image_dir):
        for image_file in os.listdir(image_dir):
            if image_file.endswith(('.png', '.jpg', '.jpeg')):
                image_path = os.path.join(image_dir, image_file)
                image_id = upload_image(image_path)
                
                if image_id:
                    # If the image filename contains 'header', set it as featured image
                    if 'header' in image_file.lower():
                        featured_image_id = image_id
                    image_ids.append(image_id)


    print("header image found")

    # Prepare the post data
    author_id = 3

    # Prepare the post data
    post_data = {
        "title": post_title,
        "content": html_content,
        "slug": post_slug,
        "status": "draft",
        "excerpt": excerpt,
        "author": author_id,  # Using the fetched user ID
        "tags": get_or_create_tags(Tags),
        "categories": get_or_create_categories(Category),
        "meta": rank_math_meta,
    }
    # Set featured image if header image was found
    if featured_image_id:
        post_data["featured_media"] = featured_image_id
    elif image_ids:  # Fallback to first image if no header image found
        post_data["featured_media"] = image_ids[0]


    print("post data created")

    # Create the post
    response = requests.post(
        f"{site_url}/posts",
        auth=HTTPBasicAuth(username, password),
        json=post_data
    )

    print("posting done")

    if response.status_code == 201:
        print("Post created successfully!")
        print("Post ID:", response.json().get("id"))
        if featured_image_id:
            print("Header image set as thumbnail successfully!")
        else:
            print("Warning: No header image found to set as thumbnail")
    else:
        print("Failed to create post:", response.json())

def title_check_func(blog_identifier):
    with open(f'Blogs/Organization/html/blog_structure/{blog_identifier}.html', "r", encoding="utf-8") as file:
            html_content = file.read()
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract title
    title_tag = soup.title
    title_check = title_tag.string.strip() if title_tag else "No title found"

    if title_check=="No title found":
        return True
    else:
        return False

def process_topic(topic: Dict, language: str, max_retry=3):
    """Process a single topic for a specific language."""
    try:
        course_df = pd.read_csv('course_data_full.csv')
        
        title = beautify(topic['title'], language)
        subtitle = beautify(topic['subtitle'], language)
        subtitle = re.sub(r'\s*\(.*?\)', '', subtitle)
        course_code = topic['course_code']

        course_dict = course_df.loc[
            (course_df.course_code == course_code) & 
            (course_df.language == language)
        ].iloc[0].to_dict()
        # WordPress site details



        website = course_dict['website']
        course_name = course_dict['course_name']
        course_link = course_dict['course_link']

        Category = course_dict['Category'].split('|')
        Tags = course_dict['Tags'].split('|')

        Tags=find_relevant_tags(title,Tags)
        
        results_blog_dict = get_random_values_from_string(course_dict['result_blog_dict'], 4)
        results_product_dict = get_random_values_from_string(course_dict['results_product_dict'], 4)
        inpiration_blog_dict = get_random_values_from_string(course_dict['inspiration_blog_dict'], 4)

        num_of_words_in_blog,num_of_images_in_blog,num_of_headings,blog_percent,promotion_percent,num_of_infographics,user_notes_for_image,number_of_case_studies,number_of_tables,num_of_outbound_links=fetch_blog_specifications()
        num_of_images_in_blog, num_of_headings = extract_and_format_number(title)

        global site_url, username, password
        site_url = f"https://{website}.com/wp-json/wp/v2"
        username = "rishabh.deepak.shukla@gmail.com"
        

        # reading the data from the file 
        with open('password.txt') as f: 
            data = f.read() 

        passwords= ast.literal_eval(data) 
        password = passwords[language]

        blog_identifier=GenerateBlogStructure(title,
                                            course_name,
                                            course_link,
                                            results_blog_dict,
                                            results_product_dict,
                                            num_of_words_in_blog,
                                            num_of_images_in_blog,
                                            num_of_headings,
                                            blog_percent,
                                            promotion_percent,
                                            num_of_infographics,
                                            number_of_case_studies,
                                            inpiration_blog_dict,
                                            number_of_tables,
                                            num_of_outbound_links,language)
        

        count=0
        while title_check_func(blog_identifier):
            
            if count==max_retry:
                raise f"unable to create blog for  {title} in {language}"
            print(f"title not found retrying ({count+1})..")
            count+=1
            blog_identifier=GenerateBlogStructure(title,
                                    course_name,
                                    course_link,
                                    results_blog_dict,
                                    results_product_dict,
                                    num_of_words_in_blog,
                                    num_of_images_in_blog,
                                    num_of_headings,
                                    blog_percent,
                                    promotion_percent,
                                    num_of_infographics,
                                    number_of_case_studies,
                                    inpiration_blog_dict,
                                    number_of_tables,
                                    num_of_outbound_links,language)

        SaveImages(blog_identifier)
        ProduceFinalOutput(blog_identifier, website)

        # Process the blog JSON
        json_path = f'Blogs/Organization/json/{blog_identifier}.json'
        with open(json_path, "r") as file:
            data = json.load(file)

        Content_id = list(data.values())[0].split('/')[-1].split('.')[0]

        # Process images
        input_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'
        output_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}-header.png'



        try:
            create_simple_overlay(
                image_path=input_path,
                output_path=output_path,
                title=title,
                subtitle=subtitle
            )
            print(f"Image created successfully for {language} - {title}")
        except Exception as e:
            print(f"Error creating image for {language} - {title}: {str(e)}")


        # Extract HTML details
        html_file_path = f"Blogs/Organization/html/final_output/{blog_identifier}.html"
        details = extract_post_details(html_file_path)

        print("details found")
        
        # Save details to file
        # save_html_details_to_file(details, language, title, output_file)
        
        create_wordpress_post(details, blog_identifier, Tags, Category)
        
        print(f"Processed {language} - {title}")
        
    except Exception as e:
        print(f"Error processing {language} - {title}: {str(e)}")

# def process_topic(topic: Dict, language: str, output_file: str):
#     """Process a single topic for a specific language."""
#     try:
#         course_df = pd.read_csv('course_data_full.csv')
        
#         title = beautify(topic['title'], language)
#         subtitle = beautify(topic['subtitle'], language)
#         course_code = topic['course_code']

#         course_dict = course_df.loc[
#             (course_df.course_code == course_code) & 
#             (course_df.language == language)
#         ].iloc[0].to_dict()

#         website = course_dict['website']
#         course_name = course_dict['course_name']
#         course_link = course_dict['course_link']
        
#         results_blog_dict = get_random_values_from_string(course_dict['result_blog_dict'], 4)
#         results_product_dict = get_random_values_from_string(course_dict['results_product_dict'], 4)
#         inpiration_blog_dict = get_random_values_from_string(course_dict['inspiration_blog_dict'], 4)

#         num_of_words_in_blog,num_of_images_in_blog,num_of_headings,blog_percent,promotion_percent,num_of_infographics,user_notes_for_image,number_of_case_studies,number_of_tables,num_of_outbound_links=fetch_blog_specifications()
#         num_of_images_in_blog, num_of_headings = extract_and_format_number(title)

#         blog_identifier=GenerateBlogStructure(title,
#                                             course_name,
#                                             course_link,
#                                             results_blog_dict,
#                                             results_product_dict,
#                                             num_of_words_in_blog,
#                                             num_of_images_in_blog,
#                                             num_of_headings,
#                                             blog_percent,
#                                             promotion_percent,
#                                             num_of_infographics,
#                                             number_of_case_studies,
#                                             inpiration_blog_dict,
#                                             number_of_tables,
#                                             num_of_outbound_links,language)

#         SaveImages(blog_identifier)
#         ProduceFinalOutput(blog_identifier, website)

#         # Process the blog JSON
#         json_path = f'Blogs/Organization/json/{blog_identifier}.json'
#         with open(json_path, "r") as file:
#             data = json.load(file)

#         Content_id = list(data.keys())[0]

#         # Process images
#         input_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'
#         output_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}-header.png'

#         try:
#             create_simple_overlay(
#                 image_path=input_path,
#                 output_path=output_path,
#                 title=title,
#                 subtitle=subtitle
#             )
#             print(f"Image created successfully for {language} - {title}")
#         except Exception as e:
#             print(f"Error creating image for {language} - {title}: {str(e)}")

#         # Extract HTML details
#         html_file_path = f"Blogs/Organization/html/final_output/{blog_identifier}.html"
#         details = extract_html_details(html_file_path)
        
#         # Save details to file
#         save_html_details_to_file(details, language, title, output_file)
#         print(f"Processed {language} - {title}")
        
#     except Exception as e:
#         print(f"Error processing {language} - {title}: {str(e)}")
import requests
import os

def generate_subtitle(blog_title: str, language:str, length=20) -> str:
    """Generate a subtitle for a given blog title using Perplexity AI."""
    
    # Get API key from environment variable
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    # API endpoint
    url = "https://api.perplexity.ai/chat/completions"

    # Headers with API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Payload for subtitle generation
    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": (
                    f"you are a expert subtitle creator in {language}"
                    "Generate a concise and engaging subtitle for the given blog title. "
                    "The subtitle should be shorter than the title and complement its meaning. "
                    f"length of title is {len(blog_title)} hence subtile should less than {len(blog_title)}. ultimately less than 30 characters regardeless of the title length"
                    "Provide only the subtitle without any explanations.No preamble,No special characters to be returned including (\",:,full stop)"
                )
            },
            {
                "role": "user",
                "content": f"Generate a subtitle for this blog title: {blog_title}. the length of subtitle should be at max {length} characters which includes spaces."
            }
        ]
    }

    try:
        # Make the API request
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Extract the generated subtitle
        result = response.json()
        subtitle = result['choices'][0]['message']['content'].strip()

            
        return subtitle

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the Perplexity API: {e}")
        return "Error generating subtitle"
    except KeyError as e:
        print(f"Unexpected response structure: {e}")
        return "Error generating subtitle"

def main():
    """Main function to process all languages and topics."""
    # Create output file name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"html_details_{timestamp}.txt"
    
    # Add header to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("HTML DETAILS EXTRACTION REPORT\n")
        file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        file.write(f"{'='*80}\n\n")
    
    # Read input file
    input_data = read_input_file('input_file.txt')
    
    # Process each language and topic combination
    for language in input_data['language']:
        for topic in input_data['topic']:
            print(f"\nProcessing {language} - {topic['title']}")
            topic['subtitle']=generate_subtitle(topic['title'],language).replace("\"","")
            process_topic(topic, language, output_file)
    
    print(f"\nOutput has been saved to: {output_file}")

if __name__ == "__main__":
    main()


Processing German - 9 Protein-Rich Foods for Vegetarians


KeyboardInterrupt: 

In [1]:
import concurrent.futures
from datetime import datetime
import pandas as pd
import json
import ast
import re

def process_combination(topic, language, output_file):
    """Wrapper function to process a single topic-language combination."""
    print(f"\nProcessing {language} - {topic['title']}")
    topic['subtitle'] = generate_subtitle(topic['title'], language).replace("\"", "")
    process_topic(topic, language, output_file)

def main():
    """Main function to process all languages and topics in parallel."""
    # Create output file name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"html_details_{timestamp}.txt"
    
    # Add header to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("HTML DETAILS EXTRACTION REPORT\n")
        file.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        file.write(f"{'='*80}\n\n")
    
    # Read input file
    input_data = read_input_file('input_file.txt')

    # Using ThreadPoolExecutor for parallel execution
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for language in input_data['language']:
            for topic in input_data['topic']:
                futures.append(executor.submit(process_combination, topic, language, output_file))

        # Wait for all threads to complete
        concurrent.futures.wait(futures)

    print(f"\nOutput has been saved to: {output_file}")

if __name__ == "__main__":
    main()


NameError: name 'read_input_file' is not defined

In [4]:
translate_text("Transforming Pain into Art","italian")

'Transforming Pain into Art'

In [22]:
import os
import openai

def chunk_text(text, max_chunk_size=4000):
    """Splits a large text into smaller chunks, ensuring they fit within token limits."""
    chunks = []
    while len(text) > max_chunk_size:
        split_index = text[:max_chunk_size].rfind("\n")  # Try to split at a newline
        if split_index == -1:  # If no newline is found, split at max_chunk_size
            split_index = max_chunk_size
        chunks.append(text[:split_index])
        text = text[split_index:].strip()
    chunks.append(text)  # Append the remaining part
    return chunks

def fix_html_chunk(chunk):
    """Fixes syntax errors in a single chunk of HTML using OpenAI's GPT-4o."""
    
    prompt = f"""Your task is to **fix ONLY syntax errors** in the following HTML while keeping its content, structure, and length nearly identical.

### **INSTRUCTIONS:**
- **Fix ONLY syntax or structural errors.** Do NOT modify, rewrite, or interpret content.
- **Preserve all text, comments, and WordPress-specific tags** (e.g., `<!-- wp:* -->`).
- **Maintain the original formatting, spacing, and indentation.**  
- **Ensure the length is within ±5%** of the original.  
- **Do NOT add explanations, comments, or extra content.**  
- **make sure that if the links are there with section \"Explore more\" or \"References\" in any language english,spanish,italian etc has proper closing **
- **Return ONLY the fully corrected HTML with no preamble or notes.**

### **HTML TO FIX**
{chunk}


**EXPECTED OUTPUT:**  
Return ONLY the corrected HTML **inside triple backticks** (```) with no additional text.

(fixed HTML here)

"""

    try:
        response = openai.ChatCompletion.create(
            model="chatgpt-4o-latest",
            messages=[{"role": "system", "content": "You are an expert in fixing HTML syntax errors."},
                      {"role": "user", "content": prompt}],
            max_tokens=4096,
            temperature=0.1
        )

        # Extract and clean the corrected HTML
        raw_output = response['choices'][0]['message']['content'].strip()
        fixed_html = raw_output.strip("```")  # Remove surrounding backticks if present
        return fixed_html

    except Exception as e:
        print(f"❌ Error occurred: {e}")
        return chunk  # Return original chunk if an error occurs

def fix_html(input_file: str, output_file: str):
    """Reads an HTML file, processes it in chunks, and writes the corrected HTML to an output file."""
    
    # Load API key
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found. Please set the environment variable.")

    openai.api_key = api_key

    # Read input HTML content
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"File '{input_file}' not found.")

    # Split HTML into manageable chunks
    chunks = chunk_text(html_content)

    print(f"🔹 Processing {len(chunks)} chunks...")

    # Process each chunk
    fixed_chunks = [fix_html_chunk(chunk) for chunk in chunks]

    # Save the corrected HTML
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("\n".join(fixed_chunks))

    print(f"✅ HTML fixed successfully. Output saved to '{output_file}'")

# Run if executed as a script
if __name__ == "__main__":
    fix_html("html_details", "fixed_html.html")



🔹 Processing 11 chunks...
✅ HTML fixed successfully. Output saved to 'fixed_html.html'


In [2]:
import os
import openai

# Set your OpenAI API key
ANTHROPIC_API_KEY=os.getenv('ANTHROPIC_API_KEY')  
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  
PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY')   
FAL_KEY = os.getenv('FAL_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Check if the API key is correctly loaded
if not openai.api_key:
    raise ValueError("API key not found. Please set the 'OPENAI_API_KEY' environment variable.")

# Example ChatCompletion request
try:
    while True:
        # Correcting the model name to a valid one such as "gpt-4" or "gpt-3.5-turbo"
        response = openai.ChatCompletion.create(
            model="chatgpt-4o-latest",  # Replace with the correct model name
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hi"}
            ],
            max_tokens=200,  # Adjust as needed for your use case
            temperature=0.7
        )

        # Extract the generated content
        new_content = response['choices'][0]['message']['content']
        print("Generated response:", new_content)

        # Update or use the content (example with a loop)
        previous_html = new_content
        print("Updated HTML:", previous_html)

except Exception as e:
    print(f"Error occurred: {e}")


Generated response: Hello! How can I assist you today? 😊
Updated HTML: Hello! How can I assist you today? 😊
Generated response: Hello! How can I assist you today? 😊
Updated HTML: Hello! How can I assist you today? 😊
Generated response: Hello! How can I assist you today? 😊
Updated HTML: Hello! How can I assist you today? 😊
Generated response: Hello! How can I help you today? 😊
Updated HTML: Hello! How can I help you today? 😊
Generated response: Hello! How can I assist you today? 😊
Updated HTML: Hello! How can I assist you today? 😊


KeyboardInterrupt: 

In [5]:
!pip install openai==0.28



In [8]:
from openai import OpenAI
from dotenv import load_dotenv  
from Utilites import *
from runbook import *
import pandas as pd
load_dotenv() 
def call_openai_api(text):
    client = OpenAI(api_key=OPENAI_API_KEY)
    
    prompt = """Extract all sections and subsections from the provided text and structure them as follows:\nsection,subsections\n1,[1.1#1.2]\n1.1,[1.1.1#1.1.2]\n1.1.1,[]\n1.2,[1.2.1]\n...\nRules:\n1. Break down all sections into subsections until the last leaf node.\n2. A leaf node is a section or subsection with no further subdivisions.\n3. For each section, include its immediate child subsections as a list.\n4. Use empty brackets [] for sections with no subsections.\n NO PREAMBLE"""
    
    try:
        response = client.chat.completions.create(
            model="chatgpt-4o-latest",
            messages=[
                {"role": "system", "content": "You are an AI assistant that analyzes and organizes legal documents."},
                {"role": "user", "content": prompt},
                {"role": "user", "content": f"Text to analyze:\n{text}"},
            ],
            temperature=0,
            top_p=0.95,
            max_tokens=4000
        )
        
        return response.choices[0].message.content.strip()
    except Exception as e:
        raise Exception(f"OpenAI API request failed: {str(e)}")

call_openai_api("abc")


'The provided text does not contain any sections or subsections to analyze. Please provide a structured legal document or text with identifiable sections and subsections for extraction.'

In [17]:
def translate_text(input_string: str, target_language: str) -> str:
    """
    Use Perplexity to detect language and translate if needed.
    Returns original text if source and target languages match.
    """
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    url = "https://api.perplexity.ai/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # First, detect the language
    detect_payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "You are a language detector. Respond with only the language . one word output. No additional text. without any special charaters including fullstop."
            },
            {
                "role": "user",
                "content": f"What language is this text in: {input_string}"
            }
        ]
    }

    try:
        detect_response = requests.post(url, json=detect_payload, headers=headers)
        detect_response.raise_for_status()
        source_language = detect_response.json()['choices'][0]['message']['content'].strip().lower()

        # If source and target languages match, return original
        if source_language == target_language.lower():
            return input_string

        # If languages differ, translate
        translate_payload = {
            "model": "llama-3.1-sonar-huge-128k-online",
            "messages": [
                {
                    "role": "system",
                    "content": (
                        f"You are a translator. Translate the following text to {target_language}. "
                        "Provide ONLY the direct translation of the exact text. "
                        "Do not add any explanations or additional content. "
                        "Do not expand or elaborate on the original text."
                    )
                },
                {
                    "role": "user",
                    "content": f"Translate this text exactly as is: {input_string}"
                }
            ]
        }

        translate_response = requests.post(url, json=translate_payload, headers=headers)
        translate_response.raise_for_status()
        return translate_response.json()['choices'][0]['message']['content'].strip()

    except Exception as e:
        print(f"Error: {e}")
        return input_string

# Example usage
if __name__ == "__main__":
    text = "7 ways to create Mind Body Connection for Holistic Wellness"
    result = translate_text(text, "English")
    print(f"Original: {text}")
    print(f"Translated: {result}")

english english
Original: 7 ways to create Mind Body Connection for Holistic Wellness
Translated: 7 ways to create Mind Body Connection for Holistic Wellness


In [64]:
def clean_translation(text: str) -> str:
    """Clean the translation output by removing unwanted patterns."""
    # Remove text within square brackets
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove any "Translation:" or similar prefixes
    text = re.sub(r'^(Translation:|Translated text:|Here\'s the translation:)\s*', '', text, flags=re.IGNORECASE)
    
    # Remove quotes if they wrap the entire text
    text = re.sub(r'^["\'](.*)["\']$', r'\1', text)
    
    # Remove any trailing or leading periods if they're not part of the original text
    text = text.strip('.')
    
    # Remove any extra spaces
    text = ' '.join(text.split())
    
    return text


def translate_text(input_string: str, target_language: str) -> str:
    """Translate text using Perplexity AI."""
    import os
    
    # Get API key from environment variable
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    # API endpoint
    url = "https://api.perplexity.ai/chat/completions"

    # Headers with API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Modified prompt to get cleaner responses
    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": (
                    f"You are a translator to {target_language}. Provide direct translations "
                    "without explanations, citations, or additional formatting. Return only "
                    "the translated text."
                )
            },
            {
                "role": "user",
                "content": f"Translate this text: {input_string}"
            }
        ]
    }

    try:
        # Make the API request
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Extract and clean the translated text
        result = response.json()
        translated_text = result['choices'][0]['message']['content'].strip()
        cleaned_text = clean_translation(translated_text)
        
        return cleaned_text

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the Perplexity API: {e}")
        return input_string
    except KeyError as e:
        print(f"Unexpected response structure: {e}")
        return input_string
    
translate_text("Art Therapy Exercises|Creative Expression|Emotional Healing|Stress Relief|Mandala Coloring|Therapeutic Art","spanish")

'Ejercicios de terapia de arte | Expresión creativa | Sanación emocional | Alivio del estrés | Coloreado de mandalas | Arte terapéutico'

In [46]:


course_df = pd.read_csv('course_data_full.csv')

# title = beautify(topic['title'], language)
# subtitle = beautify(topic['subtitle'], language)
course_code = 'AT'
language='English'

course_dict = course_df.loc[
    (course_df.course_code == course_code) & 
    (course_df.language == language)
].iloc[0].to_dict()

website = course_dict['website']
course_name = course_dict['course_name']
course_link = course_dict['course_link']

results_blog_dict = get_random_values_from_string(course_dict['result_blog_dict'], 4)
results_blog_dict

{'Therapeutic Arts For Healing': 'https://scholistico.com/therapeutic-arts-for-healing/',
 'How To Price Non Clinical Art Therapy Services For Beginners': 'https://scholistico.com/how-to-price-non-clinical-art-therapy-services-for-beginners/',
 '10 Art Therapy Exercises For Trauma Recovery': 'https://scholistico.com/10-art-therapy-exercises-for-trauma-recovery/',
 'Art Therapy Activities Stress Reducation': 'https://scholistico.com/art-therapy-activities-stress-reducation/'}

In [4]:
!pip install openai==1.0.0

Collecting openai==1.0.0
  Downloading openai-1.0.0-py3-none-any.whl.metadata (16 kB)
Collecting anyio<4,>=3.5.0 (from openai==1.0.0)
  Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Downloading openai-1.0.0-py3-none-any.whl (154 kB)
Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
Installing collected packages: anyio, openai
  Attempting uninstall: anyio
    Found existing installation: anyio 4.2.0
    Uninstalling anyio-4.2.0:
      Successfully uninstalled anyio-4.2.0
  Attempting uninstall: openai
    Found existing installation: openai 1.58.1
    Uninstalling openai-1.58.1:
      Successfully uninstalled openai-1.58.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.2.10 requires openai<2.0.0,>=1.54.0, but you have openai 1.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed anyio-3.7.1 openai-1.0.0


In [77]:


# Example usage
if __name__ == "__main__":
    # Example blog titles
    test_titles = [
        "The Future of Artificial Intelligence in Healthcare",
        "10 Essential Tips for Remote Work Success",
        "Understanding Climate Change: A Comprehensive Guide"
    ]
    
    for title in test_titles:
        subtitle = generate_subtitle(title)
        print(f"\nTitle: {title}")
        print(f"Subtitle: {subtitle}")


Title: The Future of Artificial Intelligence in Healthcare
Subtitle: Transforming Patient Care and Medical Research with Advanced AI Solutions

Title: 10 Essential Tips for Remote Work Success
Subtitle: Mastering the Art of Working from Anywhere

Title: Understanding Climate Change: A Comprehensive Guide
Subtitle: Causes, Effects, and Solutions for a Sustainable Future


In [19]:
translate_text("Loose Fat At Home","Spanish")

'Loose Fat At Home'

In [7]:

title="5 köstliche \nRezepte, die Sie\n diesen Winter\n warmhalten"


subtitle="Rezepte, \ndie Sie wärmen"

blog_identifier='985e7b40'

json_path = f'Blogs/Organization/json/{blog_identifier}.json'
with open(json_path, "r") as file:
        data = json.load(file)

Content_id = list(data.keys())[0]

input_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'
output_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}-header2.png'

create_simple_overlay(input_path, output_path, title, subtitle)

In [38]:
def get_user_id_by_username(username):
    # Construct the proper WordPress REST API endpoint
    api_url = "https://scholistico.com/wp-json/wp/v2/users"
    
    # Add parameters for search
    params = {
        'search': username,
        'per_page': 1
    }
    
    headers = {
        'Accept': 'application/json'
    }
    
    try:
        response = requests.get(
            api_url,
            params=params,
            headers=headers
        )
        
        print(f"URL being called: {response.url}")
        print(f"Status Code: {response.status_code}")
        
        if response.status_code == 200:
            users = response.json()
            if users:
                return users[0]['id']
            else:
                print("No users found")
        else:
            print(f"Error: Received status code {response.status_code}")
            print(f"Response Text: {response.text[:200]}...")
            
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"JSON parsing error: {e}")
        
    return 1
get_user_id_by_username("Scholistico")

URL being called: https://scholistico.com
Status Code: 200
Request error: Expecting value: line 1 column 1 (char 0)


1

'6dL5 IAAr g1jr ta7K buMM VcMM'

In [23]:
generate_subtitle("8 Natürliche Heilmittel Für Gelenkschmerzen","German")

'Natürliche Linderung für schmerzende Gelenke.'

In [22]:


import json
import os

# Create a dictionary mapping titles to their identifiers for each language
translations = {
    # Spanish translations
    "e40beaac": {
        "title": beautify("8 Natural Approaches to Managing Chronic Pain", "German"),
        "subtitle": beautify(generate_subtitle("8 Natural Approaches to Managing Chronic Pain", "German"), "German")
    }
}

def process_blog_images(blog_identifier):
    try:
        # Get the translation data for this identifier
        if blog_identifier not in translations:
            print(f"No translation found for identifier: {blog_identifier}")
            return
        
        translation = translations[blog_identifier]

        
        json_path = f'Blogs/Organization/json/{blog_identifier}.json'
        with open(json_path, "r") as file:
            data = json.load(file)

        Content_id = list(data.values())[0].split('/')[-1].split('.')[0]

        # Process images
        input_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}.png'
        output_path = f'Blogs/Organization/images/{blog_identifier}/{Content_id}-header.png'
        
        # Create the overlay
        create_simple_overlay(
            input_path, 
            output_path, 
            translation["title"].replace("\"",""), 
            translation["subtitle"].replace("\"","")
        )
        
        print(f"Successfully processed {blog_identifier}")
        
    except Exception as e:
        print(f"Error processing {blog_identifier}: {str(e)}")

# List of all blog identifiers
blog_identifiers = [
    "e40beaac"
]

# Process all blogs
def process_all_blogs():
    for identifier in blog_identifiers:
        print(f"\nProcessing {identifier}...")
        process_blog_images(identifier)

# Run the processing
if __name__ == "__main__":
    # Ensure the create_simple_overlay function is imported
    try:
        process_all_blogs()
    except Exception as e:
        print(f"Error in main execution: {str(e)}")


Processing e40beaac...
Successfully processed e40beaac


In [52]:
beautify("Moon is wife of sun","Spanish")

'La Luna Es La\nEsposa Del Sol.'

In [32]:
import requests
from requests.auth import HTTPBasicAuth

# WordPress site details
site_url = "https://scholistico.com/wp-json/wp/v2"
username = "rishabh.deepak.shukla@gmail.com"
password = "gUVF aLy4 3c1j 63gP jEKS rykg"

# Post details
post_title = "Your Blog Post Title"
html_content = "<p>This is your blog content with HTML formatting.</p>"
post_slug = "your-blog-post-slug"
tags = ["Python", "Automation", "WordPress"]  # List of tags
categories = ["Programming", "Tutorials"]    # List of categories
rank_math_meta = {
    "rank_math_title": "Your Rank Math Optimized Title",
    "rank_math_focus_keyword": "keyword1, keyword2",
    "rank_math_permalink": post_slug
}

# Upload an image and get its media ID
def upload_image(image_path):
    headers = {
        "Authorization": f"Basic {username}:{password}"
    }
    with open(image_path, "rb") as img_file:
        files = {"file": img_file}
        response = requests.post(
            f"{site_url}/media",
            auth=HTTPBasicAuth(username, password),
            files=files
        )
    if response.status_code == 201:
        return response.json().get("id")
    else:
        print("Failed to upload image:", response.json())
        return None

# Fetch tag IDs or create them
def get_or_create_tags(tags_list):
    tag_ids = []
    for tag in tags_list:
        response = requests.get(
            f"{site_url}/tags?search={tag}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            tag_ids.append(response.json()[0]["id"])
        else:
            # Create the tag if it doesn't exist
            response = requests.post(
                f"{site_url}/tags",
                auth=HTTPBasicAuth(username, password),
                json={"name": tag}
            )
            if response.status_code == 201:
                tag_ids.append(response.json()["id"])
    return tag_ids

# Fetch category IDs or create them
def get_or_create_categories(categories_list):
    category_ids = []
    for category in categories_list:
        response = requests.get(
            f"{site_url}/categories?search={category}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            category_ids.append(response.json()[0]["id"])
        else:
            # Create the category if it doesn't exist
            response = requests.post(
                f"{site_url}/categories",
                auth=HTTPBasicAuth(username, password),
                json={"name": category}
            )
            if response.status_code == 201:
                category_ids.append(response.json()["id"])
    return category_ids

# Main function to create a post
def create_post():
    # Upload the featured image (optional)
    image_path = "Blogs/Organization/images/ffeeaadf/content-8b074aac.png"
    featured_image_id = upload_image(image_path)

    # Prepare the post data
    post_data = {
        "title": post_title,
        "content": html_content,
        "slug": post_slug,
        "status": "draft",
        "tags": get_or_create_tags(tags),
        "categories": get_or_create_categories(categories),
        "meta": rank_math_meta,  # Rank Math SEO metadata
    }

    if featured_image_id:
        post_data["featured_media"] = featured_image_id

    # Create the post
    response = requests.post(
        f"{site_url}/posts",
        auth=HTTPBasicAuth(username, password),
        json=post_data
    )

    if response.status_code == 201:
        print("Post created successfully!")
        print("Post ID:", response.json().get("id"))
    else:
        print("Failed to create post:", response.json())

# Run the script
create_post()


Post created successfully!
Post ID: 158587


In [65]:
course_df = pd.read_csv('course_data_full.csv')

In [39]:
course_df['tags']=

0                              Art Therapy Practitioner
1                       Kunstzinnige Therapie Opleiding
2                    Corso Di Formazione In Arteterapia
3                           Ausbildung In Kunsttherapie
4           Curso Certificacion Practicante Arteterapia
5              Naturopathy Practitioner Training Course
6                    Natuurgeneeskunde Opleiding Online
7      Corso Di Formazione Per Operatori Di Naturopatia
8                     Ausbildung Zum Naturheilpraktiker
9           Curso Certificacion Practicante Naturopatia
10        Holistic Nutrition Consultant Training Course
11              Holistische Voedingsconsulent Opleiding
12                Certificazione In Nutrizione Olistica
13     Ausbildung Zum Ganzheitlichen Ernaehrungsberater
14    Curso De Certificacion Consultor En Nutricion ...
15    Holistic Health Practitioner Online Training C...
16                    Holistische Gezondheidsdeskundige
17       Certificazione In Operatore Sanitario O

In [42]:
course_df

Unnamed: 0,course_code,language,country_code,website,course_name,course_link,result_blog_dict,results_product_dict,inspiration_blog_dict,course_document_path
0,AT,English,en,scholistico,Art Therapy Practitioner,https://scholistico.com/product/art-therapy-pr...,"{\n ""9-art-therapy-exercises-to-reduce-anxiet...","{\n ""art-therapy-practitioner-training-course...","{\n ""9-art-therapy-exercises-to-reduce-anxiet...",courses/AT
1,AT,Dutch,nl,nl.scholistico,Kunstzinnige Therapie Opleiding,https://nl.scholistico.com/product/kunstzinnig...,"{\n ""kunstzinnige-therapie-oefeningen-voor-au...","{\n ""kunstzinnige-therapie-opleiding"": ""https...","{\n ""kunstzinnige-therapie-oefeningen-voor-au...",courses/AT
2,AT,Italian,it,it.scholistico,Corso Di Formazione In Arteterapia,https://it.scholistico.com/prodotto/corso-di-f...,"{\n ""5-benefici-dellarteterapia-per-gli-adult...","{\n ""corso-di-formazione-in-arteterapia"": ""ht...","{\n ""5-benefici-dellarteterapia-per-gli-adult...",courses/AT
3,AT,German,de,de.scholistico,Ausbildung In Kunsttherapie,https://de.scholistico.com/produkt/ausbildung-...,"{\n ""7-kunsttherapieuebungen-fuer-menschen-mi...","{\n ""ausbildung-in-kunsttherapie"": ""https://d...","{\n ""7-kunsttherapieuebungen-fuer-menschen-mi...",courses/AT
4,AT,Spanish,es,es.scholistico,Curso Certificacion Practicante Arteterapia,https://es.scholistico.com/producto/curso-cert...,"{\n ""actividades-arteterapia-para-ansiedad"": ...","{\n ""curso-certificacion-practicante-artetera...","{\n ""actividades-arteterapia-para-ansiedad"": ...",courses/AT
5,NAT,English,en,scholistico,Naturopathy Practitioner Training Course,https://scholistico.com/product/naturopathy-pr...,"{\n ""8-herbal-remedies-for-dry-skin"": ""https:...","{\n ""naturopathy-practitioner-training-course...","{\n ""8-herbal-remedies-for-dry-skin"": ""https:...",courses/NAT
6,NAT,Dutch,nl,nl.scholistico,Natuurgeneeskunde Opleiding Online,https://nl.scholistico.com/product/natuurgenee...,"{\n ""4-natuurlijke-huismiddeltjes"": ""https://...","{\n ""natuurgeneeskunde-opleiding-online"": ""ht...","{\n ""4-natuurlijke-huismiddeltjes"": ""https://...",courses/NAT
7,NAT,Italian,it,it.scholistico,Corso Di Formazione Per Operatori Di Naturopatia,https://it.scholistico.com/prodotto/corso-di-f...,"{\n ""naturopatia-principi-pratiche-e-benefici...","{\n ""corso-di-formazione-per-operatori-di-nat...","{\n ""naturopatia-principi-pratiche-e-benefici...",courses/NAT
8,NAT,German,de,de.scholistico,Ausbildung Zum Naturheilpraktiker,https://de.scholistico.com/produkt/ausbildung-...,"{\n ""naturheilpraktiker-7-gruende-warum-jetzt...","{\n ""ausbildung-zum-naturheilpraktiker"": ""htt...","{\n ""naturheilpraktiker-7-gruende-warum-jetzt...",courses/NAT
9,NAT,Spanish,es,es.scholistico,Curso Certificacion Practicante Naturopatia,https://es.scholistico.com/producto/curso-cert...,"{\n ""guia-naturopatia-principiantes"": ""https:...","{\n ""curso-certificacion-practicante-naturopa...","{\n ""guia-naturopatia-principiantes"": ""https:...",courses/NAT


In [40]:
course_df.to_csv('course_data_full.csv',index=False)

In [57]:
import requests
from requests.auth import HTTPBasicAuth

# WordPress site details
site_url = "https://scholistico.com/wp-json/wp/v2"
username = "rishabh.deepak.shukla@gmail.com"
password = "gUVF aLy4 3c1j 63gP jEKS rykg"

# Post details
post_title = "Your Blog Post Title"
html_content = "<p>This is your blog content with HTML formatting.</p>"
post_slug = "your-blog-post-slug"
tags = ["Python", "Automation", "WordPress"]  # List of tags
categories = ["Programming", "Tutorials"]    # List of categories
rank_math_meta = {
    "rank_math_title": "Your Rank Math Optimized Title",
    "rank_math_focus_keyword": "keyword1, keyword2",
    "rank_math_permalink": post_slug
}

# Upload an image and get its media ID
def upload_image(image_path):
    headers = {
        "Authorization": f"Basic {username}:{password}"
    }
    with open(image_path, "rb") as img_file:
        files = {"file": img_file}
        response = requests.post(
            f"{site_url}/media",
            auth=HTTPBasicAuth(username, password),
            files=files
        )
    if response.status_code == 201:
        return response.json().get("id")
    else:
        print("Failed to upload image:", response.json())
        return None

# Fetch tag IDs or create them
def get_or_create_tags(tags_list):
    tag_ids = []
    for tag in tags_list:
        response = requests.get(
            f"{site_url}/tags?search={tag}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            tag_ids.append(response.json()[0]["id"])
        else:
            # Create the tag if it doesn't exist
            response = requests.post(
                f"{site_url}/tags",
                auth=HTTPBasicAuth(username, password),
                json={"name": tag}
            )
            if response.status_code == 201:
                tag_ids.append(response.json()["id"])
    return tag_ids

# Fetch category IDs or create them
def get_or_create_categories(categories_list):
    category_ids = []
    for category in categories_list:
        response = requests.get(
            f"{site_url}/categories?search={category}",
            auth=HTTPBasicAuth(username, password)
        )
        if response.status_code == 200 and response.json():
            category_ids.append(response.json()[0]["id"])
        else:
            # Create the category if it doesn't exist
            response = requests.post(
                f"{site_url}/categories",
                auth=HTTPBasicAuth(username, password),
                json={"name": category}
            )
            if response.status_code == 201:
                category_ids.append(response.json()["id"])
    return category_ids

# Main function to create a post
def create_post():
    # Upload the featured image (optional)
    image_path = "Blogs/Organization/images/ffeeaadf/content-8b074aac.png"
    featured_image_id = upload_image(image_path)

    # Prepare the post data
    post_data = {
        "title": post_title,
        "content": html_content,
        "slug": post_slug,
        "status": "draft",
        "tags": get_or_create_tags(tags),
        "categories": get_or_create_categories(categories),
        "meta": rank_math_meta,  # Rank Math SEO metadata
    }

    if featured_image_id:
        post_data["featured_media"] = featured_image_id

    # Create the post
    response = requests.post(
        f"{site_url}/posts",
        auth=HTTPBasicAuth(username, password),
        json=post_data
    )

    if response.status_code == 201:
        print("Post created successfully!")
        print("Post ID:", response.json().get("id"))
    else:
        print("Failed to create post:", response.json())

# Run the script
create_post()


Post created successfully!
Post ID: 158691


In [32]:

html_snippet = extract_post_details("Blogs/Organization/html/final_output/3f5ed71c.html")
print(html_snippet)


{'title': '7 Ways To Create Mind-Body Connection For Holistic Wellness', 'slug': '7-ways-to-create-mind-body-connection-for-holistic-wellness', 'meta': 'Discover 7 effective ways to create a mind-body connection for holistic wellness. Learn practical techniques to improve your overall health and well-being.', 'content': '<!-- wp:heading -->\n\n<h2 class="wp-block-heading" content_id="content-81593600">Table of Contents</h2>\n\n<!-- /wp:heading -->\n\n<!-- wp:list -->\n\n<ul class="wp-block-list" content_id="content-82735df5">\n\n<!-- wp:list-item -->\n\n<li content_id="content-d2d72d52">\n\n<a content_id="content-1183a593" href="#introduction">Introduction</a>\n\n</li>\n\n<!-- /wp:list-item -->\n\n<!-- wp:list-item -->\n\n<li content_id="content-b010e6c9">\n\n<a content_id="content-b78bb972" href="#mindful-meditation">Mindful Meditation</a>\n\n</li>\n\n<!-- /wp:list-item -->\n\n<!-- wp:list-item -->\n\n<li content_id="content-0165b9e2">\n\n<a content_id="content-0c1b8941" href="#yoga-p

In [66]:
def clean_translation(text: str) -> str:
    """Clean the translation output by removing unwanted patterns."""
    # Remove text within square brackets
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove any "Translation:" or similar prefixes
    text = re.sub(r'^(Translation:|Translated text:|Here\'s the translation:)\s*', '', text, flags=re.IGNORECASE)
    
    # Remove quotes if they wrap the entire text
    text = re.sub(r'^["\'](.*)["\']$', r'\1', text)
    
    # Remove any trailing or leading periods if they're not part of the original text
    text = text.strip('.')
    
    # Remove any extra spaces
    text = ' '.join(text.split())
    
    return text


def translate_text1(input_string: str, target_language: str) -> str:
    """Translate text using Perplexity AI."""
    import os
    
    # Get API key from environment variable
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    # API endpoint
    url = "https://api.perplexity.ai/chat/completions"

    # Headers with API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Modified prompt to get cleaner responses
    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": (
                    f"You are a translator to {target_language}. Provide direct translations "
                    "without explanations, citations, or additional formatting. Return only "
                    "the translated text."
                )
            },
            {
                "role": "user",
                "content": f"Translate this text: {input_string}"
            }
        ]
    }

    try:
        # Make the API request
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Extract and clean the translated text
        result = response.json()
        translated_text = result['choices'][0]['message']['content'].strip()
        cleaned_text = clean_translation(translated_text)
        
        return cleaned_text

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the Perplexity API: {e}")
        return input_string
    except KeyError as e:
        print(f"Unexpected response structure: {e}")
        return input_string

def process_dataframe(df):
    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        language = row['language']
        
        # Check if language is not English
        if language.lower() != 'english':
            # Process Categories
            if pd.notna(row['Category']):  # Check if Category is not NaN
                categories = row['Category'].split('|')
                translated_categories = []
                
                # Translate each category
                for category in categories:
                    translated_category = translate_text1(category.strip(), language)
                    translated_categories.append(translated_category)
                
                # Update the DataFrame with translated categories
                df.at[index, 'Category'] = '|'.join(translated_categories)
            
            # Process Tags
            if pd.notna(row['Tags']):  # Check if Tags is not NaN
                tags = row['Tags'].split('|')
                translated_tags = []
                
                # Translate each tag
                for tag in tags:
                    translated_tag = translate_text1(tag.strip(), language)
                    translated_tags.append(translated_tag)
                
                # Update the DataFrame with translated tags
                df.at[index, 'Tags'] = '|'.join(translated_tags)
    
    return df
updated_course_df=process_dataframe(course_df)

In [68]:
updated_course_df.Category=updated_course_df.Category.str.replace('\n','')
updated_course_df.Tags=updated_course_df.Tags.str.replace('\n','')

In [70]:
updated_course_df.to_csv('course_data_full.csv',index=False)

In [85]:
import requests
import os
import random

def find_relevant_tags(blog_title: str, tag_list: list) -> list:
    """Find 2-3 most relevant tags from a randomly sorted list for a given blog title."""
    
    # Create a copy of the tag list and shuffle it
    shuffled_tags = tag_list.copy()
    random.shuffle(shuffled_tags)
    
    # Get API key from environment variable
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise ValueError("PERPLEXITY_API_KEY environment variable not set")

    # API endpoint
    url = "https://api.perplexity.ai/chat/completions"

    # Headers with API key
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Create the prompt with shuffled tags
    prompt = f"""Given the blog title: "{blog_title}"
    And this list of tags: {', '.join(shuffled_tags)}
    Return only 2-3 most relevant tags that best match the blog title's topic.
    Provide the response as a comma-separated list without explanations."""

    # Payload for the API
    payload = {
        "model": "llama-3.1-sonar-huge-128k-online",
        "messages": [
            {
                "role": "system",
                "content": "You are a content categorization expert. Provide direct, concise responses with only the most relevant tags."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    try:
        # Make the API request
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()

        # Extract the suggested tags
        result = response.json()
        suggested_tags = result['choices'][0]['message']['content'].strip()
        
        # Convert the comma-separated string to list and clean up
        tag_recommendations = [tag.strip() for tag in suggested_tags.split(',')]
        
        # Ensure we return only tags that exist in the original tag_list
        final_tags = [tag for tag in tag_recommendations if tag in tag_list][:3]
        
        return final_tags

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the Perplexity API: {e}")
        return []
    except KeyError as e:
        print(f"Unexpected response structure: {e}")
        return []

# Example usage:
if __name__ == "__main__":
    # Set random seed for reproducibility (optional)
    random.seed()
    
    blog_title = "10 Essential Python Tips for Data Science"
    available_tags = [

        "programming", 
        "machine-learning", 
        "web-development",
        "artificial-intelligence",
        "database",
        "cloud-computing",
                "python", 
        "data-science", 
    ]
    
    # Print original order
    print("Original tag list:")
    print(available_tags)
    
    # Get relevant tags
    relevant_tags = find_relevant_tags(blog_title, available_tags)
    
    print(f"\nSuggested tags for '{blog_title}':")
    print(relevant_tags)
    
    # You can call it multiple times to see different random orders
    print("\nTrying again with different random order:")
    relevant_tags_2 = find_relevant_tags(blog_title, available_tags)
    print(relevant_tags_2)

Original tag list:
['programming', 'machine-learning', 'web-development', 'artificial-intelligence', 'database', 'cloud-computing', 'python', 'data-science']

Suggested tags for '10 Essential Python Tips for Data Science':
['python', 'data-science', 'machine-learning']

Trying again with different random order:
['python', 'data-science', 'machine-learning']


In [3]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=015a3ffaf41b13626551c6491e0e29989f3f91eb0ae4e4f4f213a095ac6f523e
  Stored in directory: /Users/rishabhshukla/Library/Caches/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:
def optimize_prompt(prompt, context):
    # Construct the input for Claude
    input_text = f"""You are tasked with optimizing a prompt for DALLE image generation to create proper images for a blog post. Your goal is to refine the given prompt while considering the context of a previous image.

                    First, you will be provided with two inputs:
                    <prompt>{prompt}</prompt>
                    This is the initial prompt for DALLE image generation.

                    <content>"{context}"</content>
                    This represents the context from the previous image.

                    Your Key Focus should be of generating prompts for realistic images

                    To optimize the prompt for DALLE, follow these guidelines:

                    1. Analyze the given prompt and the previous image context.
                    2. Don't make the image too similar to the previous image.
                    3. Ensure the prompt is clear, specific, and descriptive.
                    4. Include relevant details from the previous image context to maintain consistency.
                    5. Use vivid and precise language to describe the desired image.
                    6. Incorporate artistic elements like style, mood, and composition if appropriate.
                    7. Avoid any potential copyright issues or explicit content.

                    Remember that DALLE has a token limit of 1000. To handle this:
                    - Keep your optimized prompt concise and within the token limit.
                    - Prioritize the most important elements of the image description.
                    - Remove any unnecessary words or repetitive information.

                    Provide your optimized prompt within <optimized_prompt> tags. After the optimized prompt, include a brief explanation of your changes and reasoning within <explanation> tags.

                    Your response should follow this format:

                    <optimized_prompt>
                    [Your optimized prompt here]
                    </optimized_prompt>

                    """

    # Call the Anthropic API
    # response = client.message.create(
    # model="claude-3-5-sonnet-20240620",
    # prompt=f"You are a prompt engineer who generates prompts for realistic image generation.\n\nUser: {input_text}",
    # max_tokens_to_sample=4000,
    # temperature=0.7
    # )

    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=4000,
        temperature=0.7,
        system="You are a prompt engineer who generates ONLY realistic Images. Images as close to real and human as possible",
        messages=[
            {
                "role": "user",
                "content": input_text
            }
        ]
    )

    # Extract the optimized prompt and explanation from the response
    full_response = response.content[0].text
    optimized_prompt = full_response.split('<optimized_prompt>')[1].split('</optimized_prompt>')[0].strip()
    # explanation = full_response.split('<explanation>')[1].split('</explanation>')[0].strip()

    return optimized_prompt



prompt="some prompt"

import fal_client

def on_queue_update(update):
    if isinstance(update, fal_client.InProgress):
        for log in update.logs:
           print(log["message"])

result = fal_client.subscribe(
    "fal-ai/flux-pro/v1.1",
    arguments={
        "prompt": f"""
          {prompt}. Ultra Realisitic images
        """
    },
    with_logs=True,
    on_queue_update=on_queue_update,
)
print(result['images'][0]['url'])



https://fal.media/files/tiger/AjbVSwar9_GOYUKEqLHQz_5a91005725884e05801596d930bbf336.jpg
