In [6]:
import os
import pandas as pd
import google.generativeai as genai

# Gemini API key.
# The key is read from the environment instead of being written into the
# notebook: a literal key is leaked to everyone who can read this file, and any
# key that was ever committed here should be revoked and replaced.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=_api_key)

# Generation settings handed to the model below.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Shared model handle used by analyze_newspaper_images().
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

def upload_to_gemini(path, mime_type="image/jpeg"):
    """Send one local file to the Gemini API and report where it landed.

    Args:
        path: Location of the file on disk.
        mime_type: MIME type passed along with the upload (JPEG by default).

    Returns:
        The ``uri`` attribute of the object returned by ``genai.upload_file``.
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    uri = uploaded.uri
    print(f"Uploaded file '{uploaded.display_name}' as: {uri}")
    return uri

def analyze_newspaper_images(image_folder, page_numbers=range(1, 21)):
    """Upload newspaper page images and ask the model for categorized text.

    Args:
        image_folder: Directory holding the images, named ``page_<n>.jpg``.
        page_numbers: Iterable of page numbers to process. Defaults to pages
            1 through 20.

    Returns:
        The text of the model's reply.
    """
    image_paths = [
        os.path.join(image_folder, f"page_{number}.jpg") for number in page_numbers
    ]
    uploaded_uris = [upload_to_gemini(path) for path in image_paths]

    # The model only sees what is part of the message. Each uploaded file is
    # therefore attached as a file_data part; previously the URIs were collected
    # but never sent, so the model answered without seeing any image.
    image_parts = [
        {"file_data": {"mime_type": "image/jpeg", "file_uri": uri}}
        for uri in uploaded_uris
    ]

    # Every fragment ends with an explicit newline so that adjacent literals do
    # not run together, and the Sports field list uses "Match Outcome" so it
    # matches the subcategory name the parser looks for.
    prompt = (
        "You’re an advanced information analyst with extensive experience in extracting and analyzing text data from images of newspapers. Your expertise lies in categorizing information effectively to ensure comprehensive, organized insights across various fields, making it easier for analysis and reporting.\n"
        "Your task is to extract and organize information from the attached newspaper images.\n"
        "Categories to Organize Under:\n"
        "1. Sports: Player Names, Venue, Team/Clubs, Level of Play, Match Date, Match Outcome, Player Performance Highlights\n"
        "2. Politics: Politician Names, Political Party, Event Location, Policy Changes, Election Results\n"
        "3. Entertainment: Celebrity Names, Movie/Show Titles, Award Nominations, Event Location, Genre\n"
        "4. Business: Company Names, Financial Reports, Stock Market Trends, Industry Sector, Economic Indicators\n"
        "5. Technology: Tech Company Names, Product Launches, Innovation Highlights, Tech Trends, User Reviews\n"
        "6. Health: Disease/Condition, Health Tips, Research Findings, Medical Guidelines, Health Policy Changes\n"
        "7. World News: Country Names, Global Events, Humanitarian Issues, Diplomatic Relations, Economic Impact\n"
        "Please ensure that the extracted information is well-organized and categorized according to the specified categories and is formatted in a way that can be easily converted to XLSX or CSV format."
    )

    # Initialize chat session
    chat_session = model.start_chat(history=[])

    # Send the instructions together with the uploaded images
    response = chat_session.send_message([prompt, *image_parts])

    print("Extracted Text:\n", response.text)  # Print the extracted content for debugging
    return response.text

def parse_and_categorize_text(extracted_text):
    """Sort lines of model output into news categories and subcategories.

    A line selects the current category when one of its fields (split on
    ``|``, ``,``, ``:`` or tab) is a category name once list numbering and
    markdown decoration are removed, e.g. ``**1. Sports**``, ``Sports:`` or a
    table row ``| Sports | Venue | ... |``. A category word that merely occurs
    inside a sentence no longer switches the category.

    Within the current category, a line is stored (whole, stripped) under every
    subcategory whose label it contains; exact duplicates are skipped.

    Args:
        extracted_text: Raw text returned by the model. ``None`` or an empty
            string yields the empty structure.

    Returns:
        dict mapping category name -> dict mapping subcategory name -> list of
        matching lines.
    """
    import re  # local import keeps this block self-contained

    categories = {
        "Sports": {
            "Player Names": [],
            "Venue": [],
            "Team/Clubs": [],
            "Level of Play": [],
            "Match Date": [],
            "Match Outcome": [],
            "Player Performance Highlights": [],
        },
        "Politics": {
            "Politician Names": [],
            "Political Party": [],
            "Event Location": [],
            "Policy Changes": [],
            "Election Results": [],
        },
        "Entertainment": {
            "Celebrity Names": [],
            "Movie/Show Titles": [],
            "Award Nominations": [],
            "Event Location": [],
            "Genre": [],
        },
        "Business": {
            "Company Names": [],
            "Financial Reports": [],
            "Stock Market Trends": [],
            "Industry Sector": [],
            "Economic Indicators": [],
        },
        "Technology": {
            "Tech Company Names": [],
            "Product Launches": [],
            "Innovation Highlights": [],
            "Tech Trends": [],
            "User Reviews": [],
        },
        "Health": {
            "Disease/Condition": [],
            "Health Tips": [],
            "Research Findings": [],
            "Medical Guidelines": [],
            "Health Policy Changes": [],
        },
        "World News": {
            "Country Names": [],
            "Global Events": [],
            "Humanitarian Issues": [],
            "Diplomatic Relations": [],
            "Economic Impact": [],
        },
    }

    if not extracted_text:
        return categories

    category_by_key = {name.casefold(): name for name in categories}

    def named_category(line):
        """Return the category named by one of the line's fields, else None."""
        for field in re.split(r"[|,:\t]", line):
            # Drop markdown/bullet decoration, then list numbering ("1.", "2)").
            key = field.strip().strip("#*_->\u2022 \t")
            key = key.lstrip("0123456789").lstrip(".) \t")
            key = key.strip("*_ \t").casefold()
            if key in category_by_key:
                return category_by_key[key]
        return None

    current_category = None
    for raw_line in extracted_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Keep the previous category unless this line names a new one.
        current_category = named_category(line) or current_category
        if current_category is None:
            continue

        lowered = line.lower()
        for subcategory, entries in categories[current_category].items():
            if subcategory.lower() in lowered and line not in entries:
                entries.append(line)

    return categories

def save_to_file(categorized_data, output_path, file_format='xlsx'):
    """Write the categorized data to an Excel or CSV file.

    One row is written per non-empty subcategory, with the columns
    ``Category``, ``Subcategory`` and ``Content`` (entries joined by ``"; "``).

    Args:
        categorized_data: Mapping of category -> mapping of subcategory ->
            list of strings.
        output_path: Destination file path.
        file_format: ``'xlsx'`` or ``'csv'``.

    Returns:
        None. When the format is unsupported or there is nothing to write, a
        message is printed and no file is created.
    """
    # Validate first so an unsupported format is never followed by a
    # "Data saved" message for a file that was not written.
    if file_format not in ('xlsx', 'csv'):
        print("Unsupported file format. Please use 'xlsx' or 'csv'.")
        return

    rows = []
    for category, subcategories in categorized_data.items():
        for subcategory, content in subcategories.items():
            if content:  # Only include non-empty content
                rows.append({
                    "Category": category,
                    "Subcategory": subcategory,
                    "Content": "; ".join(content),
                })

    if not rows:
        print("No data to save. Exiting.")
        return

    final_df = pd.DataFrame(rows)

    if file_format == 'xlsx':
        final_df.to_excel(output_path, index=False)
    else:
        final_df.to_csv(output_path, index=False)

    print(f"Data saved to {output_path}")

# Driver for this cell: runs the three pipeline steps in order on a fixed,
# machine-specific folder.

# Folder containing the newspaper images
image_folder = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder"

# Step 1: Upload the page images and query the model; the result is the
# model's reply text
extracted_text = analyze_newspaper_images(image_folder)

# Step 2: Parse the reply into the category -> subcategory -> lines structure
categorized_data = parse_and_categorize_text(extracted_text)

# Step 3: Save categorized data to XLSX or CSV (written next to the images)
output_file_path = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder\test_data_2.xlsx"
save_to_file(categorized_data, output_file_path, file_format='xlsx')  # Change to 'csv' if needed


Uploaded file 'page_1.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/v4yaaricyyzx
Uploaded file 'page_2.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/77n0xera16uw
Uploaded file 'page_3.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/dz5qnfgs6io9
Uploaded file 'page_4.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/m03sbgefsf7
Uploaded file 'page_5.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/py3x2bf49t9b
Uploaded file 'page_6.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/iogr2vpxeitf
Uploaded file 'page_7.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/b1k0ve8sxypq
Uploaded file 'page_8.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/vixsk1qhawes
Uploaded file 'page_9.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/tu5y1kds964e
Uploaded file 'page_10.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/p7x4s7zuaerb
Uploaded f

newer part

In [9]:
import os
import pandas as pd
import google.generativeai as genai

# Gemini API key.
# Taken from the environment so that no credential is stored in the notebook;
# a key that has been pasted into a shared file must be treated as leaked and
# rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=_api_key)

# Generation settings passed to the model constructed below.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Model handle used by analyze_newspaper_images().
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload a single file through ``genai.upload_file`` and return its URI.

    Args:
        path: Path of the local file to upload.
        mime_type: MIME type to declare for the file; defaults to JPEG.

    Returns:
        The ``uri`` attribute of the uploaded-file object.
    """
    remote_file = genai.upload_file(path, mime_type=mime_type)
    message = f"Uploaded file '{remote_file.display_name}' as: {remote_file.uri}"
    print(message)
    return remote_file.uri

def analyze_newspaper_images(image_folder, page_numbers=range(3, 21)):
    """Upload newspaper page images and ask the model for categorized headlines.

    Args:
        image_folder: Directory holding the images, named ``page_<n>.jpg``.
        page_numbers: Iterable of page numbers to process. Defaults to pages
            3 through 20.

    Returns:
        The text of the model's reply.
    """
    # Build the paths page_<n>.jpg for the requested pages
    image_paths = [
        os.path.join(image_folder, f"page_{number}.jpg") for number in page_numbers
    ]

    # Upload each image to Gemini API
    uploaded_uris = [upload_to_gemini(path) for path in image_paths]

    # Attach every uploaded file to the message as a file_data part. The URIs
    # used to be collected but never sent, so the model had no images to read.
    image_parts = [
        {"file_data": {"mime_type": "image/jpeg", "file_uri": uri}}
        for uri in uploaded_uris
    ]

    # Prompt to extract headlines by category. Sentences are separated
    # explicitly so adjacent literals do not fuse into one word.
    prompt = (
        "You are an advanced information analyst tasked with extracting the text of newspaper images, analyzing and extracting only the headlines.\n"
        "The newspaper page images are attached to this message.\n"
        "Please extract the headlines and organize them under the following categories:\n"
        "1. City\n"
        "2. Politics\n"
        "3. Entertainment\n"
        "4. Business\n"
        "5. Technology\n"
        "6. Health\n"
        "7. World News\n"
        "8. Sports\n"
        "Please extract only the relevant headlines under these categories accurately. "
        "Write each category name on its own line, followed by its headlines, one headline per line, "
        "so that the result can easily be converted to xlsx or csv."
    )

    # Initialize chat session
    chat_session = model.start_chat(history=[])

    # Send the instructions together with the uploaded images
    response = chat_session.send_message([prompt, *image_parts])

    print("Extracted Headlines:\n", response.text)  # Print the extracted content for debugging
    return response.text

def parse_and_categorize_headlines(extracted_text):
    """Group headline lines from the model output under news categories.

    A line is a category heading when its leading field (text before the first
    ``|``, ``,``, ``:`` or tab) equals a category name after list numbering and
    markdown decoration are removed, e.g. ``**1. City**`` or ``## Sports:``.
    Text following the category on the same line (``Sports: India win`` or a
    table row ``| Sports | India win |``) is kept as a headline; a bare
    heading is not stored. Every other non-empty line is stored, stripped,
    under the most recent category. Exact duplicates within a category are
    skipped, and lines before the first heading are ignored.

    Args:
        extracted_text: Raw text returned by the model. ``None`` or an empty
            string yields empty lists.

    Returns:
        dict mapping category name -> list of headline strings.
    """
    import re  # local import keeps this block self-contained

    categories = {
        "Sports": [],
        "Politics": [],
        "Entertainment": [],
        "Business": [],
        "Technology": [],
        "Health": [],
        "World News": [],
        "City": [],  # requested by the prompt but previously missing here
    }

    if not extracted_text:
        return categories

    category_by_key = {name.casefold(): name for name in categories}
    current_category = None

    for raw_line in extracted_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Split off the leading field; outer table pipes are dropped first.
        body = line.strip("|").strip()
        head, rest = re.match(r"([^|,:\t]*)[|,:\t]?(.*)", body).groups()

        # Remove markdown/bullet decoration, then list numbering ("1.", "2)").
        key = head.strip().strip("#*_->\u2022 \t")
        key = key.lstrip("0123456789").lstrip(".) \t")
        key = key.strip("*_ \t").casefold()

        if key in category_by_key:
            current_category = category_by_key[key]
            # Only text after the category name can be a headline.
            line = rest.strip("|*_ \t")

        if current_category and line and line not in categories[current_category]:
            categories[current_category].append(line)

    return categories

def save_to_file(categorized_headlines, output_path, file_format='xlsx'):
    """Write the categorized headlines to an Excel or CSV file.

    One row is written per headline, with the columns ``Category`` and
    ``Headline``.

    Args:
        categorized_headlines: Mapping of category -> list of headline strings.
        output_path: Destination file path.
        file_format: ``'xlsx'`` or ``'csv'``.

    Returns:
        None. When the format is unsupported or there are no headlines, a
        message is printed and no file is created.
    """
    # Reject an unknown format up front; otherwise the success message below
    # would be printed although nothing was written.
    if file_format not in ('xlsx', 'csv'):
        print("Unsupported file format. Please use 'xlsx' or 'csv'.")
        return

    rows = []
    for category, headlines in categorized_headlines.items():
        for headline in headlines:
            rows.append({"Category": category, "Headline": headline})

    if not rows:
        print("No data to save. Exiting.")
        return

    final_df = pd.DataFrame(rows)

    if file_format == 'xlsx':
        final_df.to_excel(output_path, index=False)
    else:
        final_df.to_csv(output_path, index=False)

    print(f"Headlines saved to {output_path}")

# Driver for this cell: runs the three pipeline steps in order on a fixed,
# machine-specific folder.

# Folder containing the newspaper images (files named page_<n>.jpg; this cell
# processes page_3.jpg to page_20.jpg)
image_folder = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder"

# Step 1: Upload the page images and query the model; the result is the
# model's reply text
extracted_text = analyze_newspaper_images(image_folder)

# Step 2: Parse the reply into a category -> headlines mapping
categorized_headlines = parse_and_categorize_headlines(extracted_text)

# Step 3: Save categorized headlines to XLSX or CSV (written next to the images)
output_file_path = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder\new_data.xlsx"
save_to_file(categorized_headlines, output_file_path, file_format='xlsx')  # Change to 'csv' if needed


Uploaded file 'page_3.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/nlj5b6wt0ysd
Uploaded file 'page_4.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/b0at9eosvn3s
Uploaded file 'page_5.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/4czwgu5fwjyn
Uploaded file 'page_6.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/uff95p9k626j
Uploaded file 'page_7.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/2u8d4wkszv8u
Uploaded file 'page_8.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/myawabtpgvg7
Uploaded file 'page_9.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/i1eiu7dkjxlp
Uploaded file 'page_10.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/6zhvywtixdck
Uploaded file 'page_11.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/k5xkt56agdps
Uploaded file 'page_12.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/su0ig1gm08yu
Uploade

In [29]:
import os
import pandas as pd
import google.generativeai as genai

# Gemini API key.
# Supplied through the environment rather than as a literal: a key stored in
# the notebook is exposed to every reader of the file and should be rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Gemini API key."
    )
genai.configure(api_key=_api_key)

# Generation settings passed to the model constructed below.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Model handle used by analyze_newspaper_images().
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload one file to the Gemini API on a best-effort basis.

    Args:
        path: Path of the local file to upload.
        mime_type: MIME type to declare for the file; defaults to JPEG.

    Returns:
        The uploaded file's ``uri``, or ``None`` when the upload returns a
        falsy result or raises; in both cases a message is printed.
    """
    try:
        uploaded = genai.upload_file(path, mime_type=mime_type)
        if not uploaded:
            print(f"Failed to upload file '{path}'.")
            return None
        print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
        return uploaded.uri
    except Exception as exc:
        # Any failure is reported and turned into None so callers can skip it.
        print(f"Error uploading file '{path}': {exc}")
    return None

def analyze_newspaper_images(image_folder, page_numbers=range(3, 21)):
    """Upload newspaper page images and ask the model for categorized headlines.

    Args:
        image_folder: Directory holding the images, named ``page_<n>.jpg``.
        page_numbers: Iterable of page numbers to process. Defaults to pages
            3 through 20.

    Returns:
        The text of the model's reply, or ``""`` when no image could be
        uploaded or the request fails (a message is printed in both cases).
    """
    image_paths = [
        os.path.join(image_folder, f"page_{number}.jpg") for number in page_numbers
    ]

    # Upload each image; failed uploads come back as None and are skipped
    uploaded_uris = [uri for uri in map(upload_to_gemini, image_paths) if uri]

    if not uploaded_uris:
        print("No images were successfully uploaded.")
        return ""

    # Attach the uploaded files as file_data parts. Writing the URIs into the
    # prompt text only gives the model a list of links, not the image content.
    image_parts = [
        {"file_data": {"mime_type": "image/jpeg", "file_uri": uri}}
        for uri in uploaded_uris
    ]

    # The category list matches the categories the parser in this cell knows,
    # and the closing sentence spells out the expected layout.
    prompt = (
        "You are an advanced information analyst with extensive experience in extracting and organizing data from various sources, including images and text. Your expertise lies in accurately interpreting information and categorizing it for easy comprehension.\n"
        "Your task is to extract the text of the news headlines from the newspaper images attached to this message.\n\n"
        "Please organize the extracted headlines under the following categories:\n"
        "1. City\n"
        "2. Politics\n"
        "3. Entertainment\n"
        "4. Business\n"
        "5. Technology\n"
        "6. Health\n"
        "7. World News\n"
        "8. Sports\n"
        "It's essential to keep a consistent structure, as the result is converted to xlsx or csv format: "
        "write each category name on its own line, followed by its headlines, one headline per line."
    )

    # Initialize chat session
    chat_session = model.start_chat(history=[])

    try:
        # Send the instructions together with the uploaded images
        response = chat_session.send_message([prompt, *image_parts])
        print("Extracted Headlines:\n", response.text)  # Print the extracted content for debugging
        return response.text
    except Exception as e:
        # Best-effort: report the failure and let the caller skip the later steps.
        print(f"Error during chat session: {e}")
        return ""

def parse_and_categorize_headlines(extracted_text):
    """Group headline lines from the model output under news categories.

    A line is a category heading when its leading field (text before the first
    ``|``, ``,``, ``:`` or tab) equals a category name after list numbering and
    markdown decoration are removed, e.g. ``**1. City**`` or ``## Sports:``.
    A category word that only appears inside a headline ("Electricity ...",
    "Manchester City ...") does not switch the category. Text following the
    category on the same line (``Sports: India win`` or a table row
    ``| Sports | India win |``) is kept as a headline; a bare heading is not
    stored. Every other non-empty line is stored, stripped, under the most
    recent category; lines before the first heading are ignored.

    Args:
        extracted_text: Raw text returned by the model. ``None`` or an empty
            string yields empty lists.

    Returns:
        dict mapping category name -> list of headline strings.
    """
    import re  # local import keeps this block self-contained

    categories = {
        "City": [],
        "Politics": [],
        "Entertainment": [],
        "Business": [],
        "Technology": [],
        "Health": [],
        "World News": [],
        "Sports": [],
    }

    if not extracted_text:
        return categories

    category_by_key = {name.casefold(): name for name in categories}
    current_category = None

    for raw_line in extracted_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Split off the leading field; outer table pipes are dropped first.
        body = line.strip("|").strip()
        head, rest = re.match(r"([^|,:\t]*)[|,:\t]?(.*)", body).groups()

        # Remove markdown/bullet decoration, then list numbering ("1.", "2)").
        key = head.strip().strip("#*_->\u2022 \t")
        key = key.lstrip("0123456789").lstrip(".) \t")
        key = key.strip("*_ \t").casefold()

        if key in category_by_key:
            current_category = category_by_key[key]
            # Only text after the category name can be a headline.
            line = rest.strip("|*_ \t")

        if current_category and line:
            categories[current_category].append(line)

    return categories

def save_to_file(categorized_headlines, output_path, file_format='xlsx'):
    """Save the categorized headlines to an Excel or CSV file.

    One row is written per headline, with the columns ``Category`` and
    ``Headline``.

    Args:
        categorized_headlines: Mapping of category -> list of headline strings.
        output_path: Destination file path.
        file_format: ``'xlsx'`` or ``'csv'``.

    Returns:
        None. When the format is unsupported or there are no headlines, a
        message is printed and no file is created.
    """
    # Check the format before doing any work, so the success message is only
    # printed when a file has really been written.
    if file_format not in ('xlsx', 'csv'):
        print("Unsupported file format. Please use 'xlsx' or 'csv'.")
        return

    # Prepare the data for saving
    data = [
        {"Category": category, "Headline": headline}
        for category, headlines in categorized_headlines.items()
        for headline in headlines
    ]

    # An empty frame would produce a file without even a header row.
    if not data:
        print("No data to save. Exiting.")
        return

    df = pd.DataFrame(data)

    # Save to the appropriate format
    if file_format == 'xlsx':
        df.to_excel(output_path, index=False)
    else:
        df.to_csv(output_path, index=False)

    print(f"Headlines saved to {output_path}")

# Driver for this cell: runs the pipeline on a fixed, machine-specific folder.

# Folder containing the newspaper images (named as page_3.jpg to page_20.jpg)
image_folder = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder"

# Upload the page images and query the model; an empty string signals that
# nothing was uploaded or the request failed
extracted_text = analyze_newspaper_images(image_folder)

# Parse and save only when the model returned some text
if extracted_text:
    categorized_headlines = parse_and_categorize_headlines(extracted_text)

    # Output workbook is written next to the images
    output_file_path = r"C:\Users\abi04\OneDrive\Desktop\My projects\New folder\categorize_news_data.xlsx"
    save_to_file(categorized_headlines, output_file_path, file_format='xlsx')


Uploaded file 'page_3.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/lmb0zonkrj7t
Uploaded file 'page_4.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/acklf57txafm
Uploaded file 'page_5.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/1ttr1isgj3ej
Uploaded file 'page_6.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/agbloi2gvwfi
Uploaded file 'page_7.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/l44grgyuambb
Uploaded file 'page_8.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/53ikmdu8t4ns
Uploaded file 'page_9.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/s29eiwt9ika7
Uploaded file 'page_10.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/mzzaoamiqtiy
Uploaded file 'page_11.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/vyy5usk8jcy
Uploaded file 'page_12.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/1mxfuxybebmd
Uploaded