In [2]:
# importing the required Libraries

import os
from PIL import Image
import pytesseract
from langchain_community.llms import Ollama
from collections import Counter
import datetime



# importing the pytesseract.exe

pytesseract.pytesseract.tesseract_cmd =  r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# importing the folder containing images

folder_path  = r"C:\Users\sehri\Desktop\ai-agent\screenshots"
image_files = []
all_files = os.listdir(folder_path)

# loop to get all image files

for file_name in all_files:
    if file_name.lower().endswith(('.jpg','.png', '.jpeg')):
        image_files.append(file_name)
    
# function to set up the llm and its prompt

llm = Ollama(model = 'mistral')
def classify_action(text):
    prompt = f""" You are an activity listener. Your job is to tell what the user is doing based on the text. Keep your answers succint and informative
Text: \"\"\" {text}\"\"\"

Keep the answer to just one activity label such as Coding, Browsing, Using Youtube, Idle, Using Excel etc
"""
    try:
        response = llm.invoke(prompt)
        return response.strip()
        
    except Exception as e:
        print('Mistral failed to work, there is an error')
        print(f"Error: {e}")

# function to group the activity labels

def group_activity(label):
    label = label.lower()
    if "code" in label or "python" in label or "jupyter" in label:
        return "Coding"
    elif "youtube" in label or "video" in label:
        return "YouTube"
    elif "excel" in label or "sheet" in label:
        return "Excel"
    elif "browse" in label or "chrome" in label or "firefox" in label:
        return "Browsing"
    elif "idle" in label or "nothing" in label:
        return "Idle"
    else:
        return "Other"

Grouped_Activities = []
activity_log = []



# loop for reading handling images

for i, image_name in enumerate(image_files, start = 1):
    # setting up the path of images
    image_path = os.path.join(folder_path, image_name)
    # reading images using pillow
    img = Image.open(image_path)
    

    print(f" Processing Image number {i}")
    print("\n" + "#" * 70)
    print(f"Image Size: {img.size}")
    print(f"Image Mode: {img.mode}")
    print(f"\n Image Text")
    extracted_text = pytesseract.image_to_string(img)
    print(extracted_text)

    timestamps = os.path.getctime(image_path)
    formatted_time = datetime.datetime.fromtimestamp(timestamps).strftime('%Y-%m-%d %H:%M:%S')

    if extracted_text:
        print(f"\n Analyzing using Mistral")
        activity_label = classify_action(extracted_text)
        print(f"The activity being perfomed in this image is: {activity_label}")

        grouped = group_activity(activity_label)
        Grouped_Activities.append(grouped)
        activity_log.append((formatted_time, grouped))
        
    else:
        print(f"\n No text present")
        Grouped_Activities.append('Idle')
        activity_log.append((formatted_time, 'Idle'))

# Printing the Grouped Activities Summuary

print("\n" + "=" * 50)
print(" summuary of Grouped Activities:")
counts = Counter(Grouped_Activities)
for activity, count in counts.items():
    print(f"- {activity}: {count} screenshot(s)")

# Printing the log Activities

print("\n" + "=" * 50)
print("Timestamped Activity Log:")
for time, activity in sorted(activity_log):
    print(f"{time} → {activity}")


              
        
        




  llm = Ollama(model = 'mistral')


 Processing Image number 1

######################################################################
Image Size: (1225, 465)
Image Mode: RGBA

 Image Text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import r2_score

# from skLearn.feature_selection import GridSearchCV

data = load_diabetes()
df = pd.DataFrame(data.data, columns = data.feature names)
df[‘target'] = data.target

df -head(3)

si = SimpleImputer(strategy = 'most_frequent')
imputed_data = si.fit_transform(df[data.feature_names])

imputed_data =pd.DataFrame(data=imputed_data, columns = data.feature names)

imputed_data.head(3)


 Analyzing using Mistral
The activity being perfomed in this image is: Coding (using libraries like sklearn, pandas)
 Processing Image number 2

######################################################################
Image Size: (1342, 721)
Image Mode: RGBA

 Image Text
J File Edit Se

# Setting up the AI agent


In [None]:
# importing the required libraries

from langchain.agents import Tool, initialize_agent
from langchain.agents.agent_types  import AgentType

# tool # 1

def classify_image_tool(image_name : str) -> str:
    image_path = os.path.join(folder_path, image_name)
    if not os.path.exists(image_path):
        return f"Image File: {image_name} not found."
    img = Image.open(image_name)
    extracted_text = pytesseract.image_to_string(img)

    if not extracted_text.strip():
        label = "idle"
    else:
        label = classify_action(extracted_text)

    grouped = group_activity(label)
    Grouped_Activities.append(grouped)
    return f"{image_name} grouped as: {grouped}"

# tool # 2: Activity Summuary

def activity_summuary(_:str) -> str:
    if not Grouped_Activities:
        return "No Activities have been grouped or classified yet"
    else:
        summuary = Counter(Grouped_Activities)
        return "\n".join([f"{k}: {v}" for k, v in summuary.items()])

# tool # 3: Activity Log

def get_activity_log(_:str) -> str:
    if not activity_log:
        return "No Timestamp available"
    lines = [f"{time} → {activity}" for time, activity in sorted(activity_log)]
    return "\n".join(lines)


# Defining Langchain Tools


tools = [
    Tool(
        name="Classification_tool",
        func=classify_image_tool,
        description="Classify a screenshot into an activity label based on the filename input."
    ),
    Tool(
        name="Activity_Summary",
        func=activity_summuary,
        description="Summarize how many times each activity was detected from processed screenshots."
    ),

    Tool(
        name = "Activity Log",
        func = get_activity_log,
        description = "Your job is to list down all the activities along with their timestamp detection from screenshots"
    )
]


agent =  initialize_agent(
    tools = tools,
    llm = Ollama(model = 'mistral'),
    agent = AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose = True,
    handle_parsing_errors = True
  
)


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



agent.run("Classify image named 'Screenshot 2025-07-12 211031'")

        
    

  agent =  initialize_agent(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m To classify the image based on its filename, I will use the `Classification_tool`. The function takes a string as input which represents the name of the image. In this case, the image name is 'Screenshot 2025-07-12 211031'.

Action: Classification_tool
Action Input: 'Screenshot 2025-07-12 211031'[0m
Observation: [36;1m[1;3mImage File: 'Screenshot 2025-07-12 211031' not found.[0m
Thought: