# Assignment Time Estimation

# 1. Generate Data 

### 1.1 Generate Assignment Content

#### Import Libraries

In [1]:
import pandas as pd
import os
import io
import re
import requests
import vertexai
import PyPDF2
from google.cloud import storage

#### Notebook Environment

In [2]:
PROJECT_ID = ""
LOCATION = ""

vertexai.init(project=PROJECT_ID, location=LOCATION)

#### GCS Bucket

In [3]:
bucket_name = ''
folder_name = ''

In [4]:
# Function to Load PDF Content from GCS and Clean It
def load_pdf_from_gcs(bucket_name, blob_name):
    """Load PDF content from Google Cloud Storage and clean it."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    
    # Download the Blob's Content as Bytes
    pdf_content = blob.download_as_bytes()
    
    # Use PyPDF2 to Extract Text from the PDF cCntent
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
    text_content = ""
    for page_num in range(len(pdf_reader.pages)):
        page_text = pdf_reader.pages[page_num].extract_text()
        text_content += " " + page_text.strip() if page_text else ""
    
    # Clean Text
    text_content = " ".join(text_content.split())
    
    # Remove Special Characters While Preserving Words
    text_content = re.sub(r'(\b\w*)\W+(\w*\b)', r'\1 \2', text_content)
    text_content = " ".join(text_content.split())
    
    return text_content

#### Extract Information from PDFs

In [5]:
# Function to Extract Course Name and Assignment Name from Filename
def extract_names(file_name):
    """Extract course name and assignment name from filename."""
    parts = filename.split('_')
    if len(parts) >= 3:
        if filename.startswith("Marketing_Analytics"):
            course_name = "Marketing Analytics"
            remaining_parts = parts[2:]
        elif filename.startswith("Statistical_Analysis"):
            course_name = "Statistical Analysis"
            remaining_parts = parts[2:]
        elif filename.startswith("Introduction_to_Art_History"):
            course_name = "Introduction to Art History"
            remaining_parts = parts[4:]
        else:
            course_name = "Unknown Course"
            remaining_parts = parts
        assignment_name = parts[-2] + ' ' + parts[-1].split('.')[0]
        return course_name, assignment_name
    else:
        return None, None

In [6]:
filenames = [
    'Introduction_to_Art_History_Week_1_Assignment_1.pdf',
    'Introduction_to_Art_History_Week_2_Assignment_2.pdf',
    'Introduction_to_Art_History_Week_3_Assignment_3.pdf',
    'Marketing_Analytics_Week_1_Assignment_1.pdf',
    'Marketing_Analytics_Week_2_Assignment_2.pdf',
    'Marketing_Analytics_Week_3_Assignment_3.pdf',
    'Marketing_Analytics_Week_4_Assignment_4.pdf',
    'Statistical_Analysis_Week_1_Assignment_1.pdf',
    'Statistical_Analysis_Week_2_Assignment_2.pdf',
    'Statistical_Analysis_Week_3_Assignment_3.pdf',
    'Statistical_Analysis_Week_4_Assignment_4.pdf'
        ]

In [7]:
docs = []
course_names = []
assignment_names = []

# List and Process Specified Files in the Folder
storage_client = storage.Client()
for filename in filenames:
    blob_name = f'{folder_name}/{filename}'
    try:
        pdf_content = load_pdf_from_gcs(bucket_name, blob_name)
        course_name, assignment_name = extract_names(filename)
        if course_name and assignment_name:
            docs.append(pdf_content)
            course_names.append(course_name)
            assignment_names.append(assignment_name)
            print(f'Successfully Loaded and Cleaned: {filename}')
    except Exception as e:
        print(f'Error Loading {filename}: {e}')

# Create DataFrame
data = {
    'Assignment Name': assignment_names,
    'Course Name': course_names,
    'Content': docs
}

df = pd.DataFrame(data)
df.head()

Successfully Loaded and Cleaned: Introduction_to_Art_History_Week_1_Assignment_1.pdf
Successfully Loaded and Cleaned: Introduction_to_Art_History_Week_2_Assignment_2.pdf
Successfully Loaded and Cleaned: Introduction_to_Art_History_Week_3_Assignment_3.pdf
Successfully Loaded and Cleaned: Marketing_Analytics_Week_1_Assignment_1.pdf
Successfully Loaded and Cleaned: Marketing_Analytics_Week_2_Assignment_2.pdf
Successfully Loaded and Cleaned: Marketing_Analytics_Week_3_Assignment_3.pdf
Successfully Loaded and Cleaned: Marketing_Analytics_Week_4_Assignment_4.pdf
Successfully Loaded and Cleaned: Statistical_Analysis_Week_1_Assignment_1.pdf
Successfully Loaded and Cleaned: Statistical_Analysis_Week_2_Assignment_2.pdf
Successfully Loaded and Cleaned: Statistical_Analysis_Week_3_Assignment_3.pdf
Successfully Loaded and Cleaned: Statistical_Analysis_Week_4_Assignment_4.pdf


Unnamed: 0,Assignment Name,Course Name,Content
0,Assignment 1,Introduction to Art History,Introduc on to Art History Assignment 1 Early ...
1,Assignment 2,Introduction to Art History,Introduc on to Art History Assignment 2 Classi...
2,Assignment 3,Introduction to Art History,Introduc on to Art History Assignment 3 The Mi...
3,Assignment 1,Marketing Analytics,Marketing Analytics Assignment 1 Introduc on W...
4,Assignment 2,Marketing Analytics,Marketing Analytics Assignment 2 Introduc on W...


In [8]:
# Preview of Data in Content Column
df['Content'].iloc[0]

'Introduc on to Art History Assignment 1 Early Civiliza ons and their Art Explore the art of early civilizations such as Mesopotamia Egypt and the Indus Valley Discuss the signi icance of art in these cultures and how it re lects their beliefs values and social structures Ques ons 1 Describe the key features of Mesopotamian art How did it re lect their religious beliefs 2 What are the main characteristics of Egyptian art Discuss the role of art in Egyptian burial practices 3 Compare and contrast the art of the Indus Valley with that of Mesopotamia and Egypt.'

### 1.2 Generate Data ( user info )

In [9]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import vertexai
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, ChatSession, Part
import vertexai.preview.generative_models as generative_models

In [10]:
# Define conditions and corresponding assignment types
conditions = [
    df['Course Name'] == 'Introduction to Art History',
    df['Course Name'] == 'Statistical Analysis',  
    df['Course Name'] == 'Marketing Analytics'
]

choices = ['Essay', 'Problem Set', 'Coding']
# Create new column 
df['Assignment Type'] = np.select(conditions, choices, default=np.nan)
df

Unnamed: 0,Assignment Name,Course Name,Content,Assignment Type
0,Assignment 1,Introduction to Art History,Introduc on to Art History Assignment 1 Early ...,Essay
1,Assignment 2,Introduction to Art History,Introduc on to Art History Assignment 2 Classi...,Essay
2,Assignment 3,Introduction to Art History,Introduc on to Art History Assignment 3 The Mi...,Essay
3,Assignment 1,Marketing Analytics,Marketing Analytics Assignment 1 Introduc on W...,Coding
4,Assignment 2,Marketing Analytics,Marketing Analytics Assignment 2 Introduc on W...,Coding
5,Assignment 3,Marketing Analytics,Marketing Analytics Assignment 3 Introduc on W...,Coding
6,Assignment 4,Marketing Analytics,Marketing Analytics Assignment 4 Introduc on W...,Coding
7,Assignment 1,Statistical Analysis,Statistical Analysis Assignment 1 Objec ve Und...,Problem Set
8,Assignment 2,Statistical Analysis,Statistical Analysis Assignment 2 Objec ve Exp...,Problem Set
9,Assignment 3,Statistical Analysis,Statistical Analysis Assignment 3 Objec ve Lea...,Problem Set


In [11]:
len(df)

11

In [12]:
df['Difficulty_Level'] = np.nan
df

Unnamed: 0,Assignment Name,Course Name,Content,Assignment Type,Difficulty_Level
0,Assignment 1,Introduction to Art History,Introduc on to Art History Assignment 1 Early ...,Essay,
1,Assignment 2,Introduction to Art History,Introduc on to Art History Assignment 2 Classi...,Essay,
2,Assignment 3,Introduction to Art History,Introduc on to Art History Assignment 3 The Mi...,Essay,
3,Assignment 1,Marketing Analytics,Marketing Analytics Assignment 1 Introduc on W...,Coding,
4,Assignment 2,Marketing Analytics,Marketing Analytics Assignment 2 Introduc on W...,Coding,
5,Assignment 3,Marketing Analytics,Marketing Analytics Assignment 3 Introduc on W...,Coding,
6,Assignment 4,Marketing Analytics,Marketing Analytics Assignment 4 Introduc on W...,Coding,
7,Assignment 1,Statistical Analysis,Statistical Analysis Assignment 1 Objec ve Und...,Problem Set,
8,Assignment 2,Statistical Analysis,Statistical Analysis Assignment 2 Objec ve Exp...,Problem Set,
9,Assignment 3,Statistical Analysis,Statistical Analysis Assignment 3 Objec ve Lea...,Problem Set,


# LLM to generate assignment difficulty level

In [15]:
model = GenerativeModel("gemini-1.0-pro-002")

###  Assignment Difficulty Level

In [16]:
def get_llm_response(input_text, temperature=0.1):
    responses = model.generate_content(
        [input_text],
        generation_config={
            "max_output_tokens": 2048,
            "temperature": temperature,
            "top_p": 1
        },
        stream=True,
    )
    
    full_response = ""
    for response in responses:
        full_response += response.text
        print(response.text, end="")
    
    return full_response


def process_assignments(df):
    # Group assignments by course
    grouped_assignments = df.groupby('Course Name')
    
    all_difficulties = {}
    
    for course, group in grouped_assignments:
        assignments = group['Content'].tolist()
        assignment_names = group['Assignment Name'].tolist()
        
        # Create a list of assignment descriptions
        assignment_descriptions = [f"{name}: {content}" for name, content in zip(assignment_names, assignments)]
        
        prompt = f"""
        For the course '{course}', evaluate the difficulty level of each assignment based on the following descriptions:

        {'. '.join(assignment_descriptions)}

        Assign a difficulty level (Easy, Medium, or Hard) to each assignment. Here is an example of how to assign difficulty level to each assignment:
        
        This is an example of an easy assignment: 
        
        Assignment: Introduction to Art History - Easy Level

        Title: Understanding Renaissance Art
        Objective:  
        Introduce students to Renaissance art by analyzing a famous artwork.

        Instructions:
        1. Choose an Artwork: Select one of the following:
           - "Mona Lisa" by Leonardo da Vinci
           - "The Last Supper" by Leonardo da Vinci
           - "The School of Athens" by Raphael
           - "The Birth of Venus" by Sandro Botticelli

        2. Research: Use at least two credible sources for information.

        3. Write a 500-word Essay: Include:
           - Introduction: Artist, title, and creation date.
           - Description: Subject matter, composition, color, techniques.
           - Historical Context: Significance and reflection of Renaissance values.
           - Personal Reflection: Personal thoughts and feelings about the artwork.

        4. Submission: Submit as a PDF via Canvas by [due date].
        
        Here is another example for marketing analytics with difficulty level of hard :
        Assignment: Marketing Analytics - Hard Level

        Title: Advanced Customer Segmentation Analysis

        Objective:  Conduct a detailed customer segmentation analysis using advanced clustering techniques to identify distinct customer groups and provide strategic marketing insights.

        Instructions:

        1. Dataset: Use the provided dataset with customer purchase history, demographics, and engagement metrics.

        2. Data Preprocessing: Clean and preprocess the data, handle missing values, and normalize the data.

        3. Clustering Analysis: Apply at least two clustering algorithms (e.g., K-Means, Hierarchical Clustering), and determine the optimal number of clusters using methods like the Elbow Method and Silhouette Score.

        4. Segmentation Analysis:Analyze and interpret the resulting clusters, profiling each segment based on key attributes.

        5. Strategic Insights: Provide actionable marketing strategies and personalized campaigns for each segment.

        6. Report (1500 words max): Include an introduction, methodology, results with visualizations, and recommendations. Submit the report and code/script as a ZIP file via Canvas by [due date].

        
        Format your response as follows:
        * **Assignment X:** Difficulty

        Only provide the difficulty levels, no additional explanation.
        """
        
        llm_response = get_llm_response(prompt)
        
        # Check if the response is valid
        if not llm_response:
            print(f"Warning: No valid response received for course '{course}'. Skipping.")
            continue
        
        # Parse the LLM response
        difficulty_map = {'Easy': 1, 'Medium': 2, 'Hard': 3}
        course_difficulties = {}
        
        for line in llm_response.split('\n'):
            line = line.strip()
            match = re.search(r'\*\*Assignment (\d+):\*\* (\w+)', line)
            if match:
                assignment_num, difficulty = match.groups()
                assignment_name = f"Assignment {assignment_num}"
                course_difficulties[assignment_name] = difficulty_map.get(difficulty.strip(), float('nan'))
        
        all_difficulties[course] = course_difficulties
        print(f"Parsed difficulties for {course}:", course_difficulties)  # Debugging line
    
    # Apply difficulties to the DataFrame
    df['Difficulty_Level'] = df.apply(
        lambda row: all_difficulties.get(row['Course Name'], {}).get(row['Assignment Name'], float('nan')),
        axis=1
    )
    
    return df

# Use the function
df = process_assignments(df)
print(df[['Course Name', 'Assignment Name', 'Difficulty_Level']])


## Difficulty Levels for Introduction to Art History Assignments:

**Assignment 1:** Medium
**Assignment 2:** Medium
**Assignment 3:** Hard 
Parsed difficulties for Introduction to Art History: {'Assignment 1': 2, 'Assignment 2': 2, 'Assignment 3': 3}
## Difficulty Levels for Marketing Analytics Assignments:

* **Assignment 1:** Medium
* **Assignment 2:** Hard
* **Assignment 3:** Hard
* **Assignment 4:** Hard 
Parsed difficulties for Marketing Analytics: {'Assignment 1': 2, 'Assignment 2': 3, 'Assignment 3': 3, 'Assignment 4': 3}
## Difficulty Levels for Statistical Analysis Assignments:

* **Assignment 1:** Easy
* **Assignment 2:** Medium
* **Assignment 3:** Hard
* **Assignment 4:** Medium 
Parsed difficulties for Statistical Analysis: {'Assignment 1': 1, 'Assignment 2': 2, 'Assignment 3': 3, 'Assignment 4': 2}
                    Course Name Assignment Name  Difficulty_Level
0   Introduction to Art History    Assignment 1                 2
1   Introduction to Art History    Assignmen

In [17]:
df

Unnamed: 0,Assignment Name,Course Name,Content,Assignment Type,Difficulty_Level
0,Assignment 1,Introduction to Art History,Introduc on to Art History Assignment 1 Early ...,Essay,2
1,Assignment 2,Introduction to Art History,Introduc on to Art History Assignment 2 Classi...,Essay,2
2,Assignment 3,Introduction to Art History,Introduc on to Art History Assignment 3 The Mi...,Essay,3
3,Assignment 1,Marketing Analytics,Marketing Analytics Assignment 1 Introduc on W...,Coding,2
4,Assignment 2,Marketing Analytics,Marketing Analytics Assignment 2 Introduc on W...,Coding,3
5,Assignment 3,Marketing Analytics,Marketing Analytics Assignment 3 Introduc on W...,Coding,3
6,Assignment 4,Marketing Analytics,Marketing Analytics Assignment 4 Introduc on W...,Coding,3
7,Assignment 1,Statistical Analysis,Statistical Analysis Assignment 1 Objec ve Und...,Problem Set,1
8,Assignment 2,Statistical Analysis,Statistical Analysis Assignment 2 Objec ve Exp...,Problem Set,2
9,Assignment 3,Statistical Analysis,Statistical Analysis Assignment 3 Objec ve Lea...,Problem Set,3


### Predicting Time with LLM

In [18]:
df.iloc[1]

Assignment Name                                          Assignment 2
Course Name                               Introduction to Art History
Content             Introduc on to Art History Assignment 2 Classi...
Assignment Type                                                 Essay
Difficulty_Level                                                    2
Name: 1, dtype: object

In [19]:
def get_llm_response(prompt, temperature=0.1):
    try:
        responses = model.generate_content(
            [prompt],
            generation_config={
                "max_output_tokens": 2048,
                "temperature": temperature,
                "top_p": 1
            },
            stream=True,
        )
        
        full_response = ""
        for response in responses:
            if hasattr(response, 'text'):
                full_response += response.text
                print(response.text, end="")
            else:
                print(f"Unexpected response format: {response}")
        
        if not full_response:
            print("Warning: Empty response from LLM")
        
        return full_response.strip() if full_response else None
    except Exception as e:
        print(f"Error in get_llm_response: {e}")
        return None

In [20]:
def estimate_completion_time(row, llm_function):
    prompt = f"""
    Based on the following assignment details, estimate the time (in hours) it would take an average student to complete the assignment. Restrict time estimation for each assignment to a maximum of 8 hours and do not go above the time limit. Be very brief in your response and briefly explain the reasons. 

    Assignment Name: {row['Assignment Name']}
    Course: {row['Course Name']}
    Content: {row['Content']}
    Assignment Type: {row['Assignment Type']}
    Difficulty Level: {row['Difficulty_Level']} (on a scale of 1-3, where 3 is most difficult)
    
    Please provide your estimate as a single number representing hours. For example: 5.5
    """
    
    print(f"Sending prompt for assignment: {row['Assignment Name']}")
    response = llm_function(prompt)
    print(f"Received response: {response}")
    
    if response is None:
        print(f"No response received from LLM for assignment: {row['Assignment Name']}")
        return float('nan')
    
    try:
        # Extract the first number from the response
        import re
        numbers = re.findall(r'\d+(?:\.\d+)?', response)
        if numbers:
            estimated_time = float(numbers[0])
            return estimated_time
        else:
            print(f"No numeric estimate found in LLM response for assignment: {row['Assignment Name']}")
            return float('nan')
    except ValueError as e:
        print(f"Error parsing time estimate from LLM response for assignment: {row['Assignment Name']}. Error: {e}")
        return float('nan')

In [21]:
df['Time_Completion'] = df.apply(estimate_completion_time, axis=1, args=(get_llm_response,))

Sending prompt for assignment: Assignment 1
## Estimated Time: 6.5 hours

**Explanation:**

* **Research:** 2.5 hours 
    * Researching the key features and religious beliefs reflected in Mesopotamian art. 
    * Researching the main characteristics and role of art in Egyptian burial practices.
    * Researching the art of the Indus Valley and comparing/contrasting it with Mesopotamian and Egyptian art. 
* **Writing:** 3 hours
    * Writing a well-structured essay with a clear introduction, body paragraphs addressing each question, and a conclusion. 
    * Ensuring proper citations and referencing of sources. 
* **Review and Editing:** 1 hour
    * Proofreading for grammar, spelling, and clarity. 
    * Ensuring the essay adheres to the assignment guidelines and rubric. 

**Note:** This is an estimated time based on the average student. Individual times may vary depending on research skills, writing speed, and familiarity with the subject matter. 
Received response: ## Estimated Time:

In [22]:
df

Unnamed: 0,Assignment Name,Course Name,Content,Assignment Type,Difficulty_Level,Time_Completion
0,Assignment 1,Introduction to Art History,Introduc on to Art History Assignment 1 Early ...,Essay,2,6.5
1,Assignment 2,Introduction to Art History,Introduc on to Art History Assignment 2 Classi...,Essay,2,6.5
2,Assignment 3,Introduction to Art History,Introduc on to Art History Assignment 3 The Mi...,Essay,3,3.0
3,Assignment 1,Marketing Analytics,Marketing Analytics Assignment 1 Introduc on W...,Coding,2,1.0
4,Assignment 2,Marketing Analytics,Marketing Analytics Assignment 2 Introduc on W...,Coding,3,2.0
5,Assignment 3,Marketing Analytics,Marketing Analytics Assignment 3 Introduc on W...,Coding,3,3.0
6,Assignment 4,Marketing Analytics,Marketing Analytics Assignment 4 Introduc on W...,Coding,3,4.0
7,Assignment 1,Statistical Analysis,Statistical Analysis Assignment 1 Objec ve Und...,Problem Set,1,4.5
8,Assignment 2,Statistical Analysis,Statistical Analysis Assignment 2 Objec ve Exp...,Problem Set,2,6.5
9,Assignment 3,Statistical Analysis,Statistical Analysis Assignment 3 Objec ve Lea...,Problem Set,3,7.5


In [23]:
# write to pickle

import pandas as pd

time_completion = df[['Time_Completion']]

file_path = './time_completion.pkl'
time_completion.to_pickle(file_path)

In [24]:
df_new = pd.read_pickle("time_completion.pkl")

In [29]:
df["Time_Completion"][7]

4.5