In [None]:
!pip install -q google-generativeai pandas tqdm
import os
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm
import re
import time
import random

# Enter your API key in place of the ***
# NOTE(review): this assignment overwrites any GEMINI_API_KEY already present in
# the environment, and a real key pasted here ends up stored in the file itself.
os.environ["GEMINI_API_KEY"] = "********************"
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Load your CSV (adjust filename/path)
df = pd.read_csv("/content/DL_Energy_Patterns.csv")

# Keep only the four columns this script reads; all other CSV columns are dropped
df = df[['PostId', 'QUESTION', 'CHATGPT_ANSWER', 'SO_ANSWER']]
# NOTE(review): the value of df.head() is discarded here unless this happens to
# be the last line of a notebook cell; wrap it in print()/display() to see it.
df.head()

# Evaluation prompt sent to the model once per row. The text is filled in with
# str.format(), so {question}, {human_answer} and {ai_answer} are placeholders,
# and the doubled braces {{ }} around the JSON samples are escapes that render
# as single literal braces in the final prompt.
prompt_template = """
Objective:
You are an expert software sustainability analyst evaluating code solutions for deep learning energy-efficiency. Your task is to analyze two solutions (Human Stack Overflow vs AI ChatGPT) using the established 8 energy-efficiency patterns with numerical scoring.

Task Description:
Evaluate both code solutions against the 8 core energy-efficiency patterns using a 1-5 scoring system. Provide a concise explanation highlighting key differences and evidence.

Scoring Framework (1-5 scale):
1: Poor - Pattern not implemented or severely deficient
2: Below Average - Basic implementation with significant issues
3: Average - Adequate implementation meeting basic requirements
4: Good - Solid implementation with clear energy benefits
5: Excellent - Exceptional implementation demonstrating best practices

Patterns and Associated Tactics:

1. PRE-TRAINED MODEL UTILIZATION
   - Measurable Tactics: Transfer Learning (T16), Knowledge Distillation (T17)
   - Evidence: Pre-trained model loading, fine-tuning, teacher-student architectures

2. CHECKPOINT MANAGEMENT
   - Measurable Tactics: Checkpoint Usage (T19)
   - Evidence: Model saving/loading strategies, resume training capabilities

3. MODEL OPTIMIZATION STRATEGIES
   - Measurable Tactics: Reduce Complexity (T8), Enhance Sparsity (T14), Energy-aware Pruning (T15)
   - Evidence: Simpler architectures, regularization, pruning implementations

4. QUANTIZATION TECHNIQUES
   - Measurable Tactics: Input Quantization (T4), Quantization-aware Training (T18)
   - Evidence: Precision reduction, quantization APIs, model optimization

5. EFFICIENT DATA HANDLING
   - Measurable Tactics: Sampling (T1), Remove Redundant Data (T2), Feature Reduction (T3), Minimize Data Referencing (T27)
   - Evidence: Data pipelines, sampling techniques, feature selection, generators

6. MEMORY MANAGEMENT
   - Measurable Tactics: Memory Constraints (T20), Computation Partitioning (T22)
   - Evidence: Device management, memory optimization, model partitioning

7. ALGORITHM & COMPUTATION OPTIMIZATION
   - Measurable Tactics: Energy-efficient Algorithms (T6), Lightweight Alternatives (T7), Dynamic Parameter Adaptation (T10), Built-in Library Functions (T11)
   - Evidence: Model architecture choices, adaptive learning, optimized operations

8. MODEL MAINTENANCE & ADAPTATION
   - Measurable Tactics: Graph Substitution (T13), Informed Adaptation (T28), Retrain When Needed (T29)
   - Evidence: Computation graph optimization, adaptive strategies

Evaluation Process:
1. Evaluate each of the 8 patterns independently for both solutions
2. Assign scores 1-5 based on implementation quality and evidence
3. Consider both explicit code implementations AND textual recommendations
4. For Human answers: Give credit for clear textual suggestions even without full code
5. For AI answers: Focus on actual code implementations and practical suggestions
6. Identify specific tactics (T1-T30) observed in the code or text
7. Calculate average scores for each solution
8. Determine winner based on higher average score
9. Provide concise analysis highlighting key differentiating factors

Important Scoring Guidelines:
- Code Evidence: Highest priority - explicit implementations get higher scores
- Textual Recommendations: Medium priority - clear suggestions without code get moderate scores
- Vague Mentions: Low priority - general statements without specifics get low scores
- Human Answers: Often contain valuable expert suggestions in natural language
- AI Answers: Typically provide complete code implementations

Tie Consideration:
- Primary determination is based on numerical scores (higher average wins)
- Objective tie condition: Only when score difference is exactly 0.0
- For minimal differences (≤0.25), carefully evaluate if solutions demonstrate truly complementary strengths that make them equally valuable
- Consider tie ONLY when both solutions excel in different but equally important aspects that balance each other
- Avoid automatic ties for small differences - the higher score should generally win unless there are compelling complementary factors

Reporting Instructions:
Return ONLY a JSON object with this exact structure:
{{
  "human_pattern_1_score": 1-5,
  "human_pattern_2_score": 1-5,
  "human_pattern_3_score": 1-5,
  "human_pattern_4_score": 1-5,
  "human_pattern_5_score": 1-5,
  "human_pattern_6_score": 1-5,
  "human_pattern_7_score": 1-5,
  "human_pattern_8_score": 1-5,

  "ai_pattern_1_score": 1-5,
  "ai_pattern_2_score": 1-5,
  "ai_pattern_3_score": 1-5,
  "ai_pattern_4_score": 1-5,
  "ai_pattern_5_score": 1-5,
  "ai_pattern_6_score": 1-5,
  "ai_pattern_7_score": 1-5,
  "ai_pattern_8_score": 1-5,

  "human_average_score": "calculated average",
  "ai_average_score": "calculated average",
  "score_difference": "difference in averages",
  "winner": "Human" or "AI" or "Tie",
  "analysis_explanation": "Brief explanation for your analysis, highlighting key differentiating factors in human and AI solutions, citing relevant code snippets. In case of tie, explain compelling complementary strengths that justify equal evaluation despite numerical differences."
}}

Examples:
Example output with justified tie:
{{
  "human_pattern_1_score": 5,
  "human_pattern_2_score": 3,
  "human_pattern_3_score": 4,
  "human_pattern_4_score": 2,
  "human_pattern_5_score": 4,
  "human_pattern_6_score": 3,
  "human_pattern_7_score": 4,
  "human_pattern_8_score": 3,

  "ai_pattern_1_score": 3,
  "ai_pattern_2_score": 4,
  "ai_pattern_3_score": 3,
  "ai_pattern_4_score": 5,
  "ai_pattern_5_score": 3,
  "ai_pattern_6_score": 4,
  "ai_pattern_7_score": 3,
  "ai_pattern_8_score": 5,

  "human_average_score": 3.5,
  "ai_average_score": 3.625,
  "score_difference": 0.125,
  "winner": "Tie",
  "analysis_explanation": "Despite AI's slight numerical advantage (0.125), both solutions demonstrate truly complementary and equally valuable approaches. Human excels in Pre-trained Model Utilization (P1: score 5) with expert-level transfer learning strategies and practical fine-tuning guidance that significantly reduces training energy. AI counters with exceptional Quantization Techniques (P4: score 5) and Model Maintenance (P8: score 5) through advanced optimization implementations. The Human solution provides deep domain expertise that could save substantial computational resources in model development, while AI offers robust technical implementations for runtime efficiency. Given that both approaches address fundamentally different but equally critical aspects of the energy-efficiency pipeline, this evaluation results in a tie."
}}

Example output with clear winner despite small difference:
{{
  "human_pattern_1_score": 4,
  "human_pattern_2_score": 3,
  "human_pattern_3_score": 2,
  "human_pattern_4_score": 1,
  "human_pattern_5_score": 4,
  "human_pattern_6_score": 3,
  "human_pattern_7_score": 3,
  "human_pattern_8_score": 2,

  "ai_pattern_1_score": 3,
  "ai_pattern_2_score": 4,
  "ai_pattern_3_score": 3,
  "ai_pattern_4_score": 4,
  "ai_pattern_5_score": 3,
  "ai_pattern_6_score": 4,
  "ai_pattern_7_score": 4,
  "ai_pattern_8_score": 3,

  "human_average_score": 2.75,
  "ai_average_score": 3.5,
  "score_difference": 0.75,
  "winner": "AI",
  "analysis_explanation": "Both solutions effectively address the core `sess.run()` bottleneck by fetching entire weight matrices at once, improving Memory Management (P6) by reducing overhead and data transfers (Human: `h1_val = sess.run(weights['h1'])`, AI: `weight_values = sess.run(tensor)`). The Human solution provides a solid Algorithm & Computation Optimization (P7) by fixing the data extraction, but still uses manual loops for writing. The AI solution excels here by implementing `np.savetxt(fp, weight_values)`, leveraging highly optimized built-in libraries for superior computational efficiency and streamlined I/O. AI's more complete and robust approach, including support for multiple layers, provides a clearer best practice for energy-efficient weight serialization."
}}

---
Now evaluate the following:

Question:
{question}

Human (Stack Overflow) Answer:
{human_answer}

AI (ChatGPT) Answer:
{ai_answer}
"""

# Gemini model handle used for every evaluation request
model = genai.GenerativeModel("gemini-2.5-flash")

# Result columns in output order: the eight per-pattern scores for the human
# answer, the eight for the AI answer, then the summary fields.
pattern_columns = [
    *(f'human_pattern_{n}_score' for n in range(1, 9)),
    *(f'ai_pattern_{n}_score' for n in range(1, 9)),
    'human_average_score', 'ai_average_score', 'score_difference',
    'winner', 'analysis_explanation',
]

# Start every result column out empty
for column_name in pattern_columns:
    df[column_name] = None

def parse_json_response(response_text):
    """Extract the first parseable JSON object embedded in a model response.

    The text is scanned for each ``{`` in turn and a JSON object is decoded
    starting there; the first candidate that decodes cleanly is returned.
    Unlike a greedy "first ``{`` to last ``}``" match, this tolerates markdown
    code fences and surrounding prose that itself contains braces.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded object as a ``dict``, or ``None`` when the input is not a
        string, contains no ``{``, or no candidate decodes as JSON. A message
        is printed in each failure case.
    """
    # Local import: json is not part of this file's import header.
    import json

    if not isinstance(response_text, str):
        print(f"Error parsing JSON: expected str, got {type(response_text).__name__}")
        return None

    decoder = json.JSONDecoder()
    last_error = None
    start = response_text.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object, so trailing text
            # after the closing brace does not cause a failure.
            parsed, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError as e:
            last_error = e
            start = response_text.find('{', start + 1)
        else:
            return parsed

    if last_error is None:
        print("No JSON found in response")
    else:
        print(f"Error parsing JSON: {last_error}")
    return None

# Define checkpoint interval and rate limiting parameters
CHECKPOINT_INTERVAL = 20  # A checkpoint CSV is written whenever (row index + 1) is a multiple of this
BASE_DELAY = 2  # Base delay between requests in seconds (random jitter is added on top)
MAX_RETRIES = 3  # Total number of attempts per request, counting the first call
base_output_path = "/content/gemini_final"  # Filename prefix shared by all output CSVs

# Set of dataframe index labels that were processed successfully
# (or were already filled in when the loop reached them)
processed_indices = set()

def make_api_request_with_retry(prompt, max_retries=MAX_RETRIES):
    """Send ``prompt`` to the model, retrying on failure with a growing delay.

    Args:
        prompt: Fully formatted prompt text passed to ``model.generate_content``.
        max_retries: Total number of attempts, counting the first call.

    Returns:
        ``(response, True)`` on the first successful call, or ``(None, False)``
        once every attempt has failed. Exceptions from the API call are
        printed and never propagated to the caller.
    """
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            return response, True  # Success

        # Deliberately broad: this is the API boundary and any failure here
        # should be reported and retried rather than abort the whole run.
        except Exception as e:
            error_msg = str(e)
            print(f"Attempt {attempt + 1} failed: {error_msg}")

            # Nothing follows the last attempt, so sleeping would only waste time.
            if attempt == max_retries - 1:
                break

            # Errors whose message mentions 503, 429 or "quota" are treated
            # as rate limiting.
            if '503' in error_msg or '429' in error_msg or 'quota' in error_msg.lower():
                # Exponential backoff with jitter
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Rate limit hit. Waiting {wait_time:.2f} seconds before retry...")
                time.sleep(wait_time)
            else:
                # For other errors, back off linearly in steps of BASE_DELAY
                wait_time = BASE_DELAY * (attempt + 1)
                print(f"API error. Waiting {wait_time:.2f} seconds before retry...")
                time.sleep(wait_time)

    return None, False  # All retries failed

# Main evaluation loop: build the prompt for each row, query the model, copy
# the returned JSON fields into the matching dataframe columns, and write a
# cumulative checkpoint CSV at regular intervals.

# First row position to evaluate.
# NOTE(review): presumably the rows before this were handled in an earlier
# run - confirm before reusing this script on a fresh dataset.
START_ROW = 140

# Clamp at zero so a dataframe shorter than START_ROW does not give tqdm a
# negative total.
remaining_rows = max(len(df) - START_ROW, 0)

for idx, row in tqdm(df.iloc[START_ROW:].iterrows(), total=remaining_rows):
    # Skip if already processed (in case of restart)
    # NOTE(review): the result columns are reset to None earlier in this file,
    # so this only fires if results were filled in by some other means.
    if pd.notna(df.at[idx, 'human_pattern_1_score']):
        processed_indices.add(idx)
        continue

    question = str(row['QUESTION'])
    human_answer = str(row['SO_ANSWER'])
    ai_answer = str(row['CHATGPT_ANSWER'])

    prompt = prompt_template.format(
        question=question,
        human_answer=human_answer,
        ai_answer=ai_answer
    )

    # Make API request with retry logic
    response, success = make_api_request_with_retry(prompt)

    if success:
        # A successful call can still carry no text (for example a blocked or
        # empty response); the .text accessor then raises instead of returning
        # a string. Treat that as a per-row failure rather than a crash.
        try:
            response_text = response.text
        except (ValueError, IndexError) as e:
            print(f"✗ No usable text in response for row {idx}: {e}")
            response_text = None

        json_data = parse_json_response(response_text) if response_text else None

        if json_data:
            # Update the dataframe with the parsed JSON data; keys that do not
            # match an existing column are ignored
            for key, value in json_data.items():
                if key in df.columns:
                    df.at[idx, key] = value
            processed_indices.add(idx)
            print(f"✓ Successfully processed row {idx}")
        else:
            print(f"✗ Failed to parse JSON for row {idx}")
            # Mark as error but continue
            df.at[idx, 'analysis_explanation'] = "ERROR: Failed to parse response"
    else:
        print(f"✗ All retries failed for row {idx}")
        df.at[idx, 'analysis_explanation'] = "ERROR: API request failed after retries"

    # Add delay between requests to avoid rate limiting
    if idx < len(df) - 1:  # Don't delay after the last request
        delay = BASE_DELAY + random.uniform(0, 1)  # Add jitter
        print(f"Waiting {delay:.2f} seconds before next request...")
        time.sleep(delay)

    # Save checkpoint every CHECKPOINT_INTERVAL rows.
    # idx is used both as an index label and as a row position below, which
    # relies on the default 0..n-1 index produced by read_csv.
    if (idx + 1) % CHECKPOINT_INTERVAL == 0:
        batch_number = (idx + 1) // CHECKPOINT_INTERVAL
        batch_file = f"{base_output_path}_batch_{batch_number}.csv"

        # Get the current batch (rows 0 to idx) - cumulative, not just the
        # rows added since the previous checkpoint
        current_batch = df.iloc[:(idx + 1)]

        print(f"\nSaving batch {batch_number} (rows 0-{idx}) to {batch_file}...")
        current_batch.to_csv(batch_file, index=False)
        print(f"Batch {batch_number} saved! Contains {len(current_batch)} rows.")

# Final saves after all rows are processed
print("\nFinal save...")

# Write the whole dataframe, including rows that were skipped or failed
final_combined_file = f"{base_output_path}_complete.csv"
df.to_csv(final_combined_file, index=False)
print(f"Complete dataset saved to {final_combined_file}!")

# When the row count is not a multiple of the checkpoint interval, write one
# extra numbered batch file; it holds the full dataframe.
total_rows = len(df)
if total_rows % CHECKPOINT_INTERVAL:
    final_batch_number = total_rows // CHECKPOINT_INTERVAL + 1
    final_batch_file = f"{base_output_path}_batch_{final_batch_number}.csv"
    df.to_csv(final_batch_file, index=False)
    print(f"Final batch {final_batch_number} saved to {final_batch_file}!")

print(f"Analysis complete! Processed {len(processed_indices)} rows total.")

# Display the structured results
print("\nStructured Results:")
summary_columns = ['PostId', 'QUESTION', *pattern_columns]
print(df[summary_columns].head())