In [16]:
from pathlib import Path
import ollama
from tqdm.notebook import tqdm
from timeit import default_timer as timer
import concurrent.futures
import humanize

In [None]:
# Analysis settings: adjust these before running the cells below
config = {
    # Root folder of the codebase to scan
    "directory": r"kt",
    # Ollama model name passed to the analysis calls
    "model": "llama3.2",
    # Folder that receives one Markdown report per analyzed file
    "output": "code_analysis_results",
    # Character budget per file: longer files are truncated before analysis
    "max_size": 5000,
    # Filename suffixes that are picked up during the scan
    "file_types": [
        ".py", ".java", ".kt", ".js", ".ts", ".c",
        ".cpp", ".h", ".cs", ".html", ".css", ".xml", ".resx",
    ],
    # Number of worker threads analyzing files in parallel
    "num_workers": 2,
    # When True, files that already have a report in "output" are not re-analyzed
    "skip_existing": True,
    # None analyzes every match; an integer analyzes a random sample of that size
    "sample_size": None,
}

In [18]:
def _is_candidate_file(path):
    """Return False for directories whose name merely ends in a wanted extension."""
    try:
        return not path.is_dir()
    except OSError:
        # Entry cannot be inspected: keep it so the read loop counts it as skipped.
        return True


# Going through the codebase
def collect_code_files(directory_path, file_extensions, max_file_size=5000, sample_size=None):
    """Collect the readable source files found under ``directory_path``.

    Args:
        directory_path: Root folder, searched recursively.
        file_extensions: Filename suffixes to match (for example ``".py"``).
        max_file_size: Character budget per file. Longer contents are cut to
            this length and a truncation marker is appended. Files whose
            on-disk size exceeds ten times this value are skipped unread.
        sample_size: When set and smaller than the number of matches, a random
            sample of this many files is used instead of all matches.

    Returns:
        A list of ``(path_string, code)`` tuples. Each file appears at most
        once. The list is sorted by path unless sampling was applied. Files
        that are empty or whitespace-only are left out.
    """
    # Files larger than max_file_size * this factor (in bytes) are not read at all.
    oversize_factor = 10

    start_time = timer()

    print(f"Scanning for files with extensions: {', '.join(file_extensions)}")

    root = Path(directory_path)
    # A set drops duplicates caused by repeated or overlapping extensions
    # (e.g. ".ts" together with ".d.ts"), so no file is analyzed twice.
    matches = set()
    for ext in file_extensions:
        matches.update(p for p in root.glob(f'**/*{ext}') if _is_candidate_file(p))

    # glob gives no ordering guarantee; sorting makes runs reproducible.
    all_files = sorted(matches)

    total_files = len(all_files)
    print(f"Found {total_files} matching files")

    if sample_size and sample_size < total_files:
        import random
        all_files = random.sample(all_files, sample_size)
        print(f"Sampling {sample_size} files for analysis")

    code_files = []
    skipped_files = 0

    for file_path in tqdm(all_files, desc="Collecting files"):
        try:
            if file_path.stat().st_size > max_file_size * oversize_factor:
                skipped_files += 1
                continue

            # errors='replace' keeps files with stray non-UTF-8 bytes readable.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                code = f.read()
        except Exception:
            # Best effort: unreadable entries are counted, not fatal.
            skipped_files += 1
            continue

        if len(code) > max_file_size:
            code = code[:max_file_size] + "\n\n// [TRUNCATED - file too large]"

        if code.strip():
            code_files.append((str(file_path), code))

    elapsed = timer() - start_time
    print(f"Successfully loaded {len(code_files)} files in {elapsed:.2f} seconds")
    print(f"Skipped {skipped_files} files due to size or errors")

    return code_files

In [19]:
# Read the reviewer persona text that is inserted into each analysis prompt
ux_persona = Path("PersonaC.txt").read_text(encoding="utf-8")

# Prompt definition and model call
def analyze_code_with_ollama(model_name, file_path, code):
    """Ask an Ollama model to review one source file.

    The prompt combines the module-level ``ux_persona`` text, the file path
    and the file contents.

    Returns:
        The model's response with surrounding whitespace removed. If the call
        raises, the text ``"Error analyzing code: <message>"`` is returned
        instead of propagating the exception.
    """
    review_request = f"""
You are {ux_persona}
As an expert code reviewer, analyze this code from `{file_path}` evaluate the following case: This case investigates the compatibility of Bitwarden with Android 14. 
Animation stuttering and functionality failures were identified as issues on the latest OS version. Operate a codebase analysis to trace the potential cause of these issues.
Important Instructions:
- If the file is not related to the objective, skip it and don’t write anything
-only focus on the issue, no feedback on reliability, usability etc..

CODE:
{code}
"""
    try:
        reply = ollama.generate(model=model_name, prompt=review_request)
        analysis_text = reply['response'].strip()
    except Exception as failure:
        # Failures are reported as text so callers always receive a string.
        return f"Error analyzing code: {failure}"
    return analysis_text

In [20]:
def _result_path_for(file_path, config, output_dir):
    """Map a source file to its report path, mirroring the source tree under output_dir."""
    source = Path(file_path)
    relative = Path(source.name)
    root = config.get("directory")
    if root:
        try:
            relative = source.relative_to(root)
        except ValueError:
            # File is not located under the configured root: use the bare name.
            pass
    return Path(output_dir) / relative.parent / (relative.name + ".md")


# Analyze a single file and save result
def process_single_file(file_data, config, output_dir):
    """Analyze one file and store the analysis as a Markdown report.

    Args:
        file_data: ``(file_path, code)`` tuple.
        config: Settings dict. Reads ``"skip_existing"`` and ``"model"``, and
            ``"directory"`` (optional) to place the report at the same
            relative location as the source file.
        output_dir: Folder that receives the reports.

    Returns:
        ``{"skipped": False}`` when a report was written. ``{"skipped": True}``
        when an existing report was reused. ``{"skipped": True, "error": msg}``
        when the analysis or the write failed.
    """
    file_path, code = file_data
    # Reports mirror the source layout, so same-named files in different
    # folders (e.g. values/strings.xml and values-fr/strings.xml) do not
    # overwrite or shadow each other.
    result_path = _result_path_for(file_path, config, output_dir)

    if config["skip_existing"] and result_path.exists():
        return {"skipped": True}

    analysis = analyze_code_with_ollama(config["model"], file_path, code)

    # analyze_code_with_ollama reports a failed call as text with this prefix.
    # Writing it to disk would make skip_existing treat the failure as a
    # finished report and never retry the file.
    if analysis.startswith("Error analyzing code:"):
        return {"skipped": True, "error": analysis}

    try:
        result_path.parent.mkdir(parents=True, exist_ok=True)
        result_path.write_text(analysis, encoding="utf-8")
        return {"skipped": False}
    except Exception as e:
        return {"skipped": True, "error": str(e)}

In [21]:
def _analyze_file_safely(file_data, config, output_dir):
    """Run process_single_file, turning an unexpected exception into a skipped result."""
    try:
        return process_single_file(file_data, config, output_dir)
    except Exception as exc:
        return {"skipped": True, "error": str(exc)}


# Process entire codebase
def process_codebase(config):
    """Collect the configured files, analyze them, and print timing statistics.

    The first file is analyzed on its own to estimate the total duration; the
    remaining files are analyzed by a thread pool.

    Args:
        config: Settings dict. Reads ``"output"``, ``"directory"``,
            ``"file_types"``, ``"max_size"``, ``"sample_size"`` and
            ``"num_workers"``, and is passed on to ``process_single_file``.

    Returns:
        ``(file_count, output_dir)``: the number of collected files and the
        report folder as a ``Path``. ``file_count`` is 0 when nothing matched.
    """
    overall_start_time = timer()
    output_dir = Path(config["output"])
    output_dir.mkdir(exist_ok=True, parents=True)

    code_files = collect_code_files(
        config["directory"],
        config["file_types"],
        config["max_size"],
        config["sample_size"]
    )

    file_count = len(code_files)

    if file_count == 0:
        print("No files to analyze!")
        return 0, output_dir

    # At least one worker: 0 would divide by zero in the estimate below and
    # is rejected by ThreadPoolExecutor.
    num_workers = max(1, int(config["num_workers"]))

    print(f"\nStarting analysis of {file_count} files with {num_workers} workers")
    analysis_start_time = timer()

    print("Analyzing first file to estimate total time...")
    sample_start = timer()
    first_result = _analyze_file_safely(code_files[0], config, output_dir)
    sample_time = timer() - sample_start

    estimated_total = sample_time * (file_count - 1) / num_workers
    print(f"Estimated time for remaining files: {estimated_total:.2f} seconds ({estimated_total/60:.2f} minutes)")

    remaining_files = code_files[1:]
    processed = 1
    skipped = 1 if first_result.get("skipped", False) else 0
    errors = [first_result["error"]] if first_result.get("error") else []

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # The safe wrapper keeps one failing file from aborting the whole run
        # when its future is collected.
        futures = [
            executor.submit(_analyze_file_safely, file_data, config, output_dir)
            for file_data in remaining_files
        ]

        with tqdm(total=len(remaining_files), desc="Analyzing files") as pbar:
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                processed += 1
                if result.get("skipped", False):
                    skipped += 1
                if result.get("error"):
                    errors.append(result["error"])
                pbar.update(1)

                # Refresh the ETA only every fifth file.
                if processed % 5 == 0:
                    elapsed = timer() - analysis_start_time
                    if elapsed > 0:
                        rate = processed / elapsed
                        remaining = (file_count - processed) / rate
                        pbar.set_postfix({"Remain": humanize.naturaldelta(remaining), "Skipped": skipped})

    analysis_time = timer() - analysis_start_time
    total_time = timer() - overall_start_time

    print("\nAnalysis running time:")
    print(f"- Total  time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
    print(f"- Analysis time: {analysis_time:.2f} seconds ({analysis_time/60:.2f} minutes)")
    print(f"- TotalFiles analyzed: {file_count}")
    print(f"- TotalFiles skipped: {skipped}")
    if errors:
        # Skips caused by failures are reported so they are not mistaken for cache hits.
        print(f"- Files with errors: {len(errors)} (first: {errors[0]})")
    if file_count - skipped > 0:
        print(f"- Average time per file: {analysis_time/(file_count - skipped):.2f} seconds")
    else:
        print("- No files analyzed .")

    return file_count, output_dir

In [None]:
# Summarize all .md files in one report
def summarize_analysis_reports(output_dir, model_name="mistral", summary_filename="summary_report.md"):
    """Combine the per-file reports in ``output_dir`` into one model-written summary.

    Every ``*.md`` file below ``output_dir`` is used, except files named
    ``summary_filename`` and reports that are empty or whitespace-only.
    Reports are added in sorted path order. The summary is written to
    ``output_dir / summary_filename``.

    Args:
        output_dir: Folder containing the per-file reports.
        model_name: Ollama model used to write the summary.
        summary_filename: Name of the summary file to create.

    Returns:
        None. Read errors and model errors are printed, not raised.
    """
    base_dir = Path(output_dir)
    # Filter out an earlier summary before deciding whether there is anything
    # to summarize; sort because rglob gives no ordering guarantee.
    report_paths = sorted(
        path for path in base_dir.rglob("*.md") if path.name != summary_filename
    )

    sections = []
    for md_file in report_paths:
        try:
            content = md_file.read_text(encoding="utf-8")
        except Exception as e:
            print(f"Error reading {md_file}: {e}")
            continue
        if not content.strip():
            # Blank reports would only add empty sections to the prompt.
            continue
        # The path relative to output_dir keeps same-named reports from
        # different sub-folders distinguishable; in a flat folder it is just
        # the file name.
        label = md_file.relative_to(base_dir).as_posix()
        sections.append(f"\n\n### File: {label}\n\n{content}")

    if not sections:
        print("Fail to summarize.")
        return

    combined_content = "".join(sections)

    summary_prompt = f"""
You are a senior software quality analyst.

Based on the following code analysis reports from multiple files, create a high-level summary.
Important Instructions:

Highlight:
- strengths
- Repeated weaknesses
- Suggestions for overall improvement
- if you found practical code example mentionned them (like their line and their location)
-Avoid repeating file-level details unless relevant to the overall theme.
### CODE ANALYSIS REPORTS:
{combined_content}
"""

    try:
        response = ollama.generate(model=model_name, prompt=summary_prompt)
        summary = response["response"].strip()
        summary_path = base_dir / summary_filename
        summary_path.write_text(summary, encoding="utf-8")
        print(f"\nSummary written to {summary_path}")
    except Exception as e:
        print(f"Error generating summary: {e}")

# Entry point for the whole pipeline
def run_analysis():
    """Run the per-file analysis and the summary step, then print a short recap.

    Reads the module-level ``config`` dict, calls ``process_codebase`` and
    ``summarize_analysis_reports``, and reports the elapsed wall-clock time.
    """
    started_at = timer()

    print(f"Starting code quality analysis on: {config['directory']}")
    print(f"Using Ollama model: {config['model']}")

    analyzed_total, results_dir = process_codebase(config)
    summarize_analysis_reports(results_dir, model_name=config["model"])

    duration = timer() - started_at
    whole_minutes = int(duration // 60)
    leftover_seconds = duration % 60

    print(f"\nAnalysis complete in {whole_minutes} minutes, {leftover_seconds:.2f} seconds!")
    print(f"Analyzed {analyzed_total} files")
    print(f"Results saved to: {results_dir}")

# Start the full pipeline: analyzes the configured codebase, then writes the summary report.
run_analysis()

Starting code quality analysis on: kt
Using Ollama model: llama3.2
Scanning for files with extensions: .py, .java, .kt, .js, .ts, .c, .cpp, .h, .cs, .html, .css, .xml, .resx
Found 13 matching files


Collecting files:   0%|          | 0/13 [00:00<?, ?it/s]

Successfully loaded 13 files in 0.03 seconds
Skipped 0 files due to size or errors

Starting analysis of 13 files with 2 workers
Analyzing first file to estimate total time...
Estimated time for remaining files: 128.32 seconds (2.14 minutes)


Analyzing files:   0%|          | 0/12 [00:00<?, ?it/s]