**Step 1: Install Dependencies**

In [None]:
!pip install pytesseract autopep8 ffmpeg-python reportlab
!sudo apt install tesseract-ocr

**Step 2: Import Necessary Libraries**

In [None]:
import ast
import difflib
import os
import re
import shutil
import subprocess
import tempfile

import autopep8
import cv2
import numpy as np
import pandas as pd
import pytesseract
from google.colab.patches import cv2_imshow  # For displaying images in Colab
from PIL import Image
from pylint.lint import Run
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

**Step 3: Upload the Video File and Convert It to MP4 if Needed**

In [None]:
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # An empty result would otherwise surface as a bare IndexError below.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a video file.")
video_path = next(iter(uploaded))  # Get uploaded file name
print(f"✅ File uploaded: {video_path}")

# .mov uploads are re-encoded to H.264 MP4. The extension check is
# case-insensitive so ".MOV" files are converted as well.
if video_path.lower().endswith(".mov"):
    source_path = video_path
    converted_path = "output.mp4"
    # Passing an argument list (no shell) keeps file names containing spaces,
    # quotes or "$" intact. -y overwrites an output.mp4 left by an earlier run
    # instead of stopping at ffmpeg's overwrite prompt.
    conversion = subprocess.run(
        ["ffmpeg", "-y", "-i", source_path, "-vcodec", "libx264", converted_path],
        capture_output=True,
        text=True,
        errors="replace",
    )
    if conversion.returncode != 0:
        # Show only the tail of ffmpeg's log; the actual error is at the end.
        raise RuntimeError(f"ffmpeg could not convert {source_path}:\n{conversion.stderr[-2000:]}")
    video_path = converted_path
    print(f"Converted {source_path} to MP4 format.")

**Step 4: Extracting Frames from Video**

In [None]:
output_dir = "frames"
os.makedirs(output_dir, exist_ok=True)

# Remove frames written by an earlier run so that re-running with a different
# video does not mix old and new images in the later steps.
for stale_name in os.listdir(output_dir):
    if stale_name.startswith("frame_") and stale_name.endswith(".jpg"):
        os.remove(os.path.join(output_dir, stale_name))

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video file: {video_path}")

# Sampling interval in frames. Rounded (not truncated) so 29.97 fps samples
# every 30 frames; falls back to 1 when no positive FPS is reported.
reported_fps = cap.get(cv2.CAP_PROP_FPS)
fps = max(1, int(round(reported_fps))) if reported_fps > 0 else 1
frame_count = 0
saved_frames = 0

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % fps == 0:  # Extract 1 frame per second
            frame_path = os.path.join(output_dir, f"frame_{frame_count}.jpg")
            # cv2.imwrite reports success as a bool; count only frames actually written.
            if cv2.imwrite(frame_path, frame):
                saved_frames += 1
        frame_count += 1
finally:
    # Release the capture even if reading or writing raised.
    cap.release()

print(f"Extracted {saved_frames} frames to '{output_dir}'")

### **Step 5: Preprocessing Frames for Better OCR**
(converting the frames to grayscale and applying an inverted binary threshold)

In [None]:
processed_dir = "processed_frames"
os.makedirs(processed_dir, exist_ok=True)

written_names = set()
skipped_names = []
for frame_name in sorted(os.listdir(output_dir)):
    img = cv2.imread(os.path.join(output_dir, frame_name))
    if img is None:
        # cv2.imread returns None instead of raising when an entry cannot be
        # decoded (e.g. a sub-directory or a non-image file).
        skipped_names.append(frame_name)
        continue
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Inverted binary threshold: pixels brighter than 150 become 0, the rest 255.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    cv2.imwrite(os.path.join(processed_dir, frame_name), binary)
    written_names.add(frame_name)

# Delete files from an earlier run that this run did not regenerate, so the
# OCR step only sees frames belonging to the current video.
for leftover_name in set(os.listdir(processed_dir)) - written_names:
    leftover_path = os.path.join(processed_dir, leftover_name)
    if os.path.isfile(leftover_path):
        os.remove(leftover_path)

if skipped_names:
    print(f"Skipped {len(skipped_names)} unreadable file(s): {skipped_names}")
print("Preprocessing complete! Processed frames saved in 'processed_frames'")

### **Step 6: Performing OCR on Processed Frames**

In [None]:
def _frame_sort_key(file_name):
    """Sort key that orders frame files by the first number in their name.

    Plain string sorting would put "frame_120.jpg" before "frame_30.jpg".
    Names without a number sort last; the name itself breaks ties.
    """
    number = re.search(r"\d+", file_name)
    return (int(number.group()) if number else float("inf"), file_name)


# Maps frame file name -> OCR text, inserted in ascending frame-number order so
# later steps see the frames in that order.
ocr_results = {}
for frame_name in sorted(os.listdir(processed_dir), key=_frame_sort_key):
    img_path = os.path.join(processed_dir, frame_name)
    # --psm 6 selects Tesseract's "single uniform block of text" page segmentation mode.
    text = pytesseract.image_to_string(img_path, config="--psm 6")
    ocr_results[frame_name] = text

if ocr_results:
    print("✅ OCR complete! Sample output:")
    print(next(iter(ocr_results.values())))
else:
    # Avoid an IndexError on the sample print when no frames were produced.
    print(f"No frames found in '{processed_dir}'; nothing to OCR.")

### **Step 7: Save Extracted Code from All Frames**

In [None]:
# Write every frame's OCR text to one file, each section headed by its frame
# name, then offer the file as a browser download.
extracted_sections = [
    f"### {frame_name} ###\n{frame_text}\n\n"
    for frame_name, frame_text in ocr_results.items()
]
with open("extracted_code.txt", "w") as extracted_file:
    extracted_file.writelines(extracted_sections)

files.download("extracted_code.txt")

### **Step 8: Clean and Deduplicate Extracted Code**

In [None]:
def clean_and_deduplicate_code(code_dict, similarity_threshold=0.9):
    """Merge OCR text from all frames into one de-duplicated, PEP 8-formatted string.

    Lines are visited in the iteration order of ``code_dict``. Blank and
    whitespace-only lines are dropped. A line is kept only if it is not an
    exact repeat of an earlier line and its ``difflib.SequenceMatcher`` ratio
    against every previously kept line is at most ``similarity_threshold``.

    Args:
        code_dict: Mapping of frame name to the text extracted from that frame.
            Only the values are used.
        similarity_threshold: Ratio above which a line counts as a near-duplicate
            of a kept line. Defaults to 0.9.

    Returns:
        The kept lines joined with newlines and passed through
        ``autopep8.fix_code``.
    """
    unique_lines = []
    seen_exact = set()
    for code in code_dict.values():
        for line in code.split('\n'):
            if not line.strip():
                continue
            # An exact repeat was already kept or already rejected, and the kept
            # list only grows, so the outcome is the same without rescanning.
            if line in seen_exact:
                continue
            seen_exact.add(line)
            is_near_duplicate = any(
                difflib.SequenceMatcher(None, line, kept).ratio() > similarity_threshold
                for kept in unique_lines
            )
            if not is_near_duplicate:
                unique_lines.append(line)

    formatted_code = "\n".join(unique_lines)
    return autopep8.fix_code(formatted_code)

# Clean the OCR text from every frame, write it to a .py file, and offer that
# file as a browser download.
final_clean_code = clean_and_deduplicate_code(ocr_results)

clean_code_path = "final_clean_code.py"
with open(clean_code_path, "w") as clean_code_file:
    clean_code_file.write(final_clean_code)

files.download(clean_code_path)
print("Final cleaned and deduplicated code saved!")

### **Step 9: Generate a PDF Report**

In [None]:
pdf_path = "coding_analysis.pdf"
PREVIEW_CHARS = 200  # Maximum characters of OCR text shown per frame

c = canvas.Canvas(pdf_path, pagesize=letter)
c.setFont("Helvetica", 10)

y = 750
# NOTE(review): the emoji may not render in the built-in Helvetica font — confirm.
c.drawString(30, y, "📊 Coding Analysis Report")
y -= 20
c.drawString(30, y, "Extracted Code:")
y -= 20

for frame, code in ocr_results.items():
    preview = code[:PREVIEW_CHARS]
    if len(code) > PREVIEW_CHARS:
        # Mark truncation only when text was actually cut off.
        preview += "..."
    # drawString places a single line of text and does not break on newlines,
    # so the multi-line OCR text is drawn one line at a time.
    report_lines = [f"Frame: {frame}"]
    report_lines.extend(line for line in preview.splitlines() if line.strip())
    for report_line in report_lines:
        if y < 50:
            # Start a new page; the font is set again after showPage().
            c.showPage()
            c.setFont("Helvetica", 10)
            y = 750
        c.drawString(30, y, report_line)
        y -= 15
    y -= 15  # Extra gap between frames

c.save()
files.download(pdf_path)
print("PDF Report ready for download!")


### **Step 10: Make Frames Downloadable**

In [None]:
# Archive base name -> folder to zip. shutil comes from the notebook's imports cell.
archive_sources = {
    "extracted_frames": output_dir,
    "processed_frames": processed_dir,
}

# Build both archives first, then offer each one as a browser download.
for archive_name, source_dir in archive_sources.items():
    shutil.make_archive(archive_name, "zip", source_dir)
for archive_name in archive_sources:
    files.download(f"{archive_name}.zip")
print("All files ready for download!")