In [1]:
from PIL import Image
from paddleocr import PaddleOCR
import numpy as np 
import re
from difflib import get_close_matches

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class MatchFactsAnalyzer:
    """Extract the score and per-team statistics from FIFA "Match Facts" screenshots.

    Workflow: call ``perform_ocr`` once per screenshot type, then query the
    cached OCR result with ``get_score`` / ``get_*_stats``.

    The parsing relies on the layout of the screen: statistic names are
    horizontally centred, and the two team values sit on the same row at
    roughly equal distances to the left and right of the centre line.

    Boxes are treated as rows of ``[x_min, y_min, x_max, y_max]`` (columns 0/2
    are compared against the image width, columns 1/3 are used as the
    vertical extent).
    """

    # "<left> : <right>" at the start of a text; groups capture both numbers.
    _SCORE_PATTERN = re.compile(r'(\d+)\s*:\s*(\d+)')

    def __init__(self, image_width: int, image_height: int):
        """Store the screenshot size and create the OCR engine.

        Args:
            image_width: width in pixels of the FIFA window screenshot.
            image_height: height in pixels of the FIFA window screenshot.
        """
        self.image_width = image_width # size of FIFA window
        self.image_height = image_height # size of FIFA window
        self.res = {}  # image_type -> last OCR result stored by perform_ocr
        # Expected statistic labels for each supported screenshot type.
        self.stat_names = {
            'matchfacts-summary': ['Possession %', 'Shots', 'Expected Goals', 'Passes', 'Tackles', 'Tackles Won', 'Interceptions', 'Saves', 'Fouls Commited', 'Offsides', 'Corners', 'Free Kicks', 'Penalty Kicks', 'Yellow Cards', 'Red Cards'],
            'matchfacts-possession': ["15'", "30'", "45'", "60'", "75'", "90'"],
            'matchfacts-passing': ['Total Passes', 'Completed', 'Intercepted', 'Offside', 'Ground', 'Lob', 'Through', 'Lofted Through', 'Cross', 'Set Pieces', 'Key Pass', 'First Time', 'Pass and Go', 'Down Wing', 'Breakaway']
        }
        # Initialize PaddleOCR instance
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False)

    def _result(self, image_type: str):
        """Return the cached OCR result for ``image_type``.

        Raises:
            KeyError: if ``perform_ocr`` has not stored a result for it yet.
        """
        try:
            return self.res[image_type]
        except KeyError:
            raise KeyError(
                f"no OCR result stored for '{image_type}'; call perform_ocr() first"
            ) from None

    def get_boxes(self, image_type: str) -> np.ndarray:
        """Return the recognised text boxes as an ``(n, 4)`` array."""
        # TODO: Make image_type an enum
        boxes = np.asarray(self._result(image_type)['rec_boxes'])
        if boxes.size == 0:
            # Keep the array two-dimensional so column indexing works with no detections.
            return boxes.reshape(0, 4)
        return boxes

    def get_texts(self, image_type: str) -> np.ndarray:
        """Return the recognised texts as an object array (supports boolean masking)."""
        return np.array(self._result(image_type)['rec_texts'], dtype=object)

    def get_confs(self, image_type: str) -> np.ndarray:
        """Return the recognition scores as an array."""
        return np.array(self._result(image_type)['rec_scores'])

    def is_cross_x_mid(self, boxes: np.ndarray) -> np.ndarray:
        """Return a boolean mask of the boxes that span the vertical centre line."""
        # We use the fact that statistics are centered on screen
        x_mid = self.image_width // 2
        return np.logical_and(boxes[:, 0] <= x_mid, x_mid <= boxes[:, 2])

    def find_closest_opposites_to_target(self, nums: np.ndarray, target: int, tolerance: float):
        """Find one value below and one above ``target`` at nearly equal distances.

        Pairs are tried closest-first (lower side outermost), and the first pair
        whose distances differ by at most ``tolerance`` relative to the larger
        distance is returned.

        Args:
            nums: 1-D array of values.
            target: the value the pair must straddle.
            tolerance: maximum allowed relative difference between the two distances.

        Returns:
            ``(lower_index, upper_index)`` into ``nums``.

        Raises:
            ValueError: if one side of the target is empty or no pair is within tolerance.
        """
        nums = np.asarray(nums)

        # Partition into values on either side of the target
        lower_indices = np.where(nums < target)[0]
        upper_indices = np.where(nums > target)[0]

        if lower_indices.size == 0 or upper_indices.size == 0:
            raise ValueError("Array must contain values both less than and greater than the target.")

        # Sort each by proximity to the target
        lower_sorted = lower_indices[np.argsort(np.abs(nums[lower_indices] - target))]
        upper_sorted = upper_indices[np.argsort(np.abs(nums[upper_indices] - target))]

        # Try all combinations, return first pair within relative tolerance.
        # Distances are strictly positive (strict comparisons above), so the
        # division below cannot hit zero.
        for i in lower_sorted:
            dist_i = abs(nums[i] - target)
            for j in upper_sorted:
                dist_j = abs(nums[j] - target)
                rel_diff = abs(dist_i - dist_j) / max(dist_i, dist_j)

                if rel_diff <= tolerance:
                    return int(i), int(j)

        raise ValueError("No values on opposite sides of target with approximately equal relative distances found.")

    def perform_ocr(self, image_path: str, image_type: str, output_dir: str = "output"):
        """Run OCR on ``image_path`` and cache the result under ``image_type``.

        Each result's visualisation and JSON dump are written to ``output_dir``.
        If the engine yields several results, the last one is the one kept.
        """
        # TODO: make image_type an enum

        # Run OCR inference on an image
        result = self.ocr.predict(input=image_path)

        # Visualize the results and save the JSON results
        for res in result:
            res.save_to_img(output_dir)
            res.save_to_json(output_dir)
            self.res[image_type] = res

    def get_score(self) -> tuple:
        """Return the match score as ``(left_goals, right_goals)``.

        Reads the 'matchfacts-summary' OCR result.

        Raises:
            ValueError: if no centred text starts with a ``<n> : <m>`` score.
        """
        # Can use summary image (other images also display score)
        boxes = self.get_boxes('matchfacts-summary')
        texts = self.get_texts('matchfacts-summary')

        # We could search every single text, but let's optimize by searching along center of FIFA window
        for text in texts[self.is_cross_x_mid(boxes)]:
            found = self._SCORE_PATTERN.match(text)
            if found:
                # Parse the captured digits rather than splitting the raw text,
                # so trailing OCR noise after the score cannot break int().
                return int(found.group(1)), int(found.group(2))

        raise ValueError('unable to find score in matchfacts-summary')

    @staticmethod
    def _parse_stat_value(text: str) -> float:
        """Convert an OCR'd statistic value to float."""
        # due to poor OCR, a bar indicating match high can sometimes be interpreted as the character 'I'
        return float(text.strip().strip('I'))

    def get_stats(self, image_type: str, cutoff=0.9, tolerance=5e-2): # TODO: tune parameters?
        """Return ``{stat_name: [left_value, right_value]}`` for one screenshot type.

        Args:
            image_type: key of both ``self.stat_names`` and the cached OCR result.
            cutoff: minimum similarity for fuzzy-matching an expected stat name
                against the centred OCR texts (``difflib.get_close_matches``).
            tolerance: relative tolerance for the left/right values being
                equidistant from the centre line.

        Raises:
            ValueError: if a stat name cannot be matched, a row has fewer than
                two candidate values, no symmetric pair is found, or a value
                is not numeric.
        """
        # Need to use specific image
        boxes = self.get_boxes(image_type)
        texts = self.get_texts(image_type)
        stat_names = self.stat_names[image_type]

        # Match expected stat names to actual ones found in image. Track the
        # *index* of each matched box so every stat name stays paired with its
        # own row, whatever order the OCR engine reported the boxes in.
        center_indices = np.flatnonzero(self.is_cross_x_mid(boxes))
        center_texts = [texts[k] for k in center_indices]
        name_box_indices = []
        for sn in stat_names:
            ms = get_close_matches(word=sn, possibilities=center_texts, n=1, cutoff=cutoff)
            if not ms:
                raise ValueError(f"unable to match statistic name '{sn}' in '{image_type}'")
            name_box_indices.append(int(center_indices[center_texts.index(ms[0])]))

        # Only the matched label boxes are excluded from the value candidates;
        # a value box that happens to share a label's text is still usable.
        is_stat_name = np.zeros(len(texts), dtype=bool)
        is_stat_name[name_box_indices] = True

        tops, bottoms = boxes[:, 1], boxes[:, 3]
        x_mid = self.image_width // 2

        # Identify which boxes (referred to as "candidates" below) actually contain stats
        stats = {}
        for sn, name_idx in zip(stat_names, name_box_indices):
            # Candidates are the non-label boxes crossing the label's vertical midpoint.
            y_mid = boxes[name_idx, [1, 3]].mean()
            candidates = np.flatnonzero((tops <= y_mid) & (y_mid <= bottoms) & ~is_stat_name)
            if len(candidates) < 2:
                raise ValueError(f'unable to find two values for statistic: {sn}')

            # Usually, the stats will be organized like this:
            #           LEFT_TEAM_STAT               STAT_NAME               RIGHT_TEAM_STAT
            # But, there may be some extra boxes like this (which we must ignore):
            #  BOGUS    LEFT_TEAM_STAT    BOGUS      STAT_NAME               RIGHT_TEAM_STAT    BOGUS
            candidates_x_mid = boxes[candidates][:, [0, 2]].mean(axis=1)
            try:
                left_pos, right_pos = self.find_closest_opposites_to_target(
                    nums=candidates_x_mid, target=x_mid, tolerance=tolerance)
            except ValueError as err:
                raise ValueError(f"unable to locate left/right values for statistic '{sn}': {err}") from err

            left_text, right_text = texts[candidates[left_pos]], texts[candidates[right_pos]]
            try:
                stats[sn] = [self._parse_stat_value(left_text), self._parse_stat_value(right_text)]
            except ValueError as err:
                raise ValueError(f"non-numeric value for statistic '{sn}': {err}") from err

        return stats

    def get_summary_stats(self) -> dict:
        """Return the statistics of the 'matchfacts-summary' screenshot."""
        return self.get_stats('matchfacts-summary')

    def get_passing_stats(self) -> dict:
        """Return the statistics of the 'matchfacts-passing' screenshot."""
        return self.get_stats('matchfacts-passing')

    def get_possession_stats(self) -> dict:
        """Return the statistics of the 'matchfacts-possession' screenshot.

        Uses a lower name-matching cutoff (0.8) than the other screens.
        """
        return self.get_stats('matchfacts-possession', cutoff=0.8)
    

In [7]:
# Only the screenshot's dimensions are needed, so read them inside a context
# manager and release the file handle before the OCR models are loaded.
with Image.open('testdata/fifa22-matchfacts-summary.png') as img:
    image_width, image_height = img.size
a = MatchFactsAnalyzer(image_width=image_width, image_height=image_height)

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in C:\Users\aislam\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 161.45it/s]
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in C:\Users\aislam\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 166.82it/s]


In [8]:
# OCR the summary screenshot and cache the result under its image type.
a.perform_ocr(
    image_path='testdata/fifa22-matchfacts-summary.png',
    image_type='matchfacts-summary',
)

In [9]:
# Display the final score together with the statistics parsed from the
# summary screenshot (requires the 'matchfacts-summary' OCR result above).
a.get_score(), a.get_summary_stats()

((5, 2),
 {'Possession %': [52.0, 48.0],
  'Shots': [13.0, 14.0],
  'Expected Goals': [4.7, 4.5],
  'Passes': [106.0, 167.0],
  'Tackles': [39.0, 9.0],
  'Tackles Won': [16.0, 3.0],
  'Interceptions': [16.0, 9.0],
  'Saves': [5.0, 5.0],
  'Fouls Commited': [1.0, 0.0],
  'Offsides': [1.0, 0.0],
  'Corners': [1.0, 2.0],
  'Free Kicks': [0.0, 2.0],
  'Penalty Kicks': [0.0, 0.0],
  'Yellow Cards': [0.0, 0.0],
  'Red Cards': [0.0, 0.0]})

In [10]:
# OCR the possession screenshot, then display the parsed possession split
# for each 15-minute mark.
a.perform_ocr(
    image_path='testdata/fifa22-matchfacts-possession.png',
    image_type='matchfacts-possession',
)
a.get_possession_stats()

{"15'": [36.0, 64.0],
 "30'": [53.0, 47.0],
 "45'": [45.0, 55.0],
 "60'": [44.0, 56.0],
 "75'": [69.0, 31.0],
 "90'": [56.0, 44.0]}

In [11]:
# OCR the passing screenshot, then display the parsed passing statistics.
a.perform_ocr(
    image_path='testdata/fifa22-matchfacts-passing.png',
    image_type='matchfacts-passing',
)
a.get_passing_stats()

{'Total Passes': [106.0, 167.0],
 'Completed': [83.0, 144.0],
 'Intercepted': [8.0, 16.0],
 'Offside': [2.0, 0.0],
 'Ground': [57.0, 109.0],
 'Lob': [0.0, 3.0],
 'Through': [45.0, 26.0],
 'Lofted Through': [0.0, 15.0],
 'Cross': [0.0, 2.0],
 'Set Pieces': [1.0, 4.0],
 'Key Pass': [8.0, 17.0],
 'First Time': [24.0, 56.0],
 'Pass and Go': [1.0, 0.0],
 'Down Wing': [0.0, 3.0],
 'Breakaway': [12.0, 2.0]}