In [2]:
# Text: basic cleaning
import re

def remove_non_english(text):
    """Strip every run of non-ASCII characters from ``text``.

    A run is one or more consecutive characters outside the 7-bit ASCII
    range. When the character immediately after a run is a non-word
    character or an underscore (a space, a comma, ...), that single
    character is removed together with the run. All other ASCII content is
    left as is.

    Returns the cleaned string.
    """
    return re.sub(r'[^\x00-\x7F]+[\W_]?', '', text)

In [3]:
import json

input_json = "data/tracking_out_ocr_v3.json"
output_json = "data/tracking_out_ocr__v3_clean.json"

# Read the OCR texts: a JSON object whose values are the strings to clean.
with open(input_json, 'r') as src:
    texts = json.load(src)

# Clean every value; keys (and their order) are kept.
texts = {idx: remove_non_english(text) for idx, text in texts.items()}

# Write the cleaned mapping back out as JSON.
with open(output_json, 'w') as dst:
    json.dump(texts, dst)

In [None]:
# Text enrichment: append the speech-recognition (audio) transcript
from similarity.normalized_levenshtein import NormalizedLevenshtein

# Minimum normalized-Levenshtein similarity for the speech text to be accepted.
THRESHOULD = 0.3


def is_speech_correct(video_text, speech_text):
    """Tell whether ``speech_text`` is similar enough to ``video_text``.

    The two strings are compared with ``NormalizedLevenshtein.similarity``.
    Returns ``True`` when that score is at least ``THRESHOULD`` and ``False``
    otherwise.
    """
    score = NormalizedLevenshtein().similarity(video_text, speech_text)
    return score >= THRESHOULD

In [14]:
# Text: second-pass processing, rewrite the text with a seq2seq model
import os
import openai
from time import sleep
def improve_text(origin_text, max_chars=8000, delay_seconds=3):
    """Ask gpt-3.5-turbo to repair OCR errors and duplicated fragments in a text.

    Args:
        origin_text: Raw OCR text to rewrite.
        max_chars: Only the first ``max_chars`` characters are placed in the
            prompt (default 8000).
        delay_seconds: Pause, in seconds, before each request (default 3);
            presumably there to stay under the API rate limit.

    Returns:
        The rewritten text taken from the first choice of the response.
        The unmodified ``origin_text`` is returned instead when the input is
        empty or whitespace-only (no request is made in that case) or when
        the response does not carry a text answer.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; request errors are
        not caught here.
    """
    # Nothing to rewrite: skip the pause and the request, and do not give the
    # model the chance to invent content for an empty input.
    if not origin_text or not origin_text.strip():
        return origin_text

    sleep(delay_seconds)
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Truncate into a separate name so the fallback paths below can still
    # return the complete input rather than the truncated copy.
    prompt_text = origin_text[:max_chars]
    prompt = """
    The following text contains some OCR recognition errors and duplicate content when merging video frames.
    Now you need to:
    1. Correct the mistakes in words.
    2. Correct the grammatical errors in the text.
    3. When the paragraphs contain repeated sentences or similar sentences, please summarize this part.
    Note that all modifications keep the original vocabulary as much as possible, do not add new vocabulary when modifying.
   
    If all the above modifications are not possible for you, please output the original text.
    
    For example:
    Origin context: Thenmodellis very precise,lis very preciseis very preciseThenmodel ,Thenmodel is very precise,100%high o ualitysoft material,3\uff1a02, ualitysoft material100%high o uality100%high o ,100%high o uality soft material,ycarcary,\u7eff\u8272Green,y\u7070\u8272Gra\u7070\u8272Grav,\u7c89\u8272Pink,\u68d5\u8272Brown,\u7ea2\u8272Red,\u84dd\u8272Blue,\u7d2b\u8272Purple,\u73ab\u7470\u91d1Rose gold
    
    Rewritten context: The model is very precise. 100% high quality soft material. Easy to carry. Green, Gray, Pink, Brown, Red, Blue, Purple, Rose gold.
    
    Now, your turn:
    Origin context: {0}

    Rewritten context:
    """.format(prompt_text)
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[
            {'role': 'user', 'content': prompt},
        ],
        temperature=0.5,
    )
    try:
        new_text = response['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        # Unexpected response shape: missing key, empty ``choices`` list, or a
        # non-subscriptable part. Keep the caller's text.
        return origin_text
    # Never hand back a non-string (e.g. a null content field).
    return new_text if isinstance(new_text, str) else origin_text


In [77]:
# Smoke test: run improve_text on one noisy OCR string made of many garbled
# variants of the same phrase.
# NOTE(review): the output recorded for this cell contains sentences that do
# not appear in the input, so the prompt's "do not add new vocabulary"
# instruction is not reliably followed by the model.
improve_text("the,Livelhe,ileyoulove,nlove,Live the,thelife,ifeyou,Livethelifeyou loy,Livethe life youlove,Livethe life vou love,the lile youlove,Live theliteyou love,Live the lireyou,Livethe,Livet,ulove,11,thefeyuulove,olove,Livethelileyoulove,Live the life you love,Live the life,iove,L.ive the life yo,Live thet,Live thelifeyou love,ivethelifeyou love,thelifl,Yau love,helileyoulove")

"Live the life you love. Don't just exist, but truly live. Follow your passions and do what makes you happy. Embrace the life you have and make the most of it."

In [15]:
import json
from tqdm import tqdm
def gpt_improve_text(PATH, RES_PATH):
    """Rewrite every text stored in a JSON file and save the result.

    Args:
        PATH: JSON file holding an object whose values are the texts.
        RES_PATH: Destination JSON file for the rewritten texts.

    Each value is passed through ``improve_text`` (with a tqdm progress bar);
    keys are kept. The output file is written once, after all texts have
    been processed.
    """
    with open(PATH, 'r') as src:
        texts = json.load(src)

    improved = {key: improve_text(value) for key, value in tqdm(texts.items())}

    with open(RES_PATH, 'w') as dst:
        json.dump(improved, dst)


100%|██████████| 250/250 [32:19<00:00,  7.76s/it]


In [89]:
# Load the cleaned OCR texts into `texts` for the GPT rewrite loop.
with open('data/tracking_out_ocr__v2_clean.json', 'r') as clean_file:
    texts = json.load(clean_file)

In [88]:
# Keys of `texts` that have already been rewritten. The rewrite loop skips any
# key found here; re-running this cell empties the set, so every text would
# be processed again.
count_set = set()

In [91]:
from tqdm import tqdm
# Rewrite each text once; keys already recorded in count_set are skipped.
for idx, text in tqdm(texts.items()):
    if idx in count_set:
        continue
    texts[idx] = improve_text(text)
    # Recorded only after the call returns, so a key whose rewrite raised is
    # attempted again the next time this cell runs.
    count_set.add(idx)


100%|██████████| 250/250 [32:20<00:00,  7.76s/it] 


In [83]:
# Save the current `texts` mapping as JSON.
with open("data/tracking_out_ocr_gpt3.5_v4.json", 'w') as out_file:
    json.dump(texts, out_file)