# AI generated sports commentary \[[blog](https://medium.com/@chintan.t93)\] \[[video](https://www.youtube.com/watch?v=p9AmkiG8UeI)\] \[[code](https://github.com/ChintanTrivedi/football_ai_commentary)\]

This notebook is for training a GPT-2 language model to perform on-the-fly commentary for football/soccer games.


## Setup
Clone the GPT-2 repo, install dependencies and download the pre-trained model. 

In [0]:
# Clone the nshepperd/gpt-2 repository and make it the working directory.
!git clone https://github.com/nshepperd/gpt-2.git
%cd gpt-2
# Install the repository's requirements, then download the 345M model.
!pip3 install -r requirements.txt
!python3 download_model.py 345M
# Extra packages imported by the caption-download cell (webvtt, youtube_dl).
!pip install webvtt-py youtube_dl

## Train GPT-2

### Training Data 
- Uses YouTube videos of full FIFA and PES video-game matches whose audio includes in-game commentary.
- Currently uses manual or automatic captions obtained directly from YouTube (via `youtube_dl`).
- Captions are downloaded in WebVTT (.vtt) format and then converted to a plain-text transcript.
- Unfortunately, these captions do not contain any punctuation, so alternative speech-to-text options need to be explored (the ones I have tried so far do not work).


In [0]:
download_training_data = True # required only if you're looking to retrain

if download_training_data:
  # YouTube URLs of full FIFA/PES matches with uninterrupted game commentary.
  # NOTE: the URL ending in l3VQ1jC2Iyg is listed three times below; repeated
  # entries are dropped before downloading (see `unique_urls`).
  urls = [
      'https://www.youtube.com/watch?v=_QL2Vr-Rbhk',    'https://www.youtube.com/watch?v=vW2pn2LsrZU',    'https://www.youtube.com/watch?v=795ttHjcuNA',
      'https://www.youtube.com/watch?v=g8IVEuGy3dk',    'https://www.youtube.com/watch?v=4Kq4hoCWG4c',    'https://www.youtube.com/watch?v=43FbmrkHoiY',
      'https://www.youtube.com/watch?v=O6lVXP1XJrc',    'https://www.youtube.com/watch?v=51Lal4CqJfM',    'https://www.youtube.com/watch?v=-9JXEzCUmKE',
      'https://www.youtube.com/watch?v=r7bsamy9n5c',    'https://www.youtube.com/watch?v=tFf5HiuK6v0',    'https://www.youtube.com/watch?v=Na44QV_Q7ic',
      'https://www.youtube.com/watch?v=iKkkRqBL3pM',    'https://www.youtube.com/watch?v=NWBKXU5boRg',    'https://www.youtube.com/watch?v=A68hJll7Us4',
      'https://www.youtube.com/watch?v=Ch-1BQmTzWI',    'https://www.youtube.com/watch?v=67roKfGj_Fo',    'https://www.youtube.com/watch?v=Euei-fpFlrQ',
      'https://www.youtube.com/watch?v=2JS_6foNnP8',    'https://www.youtube.com/watch?v=pDsabB1HVAM',    'https://www.youtube.com/watch?v=6_h6FpHnuIs',
      'https://www.youtube.com/watch?v=PkSv__cAYfw',    'https://www.youtube.com/watch?v=gRUWMIL9l8g',    'https://www.youtube.com/watch?v=s-iqqWnmgkc',
      'https://www.youtube.com/watch?v=mOFAP0KF2x4',    'https://www.youtube.com/watch?v=E1I9eof_szE',    'https://www.youtube.com/watch?v=fZxanp6sP8c',
      'https://www.youtube.com/watch?v=l3VQ1jC2Iyg',    'https://www.youtube.com/watch?v=l3VQ1jC2Iyg',    'https://www.youtube.com/watch?v=l3VQ1jC2Iyg',
      'https://www.youtube.com/watch?v=qrKbmzYGLEM',    'https://www.youtube.com/watch?v=B8ImIuKQSHg',    'https://www.youtube.com/watch?v=xGh4GciXPQk',
      'https://www.youtube.com/watch?v=1ISKxiGw4K8',    'https://www.youtube.com/watch?v=ShkyV3DyTD4',    'https://www.youtube.com/watch?v=XYlDNfS48sg',
      'https://www.youtube.com/watch?v=_jT2uZLxA7o',    'https://www.youtube.com/watch?v=TNMG98EhKLU',    'https://www.youtube.com/watch?v=TRX-BmKdltY',
      'https://www.youtube.com/watch?v=nAEvAxInIV0',    'https://www.youtube.com/watch?v=RWaOoa0UMcI',    'https://www.youtube.com/watch?v=XJRLRxDC3x0',
      'https://www.youtube.com/watch?v=Ye4cVwfSdAc',    'https://www.youtube.com/watch?v=cF3junSjIAA',    'https://www.youtube.com/watch?v=d18-39m9NoE',
      'https://www.youtube.com/watch?v=rzOddgKPPtI',    'https://www.youtube.com/watch?v=px4EAg0Vbg4',    'https://www.youtube.com/watch?v=-DQuJTTcbdw',
      'https://www.youtube.com/watch?v=3_B3smljvkQ',    'https://www.youtube.com/watch?v=3us3vUoLkac',    'https://www.youtube.com/watch?v=ol4WW_IVGOQ',
      'https://www.youtube.com/watch?v=2cLT4mTqWz4',    'https://www.youtube.com/watch?v=TPZPB2uCh8k',    'https://www.youtube.com/watch?v=JuDngE09WhE',
      'https://www.youtube.com/watch?v=9rRp2gDE7Yc',    'https://www.youtube.com/watch?v=hqrlgNP7E-0',    'https://www.youtube.com/watch?v=fXhf9Yh7tf8',
      'https://www.youtube.com/watch?v=jV3FUzCqQAQ',    'https://www.youtube.com/watch?v=uhBw9DO-z7A',    'https://www.youtube.com/watch?v=rczhz6xeIfQ'
  ]

  import webvtt
  import requests
  import os
  import youtube_dl
  import re

  # Directory that holds the raw caption file of every video.
  os.makedirs('captions', exist_ok=True)

  # Matches bracketed caption annotations such as [Music] and [Applause].
  annotation_pattern = re.compile(r'\[.*?\]')

  # Drop repeated URLs (keeping first-seen order) so the same match is not
  # downloaded and written into the training file more than once.
  unique_urls = list(dict.fromkeys(urls))

  # The extractor options are the same for every video, so build it once.
  ydl = youtube_dl.YoutubeDL({'writesubtitles': True, 'allsubtitles': True, 'writeautomaticsub': True})

  # final training data file for all videos combined; the `with` block
  # guarantees it is flushed and closed even if a later video fails.
  with open("commentary_train.txt", "w", encoding="utf-8") as f_train:
    # download captions only (no video/audio) for all urls
    for ix, url in enumerate(unique_urls):
      try:
        res = ydl.extract_info(url, download=False)
      except youtube_dl.utils.DownloadError as err:
        # A removed/unavailable video should not abort the whole run.
        print('Could not read video info for {}: {}'.format(url, err))
        continue

      # `requested_subtitles` may be None, or may lack an 'en' entry when
      # the video only has captions in other languages.
      english = (res.get('requested_subtitles') or {}).get('en')
      if not english:
        print('Youtube Video does not have any english captions')
        continue

      print('Grabbing vtt file from ' + english['url'])
      try:
        response = requests.get(english['url'], timeout=60)
        response.raise_for_status()
      except requests.RequestException as err:
        print('Could not download captions for {}: {}'.format(url, err))
        continue

      # Store the payload byte-for-byte so no decode/encode round trip can
      # alter the caption text before webvtt parses it.
      caption_path = "captions/commentary{}.txt".format(ix)
      with open(caption_path, "wb") as f_caption:
        f_caption.write(response.content)

      # Report whether the English track is a manual one; `subtitles` holds
      # the manually provided tracks keyed by language.
      if 'en' in (res.get('subtitles') or {}):
        print('manual captions')
      else:
        print('automatic_captions')

      # convert downloaded webvtt file to plain text transcript
      lines = []
      for caption in webvtt.read(caption_path):
        lines.extend(caption.text.strip().splitlines())

      # Skip a line when it is identical to the one directly before it
      # (consecutive cues can carry the same text).
      deduped = []
      for line in lines:
        if not deduped or line != deduped[-1]:
          deduped.append(line)

      # Every kept line is prefixed with a single space, as before.
      transcript = "".join(" " + line for line in deduped)

      print(transcript)
      # strip [Music] and [Applause] style keywords that appear in youtube
      # captions before writing the transcript (one video per line)
      f_train.write(annotation_pattern.sub('', transcript) + '\n')

## Model Training
- Run the following cell to train again; skip it if you only want to use the previously trained checkpoint that is downloaded further below.

In [0]:
# Train the 345M model on the combined transcript file.
!PYTHONPATH=src ./train.py --dataset /content/gpt-2/commentary_train.txt --model_name '345M'
# Copy the contents of checkpoint/run1 into the 345M model directory.
!cp -r /content/gpt-2/checkpoint/run1/* /content/gpt-2/models/345M/
# Optional (disabled): copy the whole checkpoint directory to Google Drive;
# /content/drive is the path Drive is mounted at elsewhere in this notebook.
# !cp -r /content/gpt-2/checkpoint/ /content/drive/My\ Drive/
# Disabled duplicate of the active copy command above.
# !cp -r /content/gpt-2/checkpoint/run1/* /content/gpt-2/models/345M/

- Here, I'm just going to pull the model I've trained before from my Google Drive

In [0]:
# Mount Google Drive so the checkpoint archive can be copied onto it.
from google.colab import drive
drive.mount('/content/drive')
# Archive the files in checkpoint/run1, then copy that same archive to Drive.
# (The copy previously referenced run_2.zip, which this cell never creates.)
!zip run1.zip ./checkpoint/run1/*
!cp run1.zip /content/drive/My\ Drive/

In [0]:
# !cp -r /content/drive/My\ Drive/checkpoint/run1/* /content/gpt-2/models/345M/

In [0]:
%cd models/345M/
# Remove the existing `checkpoint` file before unpacking the downloaded archive.
!rm checkpoint
# Download run1.zip from Google Drive. The inner wget stores session cookies
# and sed pulls the `confirm=` token out of its response; the outer wget then
# repeats the request with that token and the saved cookies. The cookie file
# is deleted once the download succeeds.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1iTRezA2oe2B58wDx3FcZosqdcyD06Kwt' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1iTRezA2oe2B58wDx3FcZosqdcyD06Kwt" -O run1.zip && rm -rf /tmp/cookies.txt
!unzip run1.zip
# Go back up the two directory levels entered at the top of this cell.
%cd ../..

## Play with the model's conditional samples using a prompt
Example prompts:
- That was a really poor pass. You expect a player of his quality to do better.
- What a goal! Wonderful strike from the team captain.
- Welcome to today's game between two top teams in Europe.

In [0]:
# Launch the interactive conditional-sampling script for the 345M model with top_k set to 40.
!python3 src/interactive_conditional_samples.py --top_k 40 --model_name "345M"

# EXPERIMENTAL CODE

## Freeze model checkpoint to pb file for real-time inference

In [0]:
# import fire
# import json
# import os
# import numpy as np
# import tensorflow as tf

# import model, sample, encoder

# seed=None
# length=40
# temperature=1
# top_k=0

# hparams = model.default_hparams()
# with open('models/345M/hparams.json') as f:
#   hparams.override_from_dict(json.load(f))

# with tf.Session(graph=tf.Graph()) as sess:
#   context = tf.placeholder(tf.int32, [1, None])
#   np.random.seed(seed)
#   tf.set_random_seed(seed)
#   output = sample.sample_sequence(
#       hparams=hparams, length=length,
#       context=context,
#       batch_size=1,
#       temperature=temperature, top_k=top_k
#   )

#   saver = tf.train.Saver()
#   ckpt = tf.train.latest_checkpoint(os.path.join('models', '345M'))
#   saver.restore(sess, ckpt)
  
#   print([n.name for n in tf.get_default_graph().as_graph_def().node])
  
#   # Freeze the graph
#   frozen_graph_def = tf.graph_util.convert_variables_to_constants(sess,sess.graph_def,[output.name])
  
#   # Save the frozen graph
#   with open('output_graph.pb', 'wb') as f:
#     f.write(frozen_graph_def.SerializeToString())

## Alternate speech to text methods

In [0]:
# !pip install youtube_dl
# import youtube_dl

# from __future__ import unicode_literals
# import youtube_dl


# ydl_opts = {
#     'format': 'bestaudio/best',
#     'postprocessors': [{
#         'key': 'FFmpegExtractAudio',
#         'preferredcodec': 'wav',
#         'preferredquality': '192',
#     }],
# }
# with youtube_dl.YoutubeDL(ydl_opts) as ydl:
#     ydl.download(['https://www.youtube.com/watch?v=vW2pn2LsrZU'])

# !pip install SpeechRecognition

# import speech_recognition as sr
# r = sr.Recognizer()
# with sr.AudioFile("FIFA 17 _ FC Bayern Munich vs FC Barcelona - Full Gameplay (PS4_Xbox One)-vW2pn2LsrZU.wav") as source:
#     audio = r.record(source)

# try:
#     s = r.recognize_google(audio)
#     print("Text: "+s)
# except Exception as e:
#     print("Exception: "+str(e))