<a href="https://colab.research.google.com/github/EliasKng/BT-Code/blob/master/Chart_2_text_reduced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chart-2-text-reduced

This notebook provides the chart-to-text functionality for a single input (one chart at a time) rather than for a whole dataset.

It prepares the chart data, feeds the prepared values into the model, and returns the summary for the chart.

The goal is to provide a function:

 **createSummary(chartData: ChartData): string**

where the returned string is the summary

## Startup

### Installations & Imports

In [2]:
! pip install -U spacy
! python3 -m spacy download en_core_web_md



import os
import nltk
import re
from typing import List
import spacy

# Load the spaCy pipeline once at start-up; preprocessData() calls `nlp(title)` and
# hands the resulting `doc.ents` to getSubject().
nlp = spacy.load('en_core_web_md')
# Download the 'punkt' tokenizer data for NLTK (word_tokenize() below calls nltk.word_tokenize).
nltk.download('punkt')




Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[K     |████████████████████████████████| 45.7 MB 139 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Utility functions

In [7]:
def word_tokenize(string: str, language: str = "english") -> List[str]:
    """Tokenize ``string`` with NLTK while keeping a few fragile patterns in one piece.

    Before NLTK runs, decimals, initials and names containing an apostrophe are
    marked with a temporary ``_`` so the tokenizer does not split them; the
    markers are stripped again afterwards. Number ranges/scores such as
    ``12-3`` are spread out to ``12 - 3``.

    :param string: String to tokenize.
    :param language: Language. Either one of ``english'' or ``german''.
    :return: list of token strings.
    :raises ValueError: if ``language`` is neither of the two supported values.
    """
    if language not in ("english", "german"):
        raise ValueError("language argument has to be either ``english'' or ``german''")

    # collapse every run of whitespace into a single blank
    text = re.sub(r"\s+", " ", string)

    # typographic quotes -> plain ASCII quotes
    for fancy, plain in (("’", "'"), ("”", '"'), ("“", '"')):
        text = text.replace(fancy, plain)

    # decimals: 1.3 -> 1._3 (temporary marker, removed after tokenization)
    text = re.sub(r"(\d+)\.(\d+)", r"\g<1>._\g<2>", text)

    # word glued to a fraction: below.500 -> below ._500
    text = re.sub(r"(\w+)\.(\d+)", r"\g<1> ._\g<2>", text)

    # sentence end directly followed by an opening quote
    text = text.replace(".``", ". ``")

    # apostrophe in front of a number (e.g. '90)
    text = re.sub(r"\s'(\d+)", r"' \g<1>", text)

    # two initials in a row (e.g. C. J. Miles) get the temporary marker as well
    text = re.sub(r"(^|\s)(\w)\. (\w)\.", r"\g<1>\g<2>._ \g<3>._", text)

    # detach runs of dots from the preceding word
    text = text.replace("..", " ..")

    # apostrophe inside a name (not a contraction/possessive): mark it temporarily
    text = re.sub(r"\w+'(?!d|s|ll|t|re|ve|\s)", r"\g<0>_", text)

    # win-loss scores / ranges written XX-YY are spread out to "XX - YY"
    # (XX:YY is left alone because it is also the time-of-day notation)
    text = re.sub(r"(\d+)-(\d+)", r"\g<1> - \g<2>", text)

    # the actual tokenization
    rejoined = " ".join(nltk.word_tokenize(text, language=language))

    # drop the temporary markers again: first the apostrophe ones, then the dot ones
    return rejoined.replace("'_", "'").replace("._", ".").split(" ")

def cleanAxisLabel(label):
    """Normalise an axis/column label for use inside a data record.

    Every whitespace character becomes an underscore and the characters
    ``%`` and ``*`` are removed.

    :param label: column label; non-string labels (e.g. the integer 2019) are
        converted with ``str()`` first.
    :return: the cleaned label as a string.
    """
    # raw string: '\s' in a normal literal is an invalid escape sequence
    cleanLabel = re.sub(r'\s', '_', str(label))
    return cleanLabel.replace('%', '').replace('*', '')
  
def cleanAxisValue(value):
    """Normalise a single table cell for use inside a data record.

    The placeholders ``'-'`` and ``'nan'`` are mapped to ``'0'``. Otherwise
    every whitespace character becomes an underscore and the characters
    ``|``, ``,``, ``%`` and ``*`` are removed (``|`` is the field separator
    of a record).

    :param value: cell value; non-string values are converted with ``str()``
        first, so a float NaN is treated like the string ``'nan'``.
    :return: the cleaned value as a string.
    """
    value = str(value)
    if value in ('-', 'nan'):
        return '0'
    # raw string: '\s' in a normal literal is an invalid escape sequence
    cleanValue = re.sub(r'\s', '_', value)
    for unwanted in ('|', ',', '%', '*'):
        cleanValue = cleanValue.replace(unwanted, '')
    return cleanValue

def is_number(string):
    """Tell whether ``string`` can be converted by ``float()``.

    Note that this follows ``float()`` exactly, so ``'nan'`` and ``'inf'``
    count as numbers too.

    :param string: value to test, normally a str.
    :return: True if ``float(string)`` succeeds, otherwise False. Values of an
        unsupported type (e.g. ``None`` or a list) yield False instead of
        raising ``TypeError``.
    """
    try:
        float(string)
    except (TypeError, ValueError):
        return False
    return True

def openMultiColumnData(df):
    """Return ``(df.columns, df.shape[0])``, i.e. the column labels and the row count."""
    return df.columns, df.shape[0]
  
def getSubject(titleTokens, nerEntities):
    """Extract subject and date entities from a chart title.

    :param titleTokens: list of title tokens (strings).
    :param nerEntities: iterable of entity objects exposing ``label_`` and
        ``text`` (preprocessData passes ``doc.ents`` of the spaCy document).
    :return: tuple ``(entities, cleanTitle)``. ``entities`` is a dict with the
        keys ``'Subject'`` and ``'Date'`` (lists of strings); ``cleanTitle`` is
        ``titleTokens`` without filler words. ``entities['Subject']`` stays
        empty when neither NER nor the capitalisation heuristic finds anything.
    """
    fillers = ['in', 'the', 'and', 'or', 'an', 'as', 'can', 'be', 'a', ':', '-',
              'to', 'but', 'is', 'of', 'it', 'on', '.', 'at', '(', ')', ',', ';']
    entities = {'Subject': [], 'Date': []}

    # manually find dates, it performs better than using NER
    for word in titleTokens:
        if word.isnumeric():
            candidate = word
        elif word.replace('/', '').isnumeric():
            # e.g. 2008/2019 -> keep the first part
            candidate = word.split('/')[0]
        elif word.replace('-', '').isnumeric():
            # e.g. 2008-2019 -> keep the first part
            candidate = word.split('-')[0]
        else:
            continue
        # only numbers with at least four digits are treated as a date
        if len(candidate) > 3:
            entities['Date'].append(candidate)

    # get named entities from title
    for X in nerEntities:
        if X.label_ in ('GPE', 'ORG', 'NORP', 'LOC'):
            cleanSubject = [word for word in X.text.split() if word.isalpha() and word not in fillers]
            if cleanSubject:
                entities['Subject'].append(' '.join(cleanSubject))
        # NER dates are only a fallback for when the manual search found nothing
        if not entities['Date'] and X.label_ == 'DATE' and X.text.isnumeric():
            entities['Date'].append(X.text)

    # guess subject if NER doesn't find one
    if not entities['Subject']:
        # word[:1] instead of word[0] so that an empty token cannot raise IndexError
        uppercaseWords = [word for word in titleTokens if word[:1].isupper()]
        if len(uppercaseWords) > 1:
            # skip the first capitalised word and join the rest
            entities['Subject'].append(' '.join(uppercaseWords[1:]))
        elif uppercaseWords:
            entities['Subject'].append(uppercaseWords[0])
        # no capitalised word at all: leave the subject list empty instead of crashing

    cleanTitle = [titleWord for titleWord in titleTokens if titleWord.lower() not in fillers]
    return entities, cleanTitle


## Cleaning Dataset

### Titles

In [8]:
def clean_title(title):
  """Tokenize a chart title and return it as one cleaned, space-joined string.

  A year range that forms the last three tokens (``2009 - 2016``) is rewritten
  to ``2009 to 2016`` and every ``*`` is removed.

  :param title: raw chart title.
  :return: the cleaned title string.
  """
  tokens = word_tokenize(title)
  # Only rewrite when the title really ends in "<number> - <number>"; the length
  # check keeps titles with fewer than three tokens from raising IndexError.
  if len(tokens) >= 3 and tokens[-2] == '-' and tokens[-3].isnumeric() and tokens[-1].isnumeric():
    tokens[-2] = 'to'
  return ' '.join(tokens).replace('*', '')

### Preprocessing
- Converts data tables into a sequence of records (taken as input by the model): `data/*split*/trainData.txt`
- Cleans summary tokens and substitutes any possible tokens with data variables(e.g., 2018 -> templateValue[0][0]): `data/*split*/trainSummary.txt`
- Cleans the title tokens: `data/*split*/trainTitle.txt`
- Labels the occurrences of records mentioned within the summary: `data/*split*/trainDataLabel.txt`
- Labels the summary tokens which match a record: `data/*split*/trainSummaryLabel.txt`
- Saves the gold summaries: `data/*split*/testOriginalSummary.txt`

In [16]:
def preprocessData(df, title, chartType = 'bar_chart'):
  # """
  # df is an df containing the data
  # title is a string which is the cleanedTitle from clean_title()
  # chart_type is a string: ('line_chart' | 'bar_chart')
  # """

  # cols = df.columns
  # size = df.shape[0]
  # cleanCols = [cleanAxisLabel(axis) for axis in cols]
  
  # dataLine = ''
  # colData = []
  
  # for col in df:
  #   vals = df[col].values
  #   cleanVals = [cleanAxisValue(str(value)) for value in vals]
  #   colData.append(cleanVals)
  
  # for m in range(0,size):
  #   axisTypes = []
  #   records = []
  #   dataLabels = []
  #   for axis, n in zip(cols, range(cols.size)):
  #     if is_number(axis[0]):
  #       axisTypes.append('numerical')
  #     else:
  #       axisTypes.append('categorical')
  #     value = str(df.at[m, axis])
  #     cleanValue = cleanAxisValue(value)
  #     record = f"{cleanCols[n]}|{cleanValue}|{n}|{chartType}"
  #     dataLine += f'{record} '
  #     dataLabels.append([0 for item in range(size)])
  dataArr = []
  dataLabelArr = []
  summaryArr = []
  summaryLabelArr = []
  labelList = []
  titleArr = []
  oldSummaryArr = []

  dataRatioArr = []
  captionRatioArr = []

  #assert len(captionFiles) == len(dataFiles) == len(titleFiles)
  #print(len(captionFiles), len(dataFiles), len(titleFiles))
  # may implemented seperately to avoid accidentally ignoring the word rather than month
  months = ['january', 'february', 'march', 'april', 'june', 'july', 'august', 'september', 'november', 'december']

  years = [str(i) for i in range(1850, 2055)]

  fillers = ['in', 'the', 'and', 'or', 'an', 'as', 'can', 'be', 'a', ':', '-',
            'to', 'but', 'is', 'of', 'it', 'on', '.', 'at', '(', ')', ',', ';']
  
  numbers = ['percent', 'percentage', '%', 'hundred', 'thousand', 'million', 'billion', 'trillion',
            'hundreds', 'thousands', 'millions', 'billions', 'trillions']
  
  positiveTrends = ['increased', 'increase', 'increasing', 'grew', 'growing', 'rose', 'rising', 'gained', 'gaining']
  negativeTrends = ['decreased', 'decrease', 'decreasing', 'shrank', 'shrinking', 'fell', 'falling', 'dropped',
                    'dropping']
  
  simpleChartTypes = []
  complexChartTypes = []

  caption = ''
  cols, size = openMultiColumnData(df)
  complexChartTypes.append(chartType)
  cleanCols = [cleanAxisLabel(axis) for axis in cols]
  dataLine = ''
  summaryLabelLine = ""
  colData = []
  for col in df:
      vals = df[col].values
      cleanVals = [cleanAxisValue(str(value)) for value in vals]
      colData.append(cleanVals)
  # iterate through each table row
  for m in range(0, size):
      axisTypes = []
      #rowData = []
      records = []
      dataLabels = []
      for axis, n in zip(cols, range(cols.size)):
          if is_number(axis[0]):
              axisTypes.append('numerical')
          else:
              axisTypes.append('categorical')
          value = str(df.at[m, axis])
          cleanValue = cleanAxisValue(value)
          #rowData.append(cleanValue)
          record = f"{cleanCols[n]}|{cleanValue}|{n}|{chartType}"
          dataLine += f'{record} '
          dataLabels.append([0 for item in range(size)])
  captionSentences = caption.split(' . ')
  if len(captionSentences) >= 4:
      trimmedCaption = (' . ').join(captionSentences[0:3]) + ' .\n'
  else:
      trimmedCaption = (' . ').join(captionSentences)
  captionTokens = trimmedCaption.split()

  labelMap = []
  captionMatchCount = 0
  doc = nlp(title)
  entities, cleanTitle = getSubject(title.split(), doc.ents)

  parallelData = []
  for token, m in zip(captionTokens, range(0, len(captionTokens))):
      # check for duplicates before token replacement
      if m < len(captionTokens) - 1:
          if captionTokens[m] == captionTokens[m + 1]:
              captionTokens.pop(m + 1)
      if token.lower() not in fillers:
          # find labels for summary tokens, call function to replace token with template
          tokenBool, newToken = compareMultiColumnToken(captionTokens, m, cleanTitle, colData,
                                                        cleanCols, entities)
          if tokenBool == 1:
              #print(newToken)
              captionTokens[m] = newToken
              captionMatchCount += 1
      else:
          tokenBool = 0
      # check for duplicates after token replacement
      if m > 0:
          if captionTokens[m - 1] == captionTokens[m]:
              captionTokens.pop(m)
          # check if last token was an un-templated month
          elif captionTokens[m].lower() in months or captionTokens[m] == 'May':
              captionTokens.pop(m)
      else:
          print(token)
          tokenBool = 0
      labelMap.append(str(tokenBool))
  assert len(captionTokens) == len(labelMap)
  # replace tokens with their parallel templates if they exist
  # ex: in 2019 sales was 300 million -> in templateXValue[max] sales was templateYValue[idxmax(x)] million
  if len(parallelData) > 0:
      for item in parallelData:
          template = item[0]
          axis = item[1]
          tokenIndex = item[2]
          try:
              labelMap[tokenIndex] = '1'
              captionTokens[tokenIndex] = template
          except IndexError:
              # TODO find out if this means that any time a pop occurs the replacement is misaligned,
              # maybe track the # of pops and subtract that from tokenIndex
              # this happens twice due to popping values and changing length of list
              print('index error')
              tokenIndex = len(labelMap) - 1
              labelMap[tokenIndex] = '1'
              captionTokens[tokenIndex] = template
  # check for sentences containing a delta value
  newSentences = []
  cleanSentences = ' '.join(captionTokens).split(' . ')
  for sentence, sentIdx in zip(cleanSentences, range(len(cleanSentences))):
      scaleIndicator = False
      trendIndicator = False
      newSentence = []
      for token, tokenIdx in zip(sentence.split(), range(len(sentence))):
          if token == 'templateScale':
              try:
                  scale = captionSentences[sentIdx].split()[tokenIdx]
                  if scale in numbers:
                      scaleIndicator = True
              except:
                  print('scale err')
          if token.lower() in positiveTrends:
              token = 'templatePositiveTrend'
              trendIndicator = True
          elif token.lower() in negativeTrends:
              token = 'templateNegativeTrend'
              trendIndicator = True
          # if there is an unlabelled numeric token in a sentence containing a trend word, assume that token is a delta between two values
          """
          if trendIndicator:
              if token not in years:
                  if is_number(token):
                      sentenceTemplates = [token for token in sentence.split() if 'template' in token]
                      xCount = {token for token in sentenceTemplates if 'templateXValue' in token}
                      yCount = {token for token in sentenceTemplates if 'templateYValue' in token}
                      # also compare 1 x and 1 y s
                      if len(xCount) == 2 or len(yCount) == 2 or (len(xCount) == 1 and len(yCount) == 1):
                          values, indices = getTemplateValues(xCount, yCount, xValueArr, yValueArr)
                          if len(values) > 1:
                              print(token, tokenIdx)
                              print(sentence)
                              print(xValueArr)
                              print(yValueArr)
                              print(xCount, values)
                              print(scale)
                              if scaleIndicator and (scale == 'percent' or scale == 'percentage'):
                                  valueDiff = abs((float(values[1]) - float(values[0]) / float(values[0])) * 100)
                                  rounded1 = abs(normal_round(valueDiff))
                                  rounded2 = abs(normal_round(valueDiff, 1))
                                  print(f'original: {token}, diff:{valueDiff} rounded:{rounded1, rounded2}')
                              else:
                                  valueDiff = abs(float(values[0]) - float(values[1]))
                                  rounded1 = abs(normal_round(valueDiff))
                                  rounded2 = abs(normal_round(valueDiff, 1))
                                  print(f'original: {token}, diff:{valueDiff} rounded:{rounded1, rounded2}')
                              if rounded1 == float(token) or rounded2 == float(token) or valueDiff == float(token):
                                  token = f'templateDelta[{indices[0]},{indices[1]}]'
                                  print('DELTA')"""
          newSentence.append(token)
      newSentences.append(' '.join(newSentence))
  assert len(captionTokens) == len(labelMap)
  dataLabelLine = (' ').join([' '.join([str(item) for item in column]) for column in dataLabels])
  labelCount = sum([len(column) for column in dataLabels])
  assert len(dataLabelLine.split()) == labelCount
  dataMatchCount = sum([sum(column) for column in dataLabels])
  dataRatio = round(dataMatchCount / labelCount, 2)
  #captionRatio = round(captionMatchCount / len(captionTokens), 2)

  for col in colData:
      assert labelCount/len(colData) == len(col)
  dataRatioArr.append(dataRatio)
  #captionRatioArr.append(captionRatio)
  summaryLabelLine = (' ').join(labelMap)
  assert len(captionTokens) == len(summaryLabelLine.split())
  newCaption = (' . ').join(newSentences)
  oldSummaryArr.append(trimmedCaption)
  labelList.append(labelMap)
  dataArr.append(dataLine)
  dataLabelArr.append(dataLabelLine)
  summaryArr.append(newCaption)
  summaryLabelArr.append(summaryLabelLine)
  titleArr.append(title)
  print('dataLine')
  print(dataLine)
  
  print(dataArr)
  print(dataLabelArr)
  print(summaryArr)
  print(summaryLabelArr)
  print(titleArr)

## Testing

In [17]:
# Sample chart titles.
# NOTE(review): this list is not used anywhere in this cell - presumably meant for clean_title(); confirm or remove.
testTitles = ['Facebook: number of monthly active users worldwide 2008-2019','National Basketball Association all-time scoring leaders 1946-2020','Instagram accounts with the most followers worldwide 2020']


# pandas is only needed for the test table below.
# NOTE(review): consider moving this import to the imports cell at the top of the notebook.
import pandas as pd
  
# Test table: one [category, count] pair per row.
data = [['White', 457], ['Black', 223], ['Hispanic', 179], ['Other', 44], ['Unknown', 84]]
  
# Build the DataFrame.
# NOTE(review): the column names 'Name' and 'Age' look like placeholders for this category/count data.
df = pd.DataFrame(data, columns = ['Name', 'Age'])
  
# Show the input table.
print(df)

# Run the preprocessing on the test table with a dummy title and the default chart type.
preprocessData(df, 'This is just a testTitle')

       Name  Age
0     White  457
1     Black  223
2  Hispanic  179
3     Other   44
4   Unknown   84
captionMatch
0
dataMatch
0
[]
[]
[]
[]
[]
