<a href="https://colab.research.google.com/github/EliasKng/BT-Code/blob/master/Chart_2_text_reduced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chart-2-text-reduced

This notebook provides the chart-to-text functionality, but for single inputs: one chart at a time.

It prepares the data, feeds the values into the model, and returns the summary for the chart.

The goal is to provide a function:

 **createSummary(chartData: ChartData): string**

where the returned string is the summary

## Startup

### Installations & Imports

In [2]:
! pip install -U spacy
! python3 -m spacy download en_core_web_md



import os
import nltk
import re
from typing import List
import spacy

# Load the medium English spaCy pipeline fetched by the download command above.
nlp = spacy.load('en_core_web_md')
# Fetch NLTK's 'punkt' tokenizer data; nltk.word_tokenize is called by word_tokenize below.
nltk.download('punkt')




Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[K     |████████████████████████████████| 45.7 MB 139 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Utility functions

In [7]:
def word_tokenize(string: str, language: str = "english") -> List[str]:
    """Tokenize a string into a list of token strings.

    The text is first normalised by a fixed sequence of regex/replace
    rewrites, then passed to ``nltk.word_tokenize``, and finally the temporary
    ``_`` markers inserted by the rewrites are removed again. The order of the
    rewrites matters because each one runs on the output of the previous one.

    :param string: String to tokenize.
    :param language: Language. Either one of ``english'' or ``german''.
    :return: List of tokens, obtained by joining the NLTK tokens with single
        spaces, removing the temporary markers and splitting on single spaces.
    :raises ValueError: If ``language`` is neither ``english'' nor ``german''.
    """
    if language not in ["english", "german"]:
        raise ValueError("language argument has to be either ``english'' or ``german''")

    # collapse every run of whitespace into a single space
    string = re.sub(r"\s+", " ", string)

    # normalise typographic apostrophes/quotes to their ASCII forms
    string = string.replace("’", "'")
    string = string.replace("”", '"')
    string = string.replace("“", '"')

    # floating point: insert a temporary "_" after the dot (e.g., 1.3 => 1._3);
    # the marker is removed again after tokenization
    string = re.sub(r"(\d+)\.(\d+)", r"\g<1>._\g<2>", string)

    # word directly followed by a dot and digits: split off the fraction and
    # mark its dot (e.g., below.500 => below ._500)
    string = re.sub(r"(\w+)\.(\d+)", r"\g<1> ._\g<2>", string)

    # end of quote: separate a period from a directly following `` (.`` => . ``)
    string = string.replace(".``", ". ``")

    # number with apostrophe (e.g. '90): the whitespace before the apostrophe is
    # dropped and a space is inserted after it (" '90" => "' 90")
    # NOTE(review): this attaches the apostrophe to the preceding text — confirm intended
    string = re.sub(r"\s'(\d+)", r"' \g<1>", string)

    # names with initials: mark both dots (e.g. C. J. Miles => C._ J._ Miles)
    string = re.sub(r"(^|\s)(\w)\. (\w)\.", r"\g<1>\g<2>._ \g<3>._", string)

    # put a space in front of each (non-overlapping) pair of consecutive dots
    string = string.replace("..", " ..")

    # names with apostrophe => expands temporarily: an apostrophe that is not
    # followed by d, s, ll, t, re, ve or whitespace gets a "_" appended
    # (e.g. O'Neal => O'_Neal)
    string = re.sub(r"\w+'(?!d|s|ll|t|re|ve|\s)", r"\g<0>_", string)

    # win-loss scores / digit ranges: put spaces around a hyphen between two
    # digit groups (e.g. 2008-2019 => 2008 - 2019).
    # NOTE(review): the original note here said the German XX:YY notation is also
    # the time format, that times are not tokenized in the original RotoWire, and
    # that XX:YY is handled manually — but this pattern only matches a hyphen,
    # not a colon. Confirm which behaviour is intended.
    string = re.sub(r"(\d+)-(\d+)", r"\g<1> - \g<2>", string)

    # actual tokenization
    tokenized = nltk.word_tokenize(string, language=language)

    joined = " ".join(tokenized)
    # shrink expanded name-with-apostrophe expressions ('_ => ')
    joined = joined.replace("'_", "'")
    # shrink expanded dot markers from the float/fraction/initials rewrites (._ => .)
    joined = joined.replace("._", ".")
    tokenized = joined.split(" ")

    return tokenized

def cleanAxisLabel(label):
    """Normalise an axis (column) label so it can be used inside a record.

    Every whitespace character becomes ``_`` and the characters ``%`` and
    ``*`` are removed.

    :param label: Axis label. Non-string labels (e.g. integer column names)
        are converted with ``str()`` first.
    :return: The cleaned label as a string.
    """
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    cleanLabel = re.sub(r'\s', '_', str(label))
    return cleanLabel.replace('%', '').replace('*', '')
  
def cleanAxisValue(value):
    """Normalise a single table cell value so it can be used inside a record.

    The placeholders ``'-'`` and ``'nan'`` are mapped to ``'0'``. Otherwise
    every whitespace character becomes ``_`` and the characters ``|``, ``,``,
    ``%`` and ``*`` are removed.

    :param value: Cell value. Non-string values are converted with ``str()``
        first, so a float NaN is treated like the string ``'nan'``.
    :return: The cleaned value as a string.
    """
    value = str(value)
    if value == '-' or value == 'nan':
        return '0'
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    cleanValue = re.sub(r'\s', '_', value)
    # '|' is the field separator of the record format, so it must not survive
    for unwanted in ('|', ',', '%', '*'):
        cleanValue = cleanValue.replace(unwanted, '')
    return cleanValue

def is_number(string):
    """Tell whether ``string`` can be parsed by ``float()``.

    :param string: Text to check.
    :return: ``True`` if ``float(string)`` succeeds, ``False`` if it raises
        ``ValueError``.
    """
    try:
        float(string)
    except ValueError:
        # float() rejected the text, so it is not numeric
        return False
    else:
        return True

def openMultiColumnData(df):
    """Return the column index and the row count of a data frame.

    :param df: Object exposing ``columns`` and ``shape`` (a pandas DataFrame
        in this notebook).
    :return: Tuple ``(df.columns, df.shape[0])``.
    """
    return df.columns, df.shape[0]
  
def getSubject(titleTokens, nerEntities):
    """Extract subject and date entities from a tokenized chart title.

    Dates are found by scanning the title tokens; subjects come from the
    named entities, with a capitalisation-based guess as fallback.

    :param titleTokens: List of title tokens (strings).
    :param nerEntities: Iterable of entity objects; only their ``label_`` and
        ``text`` attributes are read (presumably the ``ents`` of a spaCy
        document — confirm against the caller).
    :return: Tuple ``(entities, cleanTitle)``. ``entities`` is a dict with the
        keys ``'Subject'`` and ``'Date'``, each holding a list of strings.
        ``'Subject'`` stays empty when neither NER nor the capitalisation
        guess yields anything. ``cleanTitle`` is the list of title tokens
        whose lower-cased form is not a filler word.
    """
    fillers = ['in', 'the', 'and', 'or', 'an', 'as', 'can', 'be', 'a', ':', '-',
              'to', 'but', 'is', 'of', 'it', 'on', '.', 'at', '(', ')', ',', ';']
    entities = {'Subject': [], 'Date': []}
    # manually find dates, it performs better than using NER
    for word in titleTokens:
        if word.isnumeric():
            candidate = word
        elif word.replace('/', '').isnumeric():
            # e.g. 2019/20 -> keep the part before the slash
            candidate = word.split('/')[0]
        elif word.replace('-', '').isnumeric():
            # e.g. 2008-2019 -> keep the part before the hyphen
            candidate = word.split('-')[0]
        else:
            continue
        # only tokens longer than three characters count as a date (year)
        if len(candidate) > 3:
            entities['Date'].append(candidate)
    # get named entities from title
    for X in nerEntities:
        if X.label_ in ('GPE', 'ORG', 'NORP', 'LOC'):
            cleanSubject = [word for word in X.text.split() if word.isalpha() and word not in fillers]
            if cleanSubject:
                entities['Subject'].append(' '.join(cleanSubject))
        # fall back to NER dates only while the manual scan has found none
        if not entities['Date'] and X.label_ == 'DATE' and X.text.isnumeric():
            entities['Date'].append(X.text)
    # guess subject if NER doesn't find one
    if not entities['Subject']:
        # word[:1] instead of word[0]: an empty token must not raise IndexError
        uppercaseWords = [word for word in titleTokens if word[:1].isupper()]
        if len(uppercaseWords) > 1:
            # with several capitalised tokens the first one is skipped
            entities['Subject'].append(' '.join(uppercaseWords[1:]))
        elif uppercaseWords:
            entities['Subject'].append(uppercaseWords[0])
        # no capitalised token at all: leave 'Subject' empty instead of
        # raising IndexError on uppercaseWords[0]
    cleanTitle = [titleWord for titleWord in titleTokens if titleWord.lower() not in fillers]
    return entities, cleanTitle


## Cleaning Dataset

### Titles

In [8]:
def clean_title(title):
  """Tokenize a chart title and return it as a cleaned, space-joined string.

  If the title ends in ``<number> - <number>`` (e.g. ``2009 - 2016`` after
  tokenization), the hyphen is replaced with ``to``. All ``*`` characters are
  removed from the result.

  :param title: Raw chart title.
  :return: The cleaned title string.
  """
  cleanedTitle = word_tokenize(title)
  # replace a trailing "2009 - 2016" with "2009 to 2016"
  lastTokens = cleanedTitle[-3:]
  # titles with fewer than three tokens cannot end in a year range; checking the
  # length first avoids the IndexError the unguarded lookups raised on them
  if (len(lastTokens) == 3 and lastTokens[1] == '-'
      and lastTokens[0].isnumeric() and lastTokens[2].isnumeric()):
    cleanedTitle[-2] = 'to'
  return ' '.join(cleanedTitle).replace('*', '')

### Preprocessing
- Converts data tables into a sequence of records (taken as input by the model): `data/*split*/trainData.txt`
- Cleans summary tokens and substitutes any possible tokens with data variables (e.g., 2018 -> templateValue[0][0]): `data/*split*/trainSummary.txt`
- Cleans the title tokens: `data/*split*/trainTitle.txt`
- Labels the occurrences of records mentioned within the summary: `data/*split*/trainDataLabel.txt`
- Labels the summary tokens which match a record: `data/*split*/trainSummaryLabel.txt`
- Saves the gold summaries: `data/*split*/testOriginalSummary.txt`

In [33]:
def preprocessData(df, title, chartType = 'bar_chart'):
  """Convert a data table into the flat record sequence used as model input.

  The table is walked row by row and, within a row, column by column. Each
  cell becomes one record of the form
  ``<cleanColumnLabel>|<cleanValue>|<columnIndex>|<chartType>`` and every
  record is followed by a single space (so a non-empty result ends with a
  space).

  :param df: DataFrame containing the chart data.
  :param title: Title string (the cleaned title from ``clean_title()``);
    returned unchanged.
  :param chartType: Chart type written verbatim into every record; per the
    original notes one of ``'line_chart'`` or ``'bar_chart'``.
  :return: Tuple ``(dataLine, title)``.
  """
  # str(): cleanAxisLabel runs a regex on the label, so integer column names
  # must be converted to text first
  cleanCols = [cleanAxisLabel(str(axis)) for axis in df.columns]

  records = []
  # iterate through each table row
  for m in range(df.shape[0]):
    for n, cleanCol in enumerate(cleanCols):
      # positional access (iat) instead of label access: the row counter m is a
      # position, so this also works when df does not have the default 0..n-1
      # index (e.g. after filtering) or has duplicate column names
      cleanValue = cleanAxisValue(str(df.iat[m, n]))
      records.append(f"{cleanCol}|{cleanValue}|{n}|{chartType}")

  # every record, including the last one, is followed by one space
  dataLine = ''.join(f'{record} ' for record in records)
  return dataLine, title

## Testing

In [None]:
# Example chart titles.
# NOTE(review): testTitles is not referenced again in this cell — confirm whether it is still needed.
testTitles = ['Facebook: number of monthly active users worldwide 2008-2019','National Basketball Association all-time scoring leaders 1946-2020','Instagram accounts with the most followers worldwide 2020']


# Import pandas library
import pandas as pd
from io import StringIO

# In-memory CSV used as sample chart data: a header line followed by one
# "Country,value" row per line.
TESTDATA = StringIO("""Country,Average number of children per woman
Singapore,1.38
Republic of Korea,1.44
United Arab Emirates,1.45
Puerto Rico,1.45
Bosnia and Herzegovina,1.47
Saint Lucia,1.48
Greece,1.5
Cyprus,1.51
Italy,1.51
Republic of Moldova,1.52
"China, Taiwan Province of China",1.53
Albania,1.53
Mauritius,1.54
Thailand,1.54
Qatar,1.56
Nepal,1.56
Croatia,1.56
Japan,1.57
Serbia,1.57
Brazil,1.58
North Macedonia,1.58
Brunei Darussalam,1.59
Portugal,1.59
Spain,1.59
Canada,1.59
    """)

# Parse the in-memory CSV into a DataFrame.
df = pd.read_csv(TESTDATA, sep=",")
  
# Clean the example title, then convert the table into the record sequence.
cleanedTitle = clean_title('Countries with the lowest fertility rate globally 2050-2055')
dataline, titleline = preprocessData(df, cleanedTitle)

# Show the generated record line and the title returned alongside it.
print(dataline, ' ')
print(titleline)