In [1]:
import pandas as pd
import spacy
#from atap_widgets.concordance import ConcordanceTable, ConcordanceWidget
from concordance import ConcordanceTable, ConcordanceWidget, ConcordanceLoader
from concordance import prepare_text_df
import dask.bag as db
import re


In [2]:
#Make Some Data

def sherlock_holmes_five_sentences():
    return """To Sherlock Holmes she is always the woman. I have seldom heard him
    mention her under any other name. In his eyes she eclipses and predominates the
    whole of her sex. It was not that he felt any emotion akin to love for Irene
    Adler. All emotions, and that one particularly, were abhorrent to his cold,
    precise but admirably balanced mind. """


def basic_spacy_nlp():
    return spacy.load("en_core_web_sm")


def sherlock_holmes_doc(sherlock_holmes_five_sentences, basic_spacy_nlp):
    return basic_spacy_nlp(sherlock_holmes_five_sentences)

def sherlock_holmes_dummy_df(sherlock_holmes_doc):
    """
    DataFrame, one row per sentence from the Sherlock Holmes example
    """
    df = pd.DataFrame(
        {
            "text": [str(sentence) for sentence in sherlock_holmes_doc.sents],
            "speaker": list("ABABA"),
        }
    )
    return df

data = sherlock_holmes_five_sentences()
sherlock_df = sherlock_holmes_dummy_df(sherlock_holmes_doc(data,basic_spacy_nlp()))

D_path_QandA = '../../tests/data/D.QandA_Dummy.txt'

C_path_PoliticalInterview = '../../tests/private_data/C.Political_Interview_Public.txt'

E_path_transcript = '../../tests/private_data/E.interview_transcript_public.txt'

MarkScottisCool = "../../tests/data/MarkScottNationalPressClub.txt"


## ConcordanceLoader Demo 1 CSV Sherlock Data

**Introduction**
- ConcordanceLoader is a class that caters for CSV, Text Files, and existing DataFrames loaded in python. Text files are a special mention, where symbols can be assigned which are used in the text to designate key- value pairs. If the symbol variable is inputed, DataIngest filters the text for these key-value pairs only and converts it into a dataframe object

- Lines of text are grouped into chunks. The chunk variable is an integer reflecting the number of lines you intend to group as each chunk
- The context is taken from the grouping. 

**Limitations:**
- If the word you are matching begins at the start of a group a larger chunk integer is suggested.
- Lines are tagged with a --[line_number] symbol in the text (which can be removed from the widget). However, if the raw data has this pattern within the text it cause confusion with line tagging method

In [3]:

CHUNK = 2

#DataCSV = ConcordanceLoader(type = "csv",path = "../../tests/data/sherlock_for_testing.csv",chunk = CHUNK) #By Text / Csv file
DataCSV = ConcordanceLoader(type = "dataframe",df_input = sherlock_df,chunk = CHUNK)  # Or exisitng dataframe

DataCSV.show() 

#increasing "Window Size(characters) " bring in context provided text has large enough grouping (i.e. chunks)
# Show More Multiselect dropdown can bring in more than one column by "command + click" when choosing.

  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

<concordance.ConcordanceLoaderWidget at 0x17fe108b0>

In [4]:
#Can explore these helpers if chunk needs tinkering

DataCSV.get_grouped_data()

Unnamed: 0,text,speaker,chunk,row
0,0--To Sherlock Holmes she is always the woman.,A,0,0
1,1--I have seldom heard him\n mention her un...,B,0,1
2,2--In his eyes she eclipses and predominates t...,A,1,2
3,3--It was not that he felt any emotion akin to...,B,1,3
4,"4--All emotions, and that one particularly, we...",A,2,4


In [6]:
#Older DataWidget and Concordance Table Functionality still retained

original_data = DataCSV.get_original_data()

original_data.head() #chuch and row columns added to original data.

data = pd.read_csv("../../tests/data/sherlock_for_testing.csv")                  
data =  prepare_text_df(data)

table = ConcordanceTable(df = data,keyword = "she")
table

search_results_df = table.to_dataframe() #extract results into dataframe
search_results_df.head()

oldWidget = ConcordanceWidget(data) #run simplier widget (no chunks or context)
oldWidget.show()


  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

## ConcordanceLoader Demo 2 Debate Data

### Debate Data

In [7]:
# DataIngest doesnt work that well for large lines and lines with large variability (long plus short lines in test).

CHUNK = 10 #increase chunks to expand context region. i.e. "time" search for instance
data = pd.read_excel("../../tests/data/A.debate_clean.xlsx") #already has text_id
DataDF = ConcordanceLoader(type = "dataframe",df_input = data,chunk = CHUNK)
DataDF.show() #search economy versus environment and bring in speaker from ShowMore dropdown


  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

<concordance.ConcordanceLoaderWidget at 0x29af7c2e0>

### Setup other demos 



## DataIngest Demo 3 Question Answer text

In [8]:
# This is what data looks like
! head -15 $D_path_QandA

Question: What is your favourite animal in Australia?
Name 6: Kangaroos and koalas.

Question: What is your favourite animal in Australia?
Name 1: Wombats are my favourite.

Question: What is your favourite animal in Australia?
Name 10: I don’t know, but I know I don’t like any of the poisonous spiders and dangerous snakes!

Question: What is your favourite food in Australia?
Name 10: Tomatos for sure!

Question: What is your favourite food in Australia?
Name 6: I decline to answer that.



In [10]:
symbol = r':' #can define a symbol to split lines. Assumes all relevant info is in the format key [SYMBOL] value.

CHUNK = 4

DataDF = ConcordanceLoader(type = "txt",path = D_path_QandA,re_symbol_txt = symbol,chunk = CHUNK)

DataDF.show() #search tomatos, pick "key" in "Show More"


  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

<concordance.ConcordanceLoaderWidget at 0x29b1b6790>

In [14]:
# interestingly, can bring in processed (grouped) data as dataframe to explicitly analyse keys that are created.
DataDF.get_grouped_data().sort_values('key').key.unique()


array(['Name 1', 'Name 10', 'Name 6', 'Name11', 'Question'], dtype=object)

### Old Widget doesnt cater for keys / identifiers in text - DataIngest performs better

In [15]:
data = pd.read_table(D_path_QandA,delimiter=':')
data = data.rename(columns = {"Question":"text_id"," What is your favourite animal in Australia?":"text"})
data

data =  prepare_text_df(data)
table = ConcordanceTable(df = data,keyword = "publish")
oldWidget = ConcordanceWidget(data)
oldWidget.show() #kangaroo

  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

## Political Interview Case C

In [16]:
# Data looks like this:
! head -n 20 $C_path_PoliticalInterview

TRANSCRIPT: RADIO INTERVIEW - ABC RN DRIVE - MONDAY, 10 FEBRUARY 2020

10 February 2020

 

THE HON TANYA PLIBERSEK MP
SHADOW MINISTER FOR EDUCATION AND TRAINING
MEMBER FOR SYDNEY

E&OE TRANSCRIPT
RADIO INTERVIEW
ABC RN DRIVE
MONDAY, 10 FEBRUARY 2020

SUBJECTS:Coalition in-fighting and chaos; coronavirus and its impact on higher education; Grattan Institute report into teaching profession.

PATRICIA KARVELAS, HOST: Labor played a key role in these chaotic scenes that saw National Party defector, Llew O'Brien, elected to the position of Deputy Speaker of the House of Representatives this afternoon. Regardless of whether the nomination was a stunt, it's a reality now thanks to Labor's movements; and Llew O'Brien's new reality sees him pocket an extra $20,000 a year. So, all in all, not a bad outcome for him today. The distractions came as Australia's tertiary education sector struggles to come to terms with the prospect of a $3.1 billion hit from coronavirus. Tanya Plibersek is the Shado

In [17]:

symbol = r':' 
CHUNK = 4 #change from 4 to 10 to get billion context

DataDF = ConcordanceLoader(type = "txt",path = C_path_PoliticalInterview,re_symbol_txt = symbol,chunk = CHUNK)

DataDF.show() #billion. Demo Change chunk size and DataDF.get_grouped_data() to see chunk partitians
#DataDF.get_grouped_data() #billion at start of chunk 2 so left context not shown. Change chunk from 4 to 10

  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

<concordance.ConcordanceLoaderWidget at 0x2a100dfd0>

 ## Strategy 

In [18]:
! head -n 20 $MarkScottisCool



I acknowledge that we meet today on the ancestral lands of the Ngunnawal people, the traditional custodians of this land. I pay my respects to elders past and present, and those who have cared for and continue to care for country.

It’s great to be with you.

The University of Sydney is Australia’s oldest university. We took in our first students in 1852 and just yesterday released our aspirations for the decade through to 2032, by which time we’ll be closing in on the end of the University’s second century.

In considering our future, we humbly acknowledge that for hundreds of centuries before the University of Sydney opened its doors, generations of First Nations peoples have been exchanging knowledge on the ancestral lands on which the University’s campuses and facilities now stand. And as we create a university for the future, we aim to extend and build upon this prior knowledge.

Today I want to give you a sense of our vision for the next decade.

I have been Vice-Chancellor at 

In [46]:
### Load simple text with no key/value structure. Just plain text

In [19]:

data = pd.read_table(MarkScottisCool,names = ["text"])
data =  prepare_text_df(data)
table = ConcordanceTable(df = data,keyword = "publish")
oldWidget = ConcordanceWidget(data)
oldWidget.show() #pandemic


  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

In [21]:
# Just Text - i.e. no structure, symbol, column heading etc.....
CHUNK = 4
DataDF = ConcordanceLoader(type = "txt",path = MarkScottisCool)
DataDF.show() #Debug pandemic and "person"


  return Index(sequences[0], name=names)


VBox(children=(Text(value='', description='Keyword(s):'), HBox(children=(Checkbox(value=False, description='En…

<concordance.ConcordanceLoaderWidget at 0x2a7ab3220>

# Harder Case Transcript Case E #TODO

In [11]:
#what does data look like ?
! head -n 10 $E_path_transcript

Speaker 1: (00:00)
"Bill Gates co-founded Microsoft nearly 50 years ago at the forefront of the computer age that changed the world. Since then, he’s been using the fortune that earned him to change the world, the Gates Foundation giving away tens of billions of dollars over the past decade. He also famously predicted the world was unprepared for a pandemic in a 2015 Ted Talk that was unfortunately accurate and has been viewed now 43 million times."

Speaker 1: (00:25)
"Well, he’s sounding the alarm again this morning. His new book How to Prevent the Next Pandemic is out today. It’s also been a year of upheaval in his own life after he and Melinda, his wife of 27 years, announced their divorce one year ago today. So, Bill, we have so much to catch up on. It’s good to see you. Good morning."

Bill Gates: (00:41)
Good to see you.

Speaker 1: (00:41)
