This notebook contains the code to scrape news from the Reuters website and to preprocess the given file "annual_report_29.json". The notebook is prepared to be run in the Google Colab environment.

<b>[IMPORTANT]</b>: Before running the code below, add a shortcut to the project folder to your Drive:<br>
1. Right-click our project folder
2. Click "Add shortcut to Drive"
3. Select "My Drive" and click "ADD SHORTCUT"

# Prerequisites

In [None]:
# Connect to Google Drive
# [IMPORTANT]: Remember to add the shortcut of the project folder
#        to "My Drive" before you proceed to the next step.
from google.colab import drive
# Mount the Drive at /content/drive; later cells read and write files under this path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Source 1: Reuters News

This section contains code to scrape news from the Reuters website (e.g. https://www.reuters.com/companies/0700.HK/news) for three companies: Tencent (0700), Evergrande (3333) and Alibaba (9988). Each news paragraph is split into sentences and passed to the spaCy model; for every sentence that contains at least two useful entities, all pairwise combinations of those entities are generated. The code outputs three xlsx files named "News_Alibaba", "News_EverGrande" and "News_Tencent".

In [None]:
#!pip install spacy
#pip install requests
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import combinations
import numpy as np
import spacy
import urllib.parse

In [None]:
# Show up to 400 characters per cell so long sentences are not truncated in previews.
# The fully qualified key avoids relying on pandas' abbreviated option matching.
pd.set_option('display.max_colwidth', 400)

In [None]:
# Stock codes whose Reuters company news pages are scraped.
code = ['0700', '3333', '9988']
# Paragraph texts of every scraped article, pooled across all codes above.
news = []

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30

for company in code:
  url = "https://www.reuters.com/companies/{}.HK/news".format(company)
  response = requests.get(url, timeout=REQUEST_TIMEOUT)
  # Fail fast: an error page would otherwise be parsed and silently yield no articles.
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")

  # Collect the article links of the listing page.
  news_url = []
  for item in soup.select(".item"):
    link = item.select_one("a")
    if link is None or not link.get("href"):
      continue  # listing item without a usable link
    # Resolve relative hrefs against the listing URL; absolute hrefs pass through unchanged.
    news_url.append(urllib.parse.urljoin(url, link.get("href")))

  for article_url in news_url:
    try:
      response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
      response.raise_for_status()
    except requests.RequestException as error:
      # One unreachable article should not abort the whole scrape.
      print("Skipping {}: {}".format(article_url, error))
      continue

    soup = BeautifulSoup(response.text, "html.parser")
    for paragraph in soup.find_all("p"):
      # Keep only <p> elements whose markup carries a data-testid starting with "paragraph".
      if 'data-testid="paragraph' in str(paragraph):
        # get_text() drops nested tags and decodes HTML entities (e.g. "&amp;" -> "&").
        news.append(paragraph.get_text())

In [None]:
# Number of scraped paragraphs, followed by a preview of the first five.
print(len(news))
news[:5]

695


['BEIJING/HONG KONG, Jan 20 (Reuters) - TikTok owner ByteDance saw its total revenue grow by 70% year on year to around $58 billion in 2021, according to two people familiar with the matter, slower growth than a year earlier as China tightens its regulation of big tech companies.',
 'The figures were disclosed to a small group of employees at an internal meeting of the social media giant this week, according to the people.',
 "In 2020, the Beijing-based company's total revenue grew by over 100% to $34.3 billion, Reuters has reported.  read more ",
 'ByteDance did not immediately respond to a request for comment.',
 "Chinese tech companies from Tencent Holdings(0700.HK)\n to Alibaba Group(9988.HK)\n have reported slowing growth amid a wide-ranging crackdown by the country's regulators who have rolled out new rules governing how they operate and interact with their users."]

In [None]:
#Reference from https://newbedev.com/how-can-i-split-a-text-into-sentences
import re

# Regex fragments describing periods that do NOT end a sentence. Raw strings keep
# "\s" a regex escape rather than an (invalid) Python string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|vs)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# Domain endings plus short upper-case suffixes such as the ".HK" in "0700.HK".
websites = r"[.](com|net|org|io|gov|HK|O|N|L|AS|J|SS)"
digits   = r"([0-9])"
date     = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[.]"

def split_into_sentences(text):
    """Split `text` into sentences with a rule-based heuristic.

    Periods belonging to titles, company suffixes, acronyms, ticker/domain
    suffixes, numbers and month abbreviations are temporarily masked as
    "<prd>"; every remaining ".", "?" or "!" ends a sentence.

    Args:
        text: The paragraph to split. Newlines are treated as spaces.

    Returns:
        A list of non-empty sentences with surrounding whitespace removed.
        Text after the last terminator is returned as a final item instead of
        being dropped. An empty or whitespace-only input yields [].
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Mask periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = re.sub(digits + "[.]" + " ", r"\1<prd> ", text)
    text = re.sub(date, r"\1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real boundaries, then restore the masked periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # strip() rather than slicing off the first character: a sentence that follows
    # a stop without a space must keep its first letter. Keeping every non-empty
    # fragment also preserves a trailing sentence that has no terminator.
    sentences = [fragment.strip() for fragment in text.split("<stop>")]
    return [sentence for sentence in sentences if sentence]

In [None]:
#https://spacy.io/usage/spacy-101
nlp = spacy.load("en_core_web_sm")

# Tagged sentences longer than this many characters are discarded.
MAX_TAGGED_LENGTH = 512


def _entity_starts(sentence, entity):
  """Yield every start offset at which `entity` occurs in `sentence`."""
  start = sentence.find(entity)
  while start != -1:
    yield start
    start = sentence.find(entity, start + 1)


def _tag_entity_pair(sentence, first, second):
  """Wrap `first` in <e1>…</e1> and `second` in <e2>…</e2> inside `sentence`.

  The first occurrence of each entity is used when those two spans do not
  overlap. If they do overlap (e.g. "AIA" inside "AIA Group Limited"), later
  occurrences are tried until a non-overlapping pair is found.

  Returns:
    The tagged sentence, or None when no non-overlapping placement exists.
  """
  for start1 in _entity_starts(sentence, first):
    end1 = start1 + len(first)
    for start2 in _entity_starts(sentence, second):
      end2 = start2 + len(second)
      if end1 <= start2:  # e1 lies entirely before e2
        return (sentence[:start1] + "<e1>" + first + "</e1>"
                + sentence[end1:start2] + "<e2>" + second + "</e2>"
                + sentence[end2:])
      if end2 <= start1:  # e2 lies entirely before e1
        return (sentence[:start2] + "<e2>" + second + "</e2>"
                + sentence[end2:start1] + "<e1>" + first + "</e1>"
                + sentence[end1:])
  return None


output = []        # tagged sentences
outputEntity = []  # (e1, e2) tuple for each tagged sentence

for paragraph in news:
  sentences = split_into_sentences(paragraph)
  if not sentences:
    sentences = [paragraph]

  # Distinct ORG / PERSON entity texts found anywhere in the paragraph, in order of
  # first appearance. De-duplicating avoids pairing an entity with itself and
  # emitting the same tagged sentence several times.
  entities = []
  for sentence in sentences:
    for ent in nlp(sentence).ents:
      if ent.label_ in ("ORG", "PERSON") and ent.text and ent.text not in entities:
        entities.append(ent.text)

  # A relation needs two entities; skip paragraphs with fewer.
  if len(entities) <= 1:
    continue

  for sentence in sentences:
    # Entities of the paragraph that literally occur in this sentence.
    present = [entity for entity in entities if entity in sentence]
    # combinations() yields nothing when fewer than two entities are present.
    for pair in combinations(present, 2):
      tagged = _tag_entity_pair(sentence, pair[0], pair[1])
      if tagged is not None and len(tagged) <= MAX_TAGGED_LENGTH:
        output.append(tagged)
        outputEntity.append(pair)

# NOTE: `news` is filled for all scraped stock codes, so this single file holds the
# sentences of every company even though it is named after Tencent.
Export = pd.DataFrame({'newText':output,'newNER':outputEntity})
Export.to_excel("/content/drive/MyDrive/GitHub/News_Tencent.xlsx")

In [None]:
# Preview the first tagged news sentences and their entity pairs.
Export.head()

Unnamed: 0,newText,newNER
0,"BEIJING/HONG KONG, Jan 20 (<e1>Reuters</e1>) - TikTok owner <e2>ByteDance</e2> saw its total revenue grow by 70% year on year to around $58 billion in 2021, according to two people familiar with the matter, slower growth than a year earlier as China tightens its regulation of big tech companies.","(Reuters, ByteDance)"
1,"<e1>ByteDance</e1> retained its second-ranked position in China’s online advertising market last year, with a market share of 21%, according to a recent report published by researcher <e2>Interactive Marketing Lab Zhongguancun</e2>.","(ByteDance, Interactive Marketing Lab Zhongguancun)"
2,"In November, <e2>ByteDance</e2><e1>ByteDance</e1> reorganised itself into six business units in its biggest organizational change since ByteDance founder Zhang Yiming said in May he would step down as CEO.","(ByteDance, ByteDance)"
3,"In November, <e1>ByteDance</e1> reorganised itself into six business units in its biggest organizational change since ByteDance founder <e2>Zhang Yiming</e2> said in May he would step down as CEO.","(ByteDance, Zhang Yiming)"
4,"In November, <e1>ByteDance</e1> reorganised itself into six business units in its biggest organizational change since ByteDance founder <e2>Zhang Yiming</e2> said in May he would step down as CEO.","(ByteDance, Zhang Yiming)"


# Data source 2: Annual Report

This section preprocesses the JSON file containing the 29 annual reports with entity-labelled data shared by Dr. Chow's research team. It outputs an xlsx file named "Training_data_with_stock_code" that contains the pre-processed data.

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
import json

In [None]:
# Load the labelled annual reports. The absolute path (Drive is mounted at
# /content/drive) keeps this cell independent of the current working directory
# and consistent with the other cells' paths.
with open('/content/drive/MyDrive/GitHub/annual_report_29.json', encoding="utf-8") as f:
  data = json.load(f)

In [None]:
# Number of entries loaded from the JSON file, followed by the first entry.
# NOTE(review): the count may include non-English versions of a report, which
# DataClean skips via the "language" field; confirm against the data.
print("Number of annual reports: ",len(data))
data[0]

Number of annual reports:  58


{'document_type': 'annual_report',
 'language': 'en',
 'relation': [{'age': '63',
   'education': [],
   'join_date': 'September 2017',
   'name': 'Mark E Tucker',
   'past_experience': 'Prudential plc',
   'position': 'Group Chairman'},
  {'age': '59',
   'education': [],
   'join_date': '1992',
   'name': 'Noel Quinn',
   'past_experience': 'Forward Trust Group',
   'position': 'Group Managing Director'},
  {'age': '54',
   'education': [],
   'join_date': '2014 to 2018',
   'name': 'Ewen Stevenson',
   'past_experience': 'Credit Suisse',
   'position': 'Group Chief Financial Ofcer'},
  {'age': '66',
   'education': [],
   'join_date': '1989',
   'name': 'Henri de Castries',
   'past_experience': 'French Finance Ministry Inspection Ofce and the French Treasury Department',
   'position': 'Independent non-executive Director'},
  {'age': '71',
   'education': [],
   'join_date': 'March 2011',
   'name': 'Laura Cha',
   'past_experience': '',
   'position': 'Independent non-executive Di

In [None]:
def DataClean (data):
    """Flatten the annual-report JSON into one row per annotated text passage.

    Reports whose "language" is "zh" are skipped, as are passages whose "ner"
    field is the string "[]" (no entities annotated).

    Args:
        data: Iterable of report dicts with the keys "language", "stock_code"
            and "text"; "text" is a list of dicts with the keys "text" and "ner".

    Returns:
        A DataFrame with the columns "Stock", "Text" and "NER". It is empty
        (but has those columns) when no passage qualifies.
    """
    records = []
    for annualreport in data:
        if annualreport["language"] == "zh":
            continue
        for sentence in annualreport["text"]:
            if sentence['ner'] != "[]":
                records.append((annualreport["stock_code"], sentence['text'], sentence['ner']))
    # Build the frame once, after all rows are collected, rather than per row.
    return pd.DataFrame(records, columns=['Stock', 'Text', 'NER'])

# Flatten the loaded reports into one row per annotated passage.
result=DataClean(data)

In [None]:
# Preview the flattened passages and their raw NER annotations.
result.head()

Unnamed: 0,Stock,Text,NER
0,5,Mark E Tucker (63) Group Chairman Appointed to the Board: September 2017 Group Chairman since: October 2017,"[{""text"": ""Mark E Tucker"", ""label"": ""PERSON"", ""start"": 0, ""end"": 13}, {""text"": ""63"", ""label"": ""CARDINAL"", ""start"": 15, ""end"": 17}, {""text"": ""September 2017"", ""label"": ""DATE"", ""start"": 58, ""end"": 72}, {""text"": ""October 2017"", ""label"": ""DATE"", ""start"": 95, ""end"": 107}]"
1,5,"Skills and experience: With over 30 years’ experience in ﬁnancial services in Asia and the UK, Mark has a deep understanding of the industry and the markets in which we operate.","[{""text"": ""over 30 years’"", ""label"": ""DATE"", ""start"": 28, ""end"": 42}, {""text"": ""Asia"", ""label"": ""LOC"", ""start"": 78, ""end"": 82}, {""text"": ""UK"", ""label"": ""GPE"", ""start"": 91, ""end"": 93}, {""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 95, ""end"": 99}]"
2,5,"Career: Mark was previously Group Chief Executive and President of AIA Group Limited (‘AIA’). Prior to joining AIA, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on Prudential’s Board for 10 years.","[{""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12}, {""text"": ""AIA Group Limited"", ""label"": ""ORG"", ""start"": 67, ""end"": 84}, {""text"": ""AIA"", ""label"": ""ORG"", ""start"": 111, ""end"": 114}, {""text"": ""Prudential plc"", ""label"": ""ORG"", ""start"": 161, ""end"": 175}, {""text"": ""four years"", ""label"": ""DATE"", ""start"": 216, ""end"": 226}, {""text"": ""Prudential"", ""label"": ""ORG"", ""start"": 241, ""end"": 251}, {""..."
3,5,"Mark previously served as non-executive Director of the Court of The Bank of England, as an independent non-executive Director of Goldman Sachs Group and as Group Finance Director of HBOS plc.","[{""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 0, ""end"": 4}, {""text"": ""The Bank of England"", ""label"": ""ORG"", ""start"": 65, ""end"": 84}, {""text"": ""Goldman Sachs Group"", ""label"": ""ORG"", ""start"": 130, ""end"": 149}, {""text"": ""HBOS plc"", ""label"": ""ORG"", ""start"": 183, ""end"": 191}]"
4,5,External appointments: – Chair of TheCityUK – Non-executive Chairman of Discovery Limited – Member of Build Back Better Council – Supporting Chair of Chapter Zero,"[{""text"": ""TheCityUK"", ""label"": ""ORG"", ""start"": 34, ""end"": 43}, {""text"": ""Discovery Limited"", ""label"": ""ORG"", ""start"": 72, ""end"": 89}, {""text"": ""Build Back Better Council"", ""label"": ""ORG"", ""start"": 102, ""end"": 127}, {""text"": ""Chapter Zero"", ""label"": ""LAW"", ""start"": 150, ""end"": 162}]"


In [None]:
# Parse each row's NER annotation (a JSON array of entity dicts) once, then count
# entities by their "label" field. Counting on the parsed labels avoids miscounts
# when an entity's text itself contains "ORG", "PERSON" or "{".
ner_entities = result['NER'].map(json.loads)
result['lenofNER'] = ner_entities.map(len)
result['OrgFreq'] = ner_entities.map(lambda ents: sum(ent['label'] == 'ORG' for ent in ents))
result['PersonFreq'] = ner_entities.map(lambda ents: sum(ent['label'] == 'PERSON' for ent in ents))
result['UsefulEntity']=result['OrgFreq']+result['PersonFreq']
# Rows are not filtered here: the pairing cell skips passages with fewer than two
# PERSON/ORG entities on its own.
result.head()

Unnamed: 0,Stock,Text,NER,lenofNER,OrgFreq,PersonFreq,UsefulEntity
0,5,Mark E Tucker (63) Group Chairman Appointed to the Board: September 2017 Group Chairman since: October 2017,"[{""text"": ""Mark E Tucker"", ""label"": ""PERSON"", ""start"": 0, ""end"": 13}, {""text"": ""63"", ""label"": ""CARDINAL"", ""start"": 15, ""end"": 17}, {""text"": ""September 2017"", ""label"": ""DATE"", ""start"": 58, ""end"": 72}, {""text"": ""October 2017"", ""label"": ""DATE"", ""start"": 95, ""end"": 107}]",4,0,1,1
1,5,"Skills and experience: With over 30 years’ experience in ﬁnancial services in Asia and the UK, Mark has a deep understanding of the industry and the markets in which we operate.","[{""text"": ""over 30 years’"", ""label"": ""DATE"", ""start"": 28, ""end"": 42}, {""text"": ""Asia"", ""label"": ""LOC"", ""start"": 78, ""end"": 82}, {""text"": ""UK"", ""label"": ""GPE"", ""start"": 91, ""end"": 93}, {""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 95, ""end"": 99}]",4,0,1,1
2,5,"Career: Mark was previously Group Chief Executive and President of AIA Group Limited (‘AIA’). Prior to joining AIA, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on Prudential’s Board for 10 years.","[{""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12}, {""text"": ""AIA Group Limited"", ""label"": ""ORG"", ""start"": 67, ""end"": 84}, {""text"": ""AIA"", ""label"": ""ORG"", ""start"": 111, ""end"": 114}, {""text"": ""Prudential plc"", ""label"": ""ORG"", ""start"": 161, ""end"": 175}, {""text"": ""four years"", ""label"": ""DATE"", ""start"": 216, ""end"": 226}, {""text"": ""Prudential"", ""label"": ""ORG"", ""start"": 241, ""end"": 251}, {""...",7,4,1,5
3,5,"Mark previously served as non-executive Director of the Court of The Bank of England, as an independent non-executive Director of Goldman Sachs Group and as Group Finance Director of HBOS plc.","[{""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 0, ""end"": 4}, {""text"": ""The Bank of England"", ""label"": ""ORG"", ""start"": 65, ""end"": 84}, {""text"": ""Goldman Sachs Group"", ""label"": ""ORG"", ""start"": 130, ""end"": 149}, {""text"": ""HBOS plc"", ""label"": ""ORG"", ""start"": 183, ""end"": 191}]",4,3,1,4
4,5,External appointments: – Chair of TheCityUK – Non-executive Chairman of Discovery Limited – Member of Build Back Better Council – Supporting Chair of Chapter Zero,"[{""text"": ""TheCityUK"", ""label"": ""ORG"", ""start"": 34, ""end"": 43}, {""text"": ""Discovery Limited"", ""label"": ""ORG"", ""start"": 72, ""end"": 89}, {""text"": ""Build Back Better Council"", ""label"": ""ORG"", ""start"": 102, ""end"": 127}, {""text"": ""Chapter Zero"", ""label"": ""LAW"", ""start"": 150, ""end"": 162}]",4,3,0,3


In [None]:
# Tagged sentences longer than this many characters are discarded.
MAX_TAGGED_LENGTH = 512


def _entity_starts(sentence, entity):
  """Yield every start offset at which `entity` occurs in `sentence`."""
  start = sentence.find(entity)
  while start != -1:
    yield start
    start = sentence.find(entity, start + 1)


def _tag_entity_pair(sentence, first, second):
  """Wrap `first` in <e1>…</e1> and `second` in <e2>…</e2> inside `sentence`.

  The first occurrence of each entity is used when those two spans do not
  overlap. If they do overlap (e.g. "AIA" inside "AIA Group Limited"), later
  occurrences are tried until a non-overlapping pair is found.

  Returns:
    The tagged sentence, or None when no non-overlapping placement exists.
  """
  for start1 in _entity_starts(sentence, first):
    end1 = start1 + len(first)
    for start2 in _entity_starts(sentence, second):
      end2 = start2 + len(second)
      if end1 <= start2:  # e1 lies entirely before e2
        return (sentence[:start1] + "<e1>" + first + "</e1>"
                + sentence[end1:start2] + "<e2>" + second + "</e2>"
                + sentence[end2:])
      if end2 <= start1:  # e2 lies entirely before e1
        return (sentence[:start2] + "<e2>" + second + "</e2>"
                + sentence[end2:start1] + "<e1>" + first + "</e1>"
                + sentence[end1:])
  return None


output=[]        # tagged sentences
outputEntity=[]  # (e1, e2) tuple for each tagged sentence
stock=[]         # stock code of the report each tagged sentence came from

# Iterate by value so the loop does not depend on `result` having a 0..n-1 index.
for originalStock, originalText, nerJson in zip(result["Stock"], result["Text"], result["NER"]):
  sentences = split_into_sentences(originalText)
  if not sentences:
    sentences = [originalText]

  # Distinct PERSON / ORG entity texts of the passage, in annotation order.
  entities = []
  for entity in json.loads(nerJson):  # the NER column holds a JSON array of entity dicts
    if entity['label'] in ('PERSON', 'ORG') and entity['text'] and entity['text'] not in entities:
      entities.append(entity['text'])

  # A relation needs two entities; skip passages with fewer.
  if len(entities) <= 1:
    continue

  for sentence in sentences:
    # Entities of the passage that literally occur in this sentence.
    present = [entity for entity in entities if entity in sentence]
    # combinations() yields nothing when fewer than two entities are present.
    for pair in combinations(present, 2):
      tagged = _tag_entity_pair(sentence, pair[0], pair[1])
      if tagged is not None and len(tagged) <= MAX_TAGGED_LENGTH:
        output.append(tagged)
        outputEntity.append(pair)
        stock.append(originalStock)

# Number of tagged sentences that passed the length limit.
lessthanlimit = len(output)

Export = pd.DataFrame({'Stock':stock,'newText':output,'newNER':outputEntity})
Export.to_excel("/content/drive/MyDrive/GitHub/Training_data_with_stock_code.xlsx")

In [None]:
# Preview the first tagged annual-report sentences with their stock codes.
Export.head()

Unnamed: 0,Stock,newText,newNER
0,5,Career: <e1>Mark</e1> was previously Group Chief Executive and President of <e2>AIA Group Limited</e2> (‘AIA’).,"(Mark, AIA Group Limited)"
1,5,Career: <e1>Mark</e1> was previously Group Chief Executive and President of <e2>AIA</e2> Group Limited (‘AIA’).,"(Mark, AIA)"
2,5,Career: Mark was previously Group Chief Executive and President of <e2>AIA</e2><e1>AIA Group Limited</e1> (‘AIA’).,"(AIA Group Limited, AIA)"
3,5,"Prior to joining <e1>AIA</e1>, he held various senior management roles with <e2>Prudential plc</e2>, including as Group Chief Executive for four years.","(AIA, Prudential plc)"
4,5,"Prior to joining <e1>AIA</e1>, he held various senior management roles with <e2>Prudential</e2> plc, including as Group Chief Executive for four years.","(AIA, Prudential)"


# Preprocess the data to input to the model

The two files are combined and manually labelled with respect to 11 relations. We then removed the data related to Tencent and stored the rest in "TrainingData_Without_Tencent.xlsx". Since the BERT model takes its input as txt files, the following code outputs two txt files named "train_data_dm1.txt" and "test_data_dm1.txt".

In [None]:
# Import the libs to be used
import numpy as np
import pandas as pd
import re
# Matches any single character outside the code-point range U+0020–U+024F.
# Substituting matches with '' therefore also removes typographic quotes such as
# U+2018/U+2019, which lie outside that range.
regex = re.compile('[^\u0020-\u024F]')

In [None]:
# Load the manually labelled sentence pairs (Tencent rows already removed, per the file name).
input_file = pd.read_excel("/content/drive/MyDrive/GitHub/TrainingData_Without_Tencent.xlsx")

In [None]:
# Keep labelled rows whose text fits the 512-character limit, then strip characters
# outside the range accepted by `regex`. Building the result in one chain avoids
# assigning to a column of a filtered slice (SettingWithCopyWarning); `.str.len()`
# yields NaN for missing text, so such rows are dropped instead of raising.
input_file = (
    input_file
    .loc[lambda frame: frame['Label'].notna()]
    .loc[lambda frame: frame['newText'].str.len() <= 512]
    .assign(newText=lambda frame: frame['newText'].map(lambda text: regex.sub('', text)))
    .reset_index(drop=True)
)

In [None]:
# Preview the cleaned, labelled rows.
input_file.head()

Unnamed: 0,Stock,newText,newNER,Label,Relationship
0,5,"Career: <e1>Mark</e1> was previously Group Chief Executive and President of <e2>AIA Group Limited</e2> (AIA). Prior to joining AIA, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on Prudentials Board for 10 years.","['""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12', '""text"": ""AIA Group Limited"", ""label"": ""ORG"", ""start"": 67, ""end"": 84']",Employee-Company,"(e1,e2)"
1,5,"Career: <e1>Mark</e1> was previously Group Chief Executive and President of AIA Group Limited (AIA). Prior to joining <e2>AIA</e2>, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on Prudentials Board for 10 years.","['""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12', '""text"": ""AIA"", ""label"": ""ORG"", ""start"": 111, ""end"": 114']",Employee-Company,"(e1,e2)"
2,5,"Career: <e1>Mark</e1> was previously Group Chief Executive and President of AIA Group Limited (AIA). Prior to joining AIA, he held various senior management roles with <e2>Prudential plc</e2>, including as Group Chief Executive for four years. He served on Prudentials Board for 10 years.","['""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12', '""text"": ""Prudential plc"", ""label"": ""ORG"", ""start"": 161, ""end"": 175']",Employee-Company,"(e1,e2)"
3,5,"Career: <e1>Mark</e1> was previously Group Chief Executive and President of AIA Group Limited (AIA). Prior to joining AIA, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on <e2>Prudential</e2>s Board for 10 years.","['""text"": ""Mark"", ""label"": ""PERSON"", ""start"": 8, ""end"": 12', '""text"": ""Prudential"", ""label"": ""ORG"", ""start"": 241, ""end"": 251']",Employee-Company,"(e1,e2)"
4,5,"Career: Mark was previously Group Chief Executive and President of <e1>AIA Group Limited</e1> (AIA). Prior to joining <e2>AIA</e2>, he held various senior management roles with Prudential plc, including as Group Chief Executive for four years. He served on Prudentials Board for 10 years.","['""text"": ""AIA Group Limited"", ""label"": ""ORG"", ""start"": 67, ""end"": 84', '""text"": ""AIA"", ""label"": ""ORG"", ""start"": 111, ""end"": 114']",Same-Entity,"(e1,e2)"


In [None]:
TRAIN_PATH = "/content/drive/MyDrive/GitHub/train_data_dm1.txt"
TEST_PATH = "/content/drive/MyDrive/GitHub/test_data_dm1.txt"
TRAIN_FRACTION = 0.9   # share of the shuffled rows written to the training file
FIRST_TEST_ID = 8001   # test examples are numbered from this id upwards
SHUFFLE_SEED = 42      # fixed seed so the train/test split is reproducible


def _write_example(example_id, example, handle):
  """Write one example to `handle` in the four-line format of the output files.

  The lines are: id followed by the quoted sentence, the label (immediately
  followed by the relationship direction when one is set), "Comment: ", and a
  blank line.
  """
  # Rows without a direction (NaN Relationship) are written with the label only.
  relation = "" if pd.isna(example['Relationship']) else example['Relationship']
  print(example_id, '        \"', example['newText'], '\"\n', example['Label'], relation,
        '\nComment: \n', sep="", file=handle)


#shuffle data
input_file = input_file.sample(frac=1, random_state=SHUFFLE_SEED)

n_rows = len(input_file)
n_train = int(TRAIN_FRACTION * n_rows)

# `with` closes both files even if writing fails part-way through.
with open(TRAIN_PATH, "w", encoding="utf-8") as output_train_file, \
     open(TEST_PATH, "w", encoding="utf-8") as output_test_file:
  # range(n_rows) covers every row, including the last one.
  for position in range(n_rows):
    example = input_file.iloc[position]
    if position < n_train:
      _write_example(position + 1, example, output_train_file)
    else:
      _write_example(FIRST_TEST_ID + (position - n_train), example, output_test_file)

In [None]:
# Read back the first four lines (one complete example) of the training file.
# zip() with range(4) stops cleanly if the file has fewer than four lines, and
# the `with` block already closes the file.
with open("/content/drive/MyDrive/GitHub/train_data_dm1.txt") as train_file:
    head = [line for _, line in zip(range(4), train_file)]
print(head)

['1        "<e1>YE Jianping</e1> Orient Overseas Container Line Limited Director and Member of <e2>Executive Committee</e2> November 2015 N/A"\n', 'Other\n', 'Comment: \n', '\n']


In [None]:
# Number of examples per label. size() counts rows directly and to_frame() names the
# column, so the result is labelled "sample size" regardless of which columns
# `input_file` happens to contain.
distribution = input_file.groupby("Label").size().to_frame("sample size")

# Lookup table from relationship label text to numeric code.
# NOTE(review): several labels carry a leading space and differ in spelling from the
# values in the "Label" column; confirm before joining on them.
df2=pd.DataFrame({
    'relationship_label': [" Colleague"," Relative"," Employee-Company","Educated-Institute","Founder-Company" ,
                           " Shareholder-Company" , "within the same company group" , "Cooperate partner", "Subsidary-Parent company"],
    'Relationship': [1.0,2.0,3.0,4.0,5.0,7.0,8.0,9.0,11.0],
})

In [None]:
# Display the per-label counts.
distribution

Unnamed: 0_level_0,Stock
Label,Unnamed: 1_level_1
Colleague,446
Cooperate-Partner,63
Educated-Institute,74
Employee-Company,376
Founder-Company,21
Other,1389
Relative,1
Same-Entity,97
Shareholder-Company,88
Subsidary-ParentCompany,50
