In [1]:
import os
import pandas as pd
import pickle
from string import punctuation
import nltk
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp  import CoreNLPParser
from nltk.tree import ParentedTree
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK resources the cells below rely on; download() only reports
# "already up-to-date" when a package is present locally.
#   wordnet   -> WordNetLemmatizer (stemming, process_sentences)
#   stopwords -> stopwords.words('english') (process_sentences)
#   punkt     -> word_tokenize (process_sentences); without the Punkt data a
#                fresh environment raises LookupError on the first call
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt', so request both to stay runnable across versions.
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def triplet_extraction(parse_tree):
    """Build the [subject, predicate, object] triplet for one parsed sentence.

    Args:
        parse_tree: constituency tree of a single sentence; it is handed
            unchanged to each extractor.

    Returns:
        A three-element list ``[subject, predicate, object]`` produced, in that
        order, by extract_subject, extract_predicate and extract_object.
    """
    extractors = (extract_subject, extract_predicate, extract_object)
    return [extract(parse_tree) for extract in extractors]


def extract_subject(parse_tree):
    """Return the subject word of a parsed sentence.

    The subject is the word under the first ``NN*``-labelled node found inside
    an ``NP`` subtree, in the order ``subtrees()`` yields them.

    Args:
        parse_tree: tree object exposing ``subtrees(filter)`` and ``label()``
            (the caller passes an nltk ParentedTree).

    Returns:
        The subject word, or ``''`` when no NP contains a noun.
    """
    for noun_phrase in parse_tree.subtrees(lambda node: node.label() == 'NP'):
        for noun in noun_phrase.subtrees(lambda node: node.label().startswith('NN')):
            head = noun[0]
            # Only the first hit is ever used, so stop here instead of
            # collecting (and de-duplicating) every noun in the sentence.
            if head != []:
                return head
    return ''

def extract_predicate(parse_tree):
    """Return the predicate (verb) of a parsed sentence.

    Words under ``VB*``-labelled nodes inside ``VP`` subtrees are gathered in
    the order ``subtrees()`` yields them, keeping only the first occurrence of
    each word; the last entry of that de-duplicated list is the predicate.

    Args:
        parse_tree: tree object exposing ``subtrees(filter)`` and ``label()``.

    Returns:
        The predicate word, or ``''`` when no VP contains a verb.
    """
    def is_verb_phrase(node):
        return node.label() == 'VP'

    def is_verb(node):
        return node.label().startswith('VB')

    verbs = []
    for phrase in parse_tree.subtrees(is_verb_phrase):
        for verb_node in phrase.subtrees(is_verb):
            verb = verb_node[0]
            # Nested VPs are visited again through their parents, so a verb
            # can show up repeatedly; keep only its first appearance.
            if verb == [] or verb in verbs:
                continue
            verbs.append(verb)

    return verbs[-1] if verbs else ''



def extract_object(parse_tree):
    """Return the object word of a parsed sentence.

    Inside each ``VP`` subtree, the ``NP``/``PP``/``ADJP`` phrases are examined
    in the order ``subtrees()`` yields them. For an NP or PP the candidate is
    the word of the *last* ``NN*`` node in the phrase; for an ADJP it is the
    word of the last ``JJ*`` node. The first phrase that yields a candidate
    decides the result.

    Args:
        parse_tree: tree object exposing ``subtrees(filter)`` and ``label()``.

    Returns:
        The object word, or ``''`` when no phrase under a VP yields one.
    """
    for verb_phrase in parse_tree.subtrees(lambda node: node.label() == 'VP'):
        for phrase in verb_phrase.subtrees(lambda node: node.label() in ('NP', 'PP', 'ADJP')):
            # Adjective phrases contribute adjectives, everything else nouns.
            tag_prefix = 'JJ' if phrase.label() == 'ADJP' else 'NN'
            matches = [
                node for node in phrase.subtrees()
                if node.label().startswith(tag_prefix)
            ]
            if not matches:
                continue
            # Per-phrase locals: nothing from a previous phrase can leak into
            # this decision, and only the first successful phrase matters.
            last_match = matches[-1]
            if len(last_match) != 0 and last_match[0] != []:
                return last_match[0]
    return ''



def text_preprocess(text):
    """Split ``text`` into sentences and extract one triplet per sentence.

    Sentences are produced by a PunktSentenceTokenizer constructed with no
    arguments. Each sentence is whitespace-split, parsed with CoreNLPParser,
    converted to a ParentedTree and passed to triplet_extraction.

    Args:
        text: the document to process.

    Returns:
        A tuple ``(triplets, sentences)``: the list of triplets (one per
        sentence, in order) and the list of sentence strings.
    """
    sentence_splitter = nltk.tokenize.punkt.PunktSentenceTokenizer()
    corenlp = CoreNLPParser(tagtype='pos')
    sentences = sentence_splitter.tokenize(text)

    def sentence_triplet(sentence):
        parses = list(corenlp.parse(sentence.split()))
        # Single-target unpacking: takes the only child of the converted first
        # parse and raises ValueError if it does not have exactly one child.
        (clause,) = ParentedTree.convert(parses[0])
        return triplet_extraction(clause)

    triplets = [sentence_triplet(sentence) for sentence in sentences]
    return triplets, sentences

def stemming(triplets):
    """Lemmatize the predicate of every triplet as a verb, in place.

    Args:
        triplets: list of mutable ``[subject, predicate, object]`` sequences;
            element 1 of each is overwritten with its verb lemma.

    Returns:
        The same ``triplets`` object that was passed in.
    """
    to_verb_lemma = WordNetLemmatizer().lemmatize
    for position in range(len(triplets)):
        entry = triplets[position]
        entry[1] = to_verb_lemma(entry[1], pos='v')
    return triplets

def process_sentences(sentences):
    """Normalize sentences for downstream text processing.

    Each sentence is lower-cased, tokenized with word_tokenize, stripped of
    English stopwords, punctuation characters and a few quote/clitic tokens,
    and every remaining token is lemmatized as a verb.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A list with one space-joined string of processed tokens per sentence.
    """
    lemmatizer = WordNetLemmatizer()
    # A frozenset gives constant-time membership tests; the original list was
    # scanned linearly for every single token.
    to_remove = frozenset(
        stopwords.words('english') + list(punctuation) + ["``","'s","'d","''","'"]
    )
    processed_sentences = []
    for sentence in sentences:
        kept_tokens = (
            token for token in word_tokenize(sentence.lower())
            if token not in to_remove
        )
        processed_sentences.append(
            ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept_tokens)
        )
    return processed_sentences

In [4]:
if __name__ == '__main__':

    # Machine-specific locations of the Java binary and the CoreNLP jars.
    # NOTE(review): these absolute paths only exist on the author's machine;
    # adjust them before running elsewhere.
    java_path = "C:/Program Files/Java/jdk-15.0.1/bin/java.exe"
    # Presumably consulted by NLTK to locate Java when the server is launched.
    os.environ['JAVAHOME'] = java_path
    cn_path = "D:/Learning Material/Project/Actual work/corenlp/stanford-corenlp-4.2.0/stanford-corenlp-4.2.0.jar"
    model_path= "D:/Learning Material/Project/Actual work/corenlp/stanford-corenlp-4.2.0/stanford-corenlp-4.2.0-models.jar"
    df = pd.read_csv('dataset//dataset.csv')

    server = CoreNLPServer(cn_path,model_path)
    triplets = []             # per document: list of "subject predicate object" strings
    processed_sentences = []  # per document: list of normalized sentences
    sentences_list = []       # per document: list of raw sentences
    server.start()
    try:
        for txt in df['Content']:
            output,sentences = text_preprocess(txt)
            output = stemming(output)
            sent = [' '.join(x) for x in output]
            processed_sentences.append(process_sentences(sentences))
            triplets.append(sent)
            sentences_list.append(sentences)
    finally:
        # Always shut the server down, even if a document fails to parse;
        # otherwise the CoreNLP process started above is left running.
        server.stop()
    df['Sentences'] = sentences_list
    df['Triplets'] = triplets
    df['Processed'] = processed_sentences

In [5]:
# Persist the annotated DataFrame. The target folder is created first so that
# a missing 'pickles/' directory cannot fail the save after the CoreNLP run.
os.makedirs('pickles', exist_ok=True)
with open('pickles/test.pickle','wb') as out:
    pickle.dump(df,out)

In [None]:
# Shorter news article
# Sample text held in `txt2` as one triple-quoted string; paragraphs are
# separated by blank lines.
# NOTE(review): nothing in the cells above reads `txt2` -- presumably ad-hoc
# input for text_preprocess(); confirm against the cells that use it.

txt2 ="""There is no available data so far which suggests that the double mutant strain of coronavirus is resistant to vaccines, said Soumya Swaminathan, the Chief Scientist of the World Health Organisation (WHO), on May 10.

Swaminathan's remarks came a day after sections of the Indian media reported her as saying that the double mutant is potentially resistant to the available vaccines.

Clarifying her view, the top WHO official told CNBC TV-18 that there is "no data" which suggests that the B.1.617 variant of coronavirus is capable of evading the immunity provided through the vaccines.

"All available vaccines reduce severity of infection," Swaminathan told the news channel, adding that even if a person contracts COVID-19 after getting vaccinated, the infection is mild in most cases. On being asked whether the double mutant has driven India's second pandemic wave, Swaminathan said the interim data suggests the role of B.1.617 in the sharp spike in caseload. However, there is "not enough data" to completely establish the link.

"For now, it seems to be more contagious, causing more infections - but we do not have enough data. We look forward to that data coming out from India," she said."""

In [None]:
# Long news article
# Sample text held in `txt1` as one triple-quoted string; paragraphs are
# separated by blank lines.
# NOTE(review): nothing in the cells above reads `txt1` -- presumably ad-hoc
# input for text_preprocess(); confirm against the cells that use it.

txt1 = """It was a moment he'd dreamt of since being inspired to take up the sport as a five-year-old. It was one of the iconic moments of the London Olympics.

But the moment was not his.

Instead it was his friend Jade Jones who claimed Britain's first ever Olympic taekwondo gold while he was watching from the stands, having been sensationally and highly controversially overlooked for the Team GB squad.

"You can never say I would have won gold myself," says Cook, who had retained his European crown just days before the Olympic team announcement and was world number one in his -80kg category.

"I was in the form of my life though. It felt like my time - I had beaten most of the fighters there and had the belief."

The omission left him "broken" emotionally. What followed broke him financially, too.

He and his family spent their entire life savings - "hundreds of thousands of pounds" - in contesting the selectors' decision to promote then little-known fighter Lutalo Muhammad ahead of him.

They ultimately failed in that quest. But salvation - monetary, if not spiritual - would come, in highly unusual circumstances. A Moldovan telecommunications billionaire made him an offer he could not refuse.

It's just one of many surprise twists that Cook, now aged 30, has experienced in his "rollercoaster" career.

His has been a unique path in sport - and there's a golden ending still in sight.

Short presentational grey line
It is hard to overstate the excitement generated by Cook's emergence. And he is still among taekwondo's biggest draws, with London 2012 champion Jones describing him as an "icon" of the sport.

A year after narrowly missing out on Beijing Olympic bronze at the age of 17, he fully announced his devastating potential in 2009 when dramatically defeating the legendary Steven Lopez, a five-time world champion. It signalled his arrival as a global star. He became not only a poster boy for the sport, but also for London 2012.

"After that my profile just exploded," Cook recalls with a smile.

"It gave me so much belief, that I could beat the sport's greatest, one of my idols, and I've taken that confidence throughout the rest of my career."

However, as the Games in London approached, that profile began creating problems.British sponsors were desperate to piggyback on the growing excitement, and Cook was one of several young talents wooed with lucrative contracts.

"It was just a mad and super-exciting time. I was in adverts, on cereal packets, meeting my heroes and then there was the deal with the Power Rangers.

"I became an official Red Ranger, which as someone who'd watched them as a kid was just unbelievable and there was even talk that if I won gold [in London] I might star in the next movie as the Gold Ranger."

The commercials and photoshoots ate into his time. Such commitments were of secondary concern to GB Taekwondo which, from its humble base within an Asda superstore opposite the Manchester City stadium, was trying to best prepare him for competition.

A balance was found, but tensions between Cook's representatives and the sport's management team did not go away.

"[GB Taekwondo] had ideas, as did my agents and parents, so there was definitely conflict and it just came to a head," says the former world junior champion.

By spending time with fellow Adidas athletes like Mo Farah and Tom Daley, Cook learned how other sports had more flexible approaches to training programmes, and he liked what he heard.

He felt his programme wasn't personalised enough, while GB Taekwondo believed he needed to focus on developing greater tactical variety to his game, fearing his explosive style was becoming predictable.

Cook was trialled with several different coaches in the British set-up, but after a run of poor performances that culminated in an "embarrassing" exit in his first fight at the 2011 World Championships, he took the dramatic decision to leave the GB programme.

"I made the decision to really attack it as a professional athlete and get a team around me that was specific to my needs," he says.

"The money from my sponsors allowed me to hire not only my own strength and conditioning team, but to find the best coaches and best fighters to train with."

GB Taekwondo insisted the "door always remained open" for Cook to return. And he did so on a temporary basis two weeks before the crucial 2012 European Championships in Manchester - which served as the final qualification event for the London Games.

"I was enjoying training, the results were great and even going back into the GB set-up and using one of their coaches instead of mine didn't faze me," he says.

"I had this calm mindset and believed that this was going to be my Olympics."

But things would not remain calm for long.Cook's performances at those Euros were steady rather than spectacular. He did "just enough" to edge each contest and land a place in the -80kg final against arch rival Ramin Azizov of Azerbaijan.

It took a controversial last-second disqualification for him to claim gold, but he had retained his European crown and thought the Olympic place was his.

Cook was so focused that Muhammad's victory in the -87kg division the evening before had barely registered.

Less than a week later he received the "devastating" news that he was to be the -80kg reserve - Muhammad's understudy - for the Olympic Games.

The British Olympic Association (BOA) initially refused to ratify Muhammad's selection and questioned why Cook, who had just reclaimed the world number one ranking, was being overlooked.

At the time Cook's camp believed his decision to leave the GB Taekwondo set-up must have been a factor, something the selectors and management were at pains to deny.

The bitter battle became headline news for over a month and only ended when the Cooks were told they would need £500,000 to take the case to the High Court, a move that could have cost his parents their home if they had lost.

"We knew in our hearts that we could win, but the risk was too big," says Cook.

"I want to say though that it was never anything against Lutalo - he took the chance and fair play he went and won Olympic bronze [Muhammad also won silver at Rio 2016]. But looking back, it's a shame the rules wouldn't allow us both to go."

At the time, however, Cook was feeling far from philosophical. He said he had been "cheated" and insisted he would never compete for Britain again while the current management team was in place. He found himself dropped by all of his sponsors.

"After the Games, for about six months I went through a real grieving process - I got fat and was depressed," he says.

"When I finally snapped out of it I began to rediscover the love for taekwondo again, even if my parents weren't too happy.

"I think after everything we'd all been through they'd have loved me to go into football, boxing or even UFC and at that time, aged 21, I might have gone for it if there had been a serious offer."

Cook would return to the taekwondo mat, but true to his word, never again in British colours.

The Isle of Man offered an unorthodox, but immediate way back to the international stage - although it would restrict him to non-Olympic competition.

"Even though I couldn't represent them at the Olympics I was grateful and I'd almost come to terms with it, thinking that perhaps competing at all the other major events would be enough for me," he tells BBC Sport.

Then in 2014 came an unexpected twist.

Just hours after securing his third European gold - and the Isle of Man's first-ever major honour in taekwondo - Cook was ushered into a poorly lit backroom at the host arena in Baku, capital of Azerbaijan. The Moldovan taekwondo president wanted to speak with him.

Two weeks later at a London steak restaurant they met again, and Igor Iuzefovici repeated the offer. He would arrange Moldovan citizenship. There was big money involved. It would open a return to the Olympics.

"It was so surreal, amazing and crazy because I'd gone from being the lowest in my life to being like 'wow this could be a reality again' and they offered to pay for everything," Cook says.

"My parents had taken on all of the financial burden since London and I saw this as a way of paying them back."

Once word got out that Cook was on the 'taekwondo transfer market', rival offers came in. Qatar, the UAE, Kazakhstan and Turkey were soon presenting their own terms to his team.

"It became this bidding war, which was exciting, but I was adamant that I would only choose a country who didn't have a fighter in my division because I didn't want to take an opportunity from someone who had grown up there," Cook says.

The BOA initially declined his bid to be released. But after meeting the fighter and realising his mind was set they reluctantly agreed to his request. In April 2015 he was officially presented with a Moldovan passport.

"This didn't feel like taekwondo," says Cook. "The contract said I'd receive around £100,000 if I won Olympic gold. I felt like a footballer who'd just been signed by Manchester United."Cook had enjoyed the best year of his career with seven successive titles in 2014 and he claimed his first World Championship medal the following year.

Bronze was not enough to appease his billionaire backer, however.

"Initially everything like coaching, training camps and tournaments were all paid for but then he started throwing his toys out of the pram and all the curveballs started to mess with my head," says Cook.

"Looking back maybe I should have fought more, but I was conscious of what I'd been through ahead of London and I just played ball because I knew I'd get this opportunity at the Olympics."

However, Rio 2016 would not be the Games he dreamt of. Seeded second, he made a disappointing first-round exit to a fighter he was expected to beat easily.

"I've been guilty of putting too much pressure on myself at various points throughout my career and this was one of them," he says.

"There was so much going on with Moldova, my parents, my brother, the relationship with my coach, and of course I wanted to show GB what they were missing.

"I wasn't in a good place and didn't perform."

Two weeks later he received an email informing him that his contract, which had been due to run until Tokyo 2020, was being terminated.

After "many tears", a lot of soul-searching and going back to the bank of mum and dad, he found a way to fight on and 12 months later secured a second bronze at the 2017 World Championships, still competing for Moldova. He was named the country's athlete of the year.

But he rarely spends time there. Since then he has mainly been training by himself at local gyms close to his home in Manchester and it is now three years since his last major international medal, a European silver. The terms of his original contract - including that Olympic bonus - no longer stand.

Two hamstring tears and knee ligament problems saw him narrowly miss out on automatic qualification for the Tokyo Games, but the postponement caused by the Covid-19 pandemic has brought a different perspective - and new prospects.

While in lockdown with his girlfriend Bianca Walkden, the three-time world champion in the +73kg class, and her housemate double Olympic champion Jones, Cook regularly trained with the pair. After joining them on a training camp in Croatia last year he is now employed by GB Taekwondo as an official sparring partner.

"It's crazy how it's gone full circle really, isn't it?" he says, laughing.

"It's surreal being here though because when I left the GB academy in 2011, I didn't have the freedom to lead my own programme, but now it's so individualised for each athlete and if it had been like this 10 years ago I probably wouldn't have gone down the route I did.

"I wouldn't say I'll ever forgive or forget, but years have gone by and maybe it was the pandemic, getting older or the world changing, but I came to an understanding that I can't live in the past and I need to move forward."

GB Taekwondo performance director Gary Hall says there is "no malice" towards Cook from their side.

"He's a good lad, trains hard, he's dynamically very quick and offers a really good challenge for our athletes as a training or sparring partner, and Bianca is feeling the benefits of that," he adds.

"We've been happy for him to perform that role, train at the academy and we do hope he goes to Tokyo and wins a medal."

Because now comes the next big test. One that could set up a potentially Hollywood-esque golden goodbye.

To qualify for the Tokyo Olympics in July he must reach the final at the European qualification event in Bulgaria on Friday.

If he makes it, it'll be in his own inimitable style.

"I've always been a massive Rocky fan because he's a streetfighter like me and even when he's down on points he always has that knockout shot which can change everything," says Cook.

"It's happened in my career before. And I know I have at least one left in me." """