In [48]:
import pandas as pd
import io
import string
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import requests
import spacy
from spacy import displacy
# Load spaCy's small English pipeline once at import time; get_entities()
# further down reads this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

# Let long text columns show up to 200 characters before being truncated.
pd.options.display.max_colwidth = 200
%matplotlib inline

In [46]:
# Read the CSV export into `df`.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab upload
# location) — the notebook will not run elsewhere until it is made configurable.
path = "/content/f1.csv"
df = pd.read_csv(path)

In [47]:
# Drop the listed export columns from `df`.
# `df` is rebound (instead of inplace=True) and errors='ignore' is passed so
# the cell can be re-run safely: with the in-place version a second execution
# raised KeyError because the columns were already gone. The trade-off is
# that a misspelled column name is now skipped silently, so keep this list
# in sync with the export's header.
df = df.drop(
    columns=[
        'Opening Text',
        'Source',
        'Subregion',
        'Language',
        'Desktop Reach',
        'Mobile Reach',
        'Twitter Social Echo',
        'Facebook Social Echo',
        'Reddit Social Echo',
        'National Viewership',
        'Engagement',
        'Keywords',
        'Tweet Id',
        'Twitter Id',
        'Twitter Client',
        'Document Tags',
        'URL',
    ],
    errors='ignore',
)

In [12]:
# Preview the first three rows of `df`.
df.head(3)

Unnamed: 0,Hit Sentence,Influencer,Country,Reach,AVE,Sentiment,Key Phrases,Input Name,Twitter Authority,Twitter Screen Name,Twitter User Profile Url,Twitter Bio,Twitter Followers,Twitter Following,Alternate Date Format,Time,State,City
0,QT @NewsAtNESA: RT @smitchellmlc: Planning beg...,@scottsdavidson,Australia,2364,21.87,Neutral,"NESA guidelines,disruption,plans,schools,schoo...",Adhoc Search Export,6,Scott s Davidson,https://twitter.com/ScottsDavidson,Director Strategic Liaison Office of the NSW M...,2364,1251,15-Sep-20,5:04 PM,New South Wales,
1,QT @FergalBowers: RT @Antcon7062: Great work p...,@tiernster,Australia,255,2.36,Positive,"Great work principals,teachers, parents and kids",Adhoc Search Export,4,Ciaran Tiernan,https://twitter.com/tiernster,Software developer. \nWeakness for Clare hurli...,255,582,10-Sep-20,7:05 AM,South Australia,Clare
2,Principals frustrated as more COVID cases conf...,@duncanmacmartin,Australia,4687,43.35,Neutral,,Adhoc Search Export,7,💧Duncan MacMartin⏳🇵🇸,http://www.twitter.com/duncanmacmartin,Researcher in cognitive/empathetic development...,4687,4458,10-Sep-20,1:47 AM,Victoria,Melbourne


In [49]:
# Make sure NLTK's stop-word corpus is available locally (required by
# stopwords.words(...) in a later cell). The call reports "already
# up-to-date" when the package is present and evaluates to True.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
# Display the full 'Hit Sentence' text of row 0, for comparison with the
# cleaned version shown after clean_text() is applied.
df.loc[0,'Hit Sentence']

'QT @NewsAtNESA: RT @smitchellmlc: Planning begins for schools to make sure HSC exams go ahead with the least amount of disruption possible. All schools will have NESA guidelines to follow and we will be supporting our Principals to design and implement their plans. ; What’s your school’s COVID-19 contingency plan? Guidelines to help schools implement health and safety measures and plan for contingencies have been issued. It’s all about making sure students can sit their HSC without disruption #2020HSC #covidsafeHSC https://t.co/HwB44Iu2sR https://t.co/Q7F9aHtyt5'

In [51]:
# English stop-word list and Snowball stemmer, built together.
# NOTE(review): neither name is referenced by clean_text() or get_entities()
# in the cells shown here — confirm they are used further down.
stop_words, stemmer = stopwords.words("english"), SnowballStemmer("english")

In [52]:
def clean_text(text):
    """Strip markup, links, punctuation and digit-bearing words from a text.

    The steps run in this order:

    1. remove anything enclosed in square brackets, e.g. ``[video]``;
    2. remove ``http(s)://...`` and ``www....`` links;
    3. remove HTML-like tags (``<...>``);
    4. remove every character in ``string.punctuation`` (ASCII only), so
       ``@user`` becomes ``user`` and ``#tag`` becomes ``tag``; typographic
       characters such as the curly apostrophe are left alone;
    5. remove newline characters (no space is inserted in their place);
    6. remove every word that contains a digit, e.g. ``2020HSC``.

    Because step 4 runs before step 6, a hyphenated token such as
    ``COVID-19`` is first fused into ``COVID19`` and then removed entirely.
    Whitespace left behind by removed pieces is not collapsed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, the float NaN pandas uses
        for an empty CSV cell, ...) is treated as empty text instead of
        raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Raw string literals: '\[', '\S', '\w' and '\d' are invalid escape
    # sequences in ordinary strings and emit a SyntaxWarning on recent Pythons.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Clean every 'Hit Sentence' value (overwriting the column), then display
# row 0 again to eyeball the effect of the cleaning.
df['Hit Sentence'] = df['Hit Sentence'].apply(clean_text)
df.loc[0, 'Hit Sentence']


'QT NewsAtNESA RT smitchellmlc Planning begins for schools to make sure HSC exams go ahead with the least amount of disruption possible All schools will have NESA guidelines to follow and we will be supporting our Principals to design and implement their plans  What’s your school’s  contingency plan Guidelines to help schools implement health and safety measures and plan for contingencies have been issued It’s all about making sure students can sit their HSC without disruption  covidsafeHSC  '

In [53]:
def get_entities(sent):
    """Extract a rough ``[subject, object]`` entity pair from one sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    its tokens are scanned left to right by dependency label (``tok.dep_``):

    * ``punct`` tokens are skipped entirely (they are not even remembered as
      the "previous token");
    * a ``compound`` token is remembered as ``prefix``, joined with the
      previous token's text when that token was a compound too;
    * a token whose label ends in ``mod`` is remembered as ``modifier``,
      joined with the previous token's text when that token was a compound;
    * a token whose label contains ``subj`` sets the subject entity to
      ``modifier + " " + prefix + " " + token``; ``prefix`` and ``modifier``
      are then cleared;
    * a token whose label contains ``obj`` sets the object entity the same
      way, but ``prefix`` and ``modifier`` are *not* cleared afterwards.

    A later subject/object overwrites an earlier one, so the last match of
    each kind wins. Empty prefix/modifier parts are not collapsed, so an
    entity can contain a double space (e.g. ``"final  test"``); only the
    outer whitespace is stripped.

    Parameters
    ----------
    sent : str
        The sentence to analyse; passed unchanged to ``nlp``.

    Returns
    -------
    list of str
        ``[subject, object]``. Either element is ``""`` when no matching
        token was found.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation never contributes to an entity.
        if tok.dep_ == "punct":
            continue

        # Compound word: remember it, joined with the previous token when
        # that one was a compound as well.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifier (amod, nmod, advmod, ...): remember it, joined with the
        # previous token when that one was a compound.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Substring test. The previous `dep.find("subj") == True` compared
        # the match *index* with True, so it only fired when the substring
        # started at index 1 (nsubj, csubj, ...) and silently missed labels
        # where it starts elsewhere.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""

        # Same substring test for objects (dobj, pobj, iobj, obj, ...).
        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [54]:
# Smoke-test get_entities() on a single hand-prepared sentence.
# NOTE(review): this literal looks like row 0 lower-cased with stop words
# removed, and it still contains "19" and "2020hsc" — that is not what
# clean_text() produces, so confirm which preprocessing the pipeline is
# actually meant to feed into get_entities().
get_entities("qt newsatnesa rt smitchellmlc planning begins schools make sure hsc exams go ahead least amount disruption possible schools nesa guidelines follow supporting principals design implement plans school covid 19 contingency plan guidelines help schools implement health safety measures plan contingencies issued making sure students sit hsc without disruption 2020hsc covidsafehsc")

['measures plan students', 'disruption']

In [55]:
# One [subject, object] pair per row of 'Hit Sentence', in row order,
# with a tqdm progress bar while the rows are parsed.
entity_pairs = [get_entities(sentence) for sentence in tqdm(df["Hit Sentence"])]

100%|██████████| 25/25 [00:00<00:00, 33.50it/s]


In [42]:
# Display the extracted pairs from index 10 onward.
# NOTE(review): the saved output of this cell is entirely lower-case, while
# clean_text() preserves case, and this cell's execution count is lower than
# the cell that builds entity_pairs — the output may come from an earlier run
# with different preprocessing. Restart the kernel and run all cells to confirm.
entity_pairs[10:]

[['measures plan students', 'disruption'],
 ['measures plan students', 'disruption'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['measures plan students', 'disruption'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['measures plan students', 'disruption'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['presiding leadership officers', 'readiness exams pandemic'],
 ['covid  cases', 'calgary schools calgary herald'],
 ['covid19 schools cork case', 'eglantine southside cork city'],
 ['dedicated teachers principals', 'hungry  victoria'],
 ['principals', 'hungry  victoria'],
 ['parents', 'skinnergj']]