In [1]:
import pandas as pd
from collocate_replacer import CollocateMatrix, PMI, Tokenizer
from collocate_replacer import most_distant_same_pos
from gensim.models import KeyedVectors
from tqdm import tqdm_notebook

In [2]:
df = pd.read_csv("unsupervised_news.csv")

In [3]:
tokenizer = Tokenizer(method='spacy_upos')

In [5]:
pmi = PMI()

In [6]:
cm = CollocateMatrix(3, tokenizer, pmi)

In [7]:
titles = df['title'].dropna()

In [8]:
is_str = lambda x: type(x) == str
titles = titles[titles.apply(is_str)==True].str.lower()

Нужно было всё привести к нижнему регистру, потому что модель uncased

In [59]:
cm.train(titles.values)

Training N-gram matrix...
Tokenizing...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=361196.0), HTML(value='')))


Calculating n-gram frequencies...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2212223.0), HTML(value='')))


Calculating metric...


In [12]:
cm.get_collocations(titles[0].lower())

[((2, 3), 'confirm_VERBas_SCONJ', 3.527744869294967),
 ((3, 4), 'as_SCONJeducation_NOUN', 3.360300388374646),
 ((4, 5), 'education_NOUNsecretary_NOUN', 6.899823330988027),
 ((9, 10), 'cast_VERBhistoric_ADJ', 5.46889179141474),
 ((10, 11), 'historic_ADJtie_NOUN', 4.580032714001256)]

In [11]:
titles[0]

'Betsy DeVos Confirmed as Education Secretary, With Pence Casting Historic Tie-Breaking Vote'

In [60]:
cm.save(cm, 'CM_SpaCy')

In [9]:
cm1 = CollocateMatrix(3, tokenizer, pmi)
cm1.load("CM_SpaCy")

In [10]:
cm1.get_collocations(titles[0], thresh=0)

[((0, 1), ('betsy_PROPN', 'devos_PROPN'), 9.298342026997148),
 ((1, 2), ('devos_PROPN', 'confirm_VERB'), 5.5508847123890375),
 ((2, 3), ('confirm_VERB', 'as_SCONJ'), 3.2203468732129714),
 ((3, 4), ('as_SCONJ', 'education_NOUN'), 3.143938027065899),
 ((4, 5), ('education_NOUN', 'secretary_NOUN'), 6.395061522399001),
 ((5, 6), ('secretary_NOUN', ',_PUNCT'), 0.9008710183501698),
 ((7, 8), ('with_ADP', 'penny_NOUN'), 1.491382998988028),
 ((8, 9), ('penny_NOUN', 'cast_VERB'), 3.682011797848684),
 ((9, 10), ('cast_VERB', 'historic_ADJ'), 4.983888817155992),
 ((10, 11), ('historic_ADJ', 'tie_NOUN'), 4.387803349439177),
 ((11, 12), ('tie_NOUN', '-_PUNCT'), 1.0584752321335404),
 ((12, 13), ('-_PUNCT', 'break_VERB'), 0.503662962084169),
 ((13, 14), ('break_VERB', 'vote_NOUN'), 1.5859149676470743)]

In [11]:
wv_model = KeyedVectors.load_word2vec_format("../gensim_models/udpipe_wikipedia/model.bin",
                                            binary = True)

In [44]:
def replace_collocates(s, cm, wv_model, dist_thresh=0.65, colloc_thresh=2.0):
    collocations = cm.get_collocations(s, thresh=colloc_thresh)
    collocations = sorted(collocations, key=lambda x: x[2], reverse=False)
    output = []
    
    for colloc in collocations:
        word_a, word_b = colloc[1]
        
        if not word_a.split('_')[-1] in ('PART','CCONJ', 'SCONJ', 'ADP','AUX','DET','PRON','PUNCT','NUM') and\
        not word_b.split('_')[-1] in ('PART','CCONJ', 'SCONJ', 'ADP', 'AUX', 'DET','PRON','PUNCT','NUM'):
            pass
        else:
            continue
        
        try:
            candidates = most_distant_same_pos(word_b, wv_model, thresh=dist_thresh)
        except KeyError:
            continue
        for candidate, candidate_dist in candidates:
            strength = cm.collocation_strength(word_a, candidate)
            if strength is not None:
                if strength > colloc_thresh:
                    output.append((colloc[0], (word_a, word_b, colloc[2]), (word_a, candidate, strength),
                                  candidate_dist))
        
    return output

In [46]:
output = []

for title in tqdm_notebook(titles[:3000], total=3000):
    unit = replace_collocates(title, cm1, wv_model, dist_thresh=0.55)
    if unit:
        output.append((title, unit))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [30]:
cm1.collocation_strength('prime_ADJ','minister_NOUN')

8.193875694606945

In [47]:
len(output) == 0

False

In [48]:
len(output)

448

In [None]:
output

In [78]:
len(titles)

361196

In [51]:
df = []

for sent_id, item in tqdm_notebook(enumerate(output), total=len(output)):
    for edit in item[1]:
        df.append({
            'sent_id': sent_id,
            'sent': item[0],
            'span': edit[0],
            'original_collocation': edit[1][:-1],
            'collocation_strength': edit[1][-1],
            'changed_collocation': edit[2][:-1],
            'changed_strength': edit[2][-1],
            'cosine_distance': edit[3],
            'Score': None
        })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=448.0), HTML(value='')))




In [52]:
df = pd.DataFrame(df)

In [53]:
df.head()

Unnamed: 0,sent_id,sent,span,original_collocation,collocation_strength,changed_collocation,changed_strength,cosine_distance,Score
0,0,melania trump says white house could mean mill...,"(5, 6)","(could_VERB, mean_VERB)",3.98515,"(could_VERB, resurface_VERB)",4.485685,0.569322,
1,1,multi-state manhunt in southeast intensifies f...,"(1, 2)","(-_ADJ, state_ADJ)",5.594272,"(-_ADJ, natal_ADJ)",6.692884,0.593754,
2,1,multi-state manhunt in southeast intensifies f...,"(1, 2)","(-_ADJ, state_ADJ)",5.594272,"(-_ADJ, estrogen_ADJ)",6.692884,0.562976,
3,2,dept. of justice cites 'national security' in ...,"(12, 13)","(trump_ADJ, immigration_NOUN)",2.127145,"(trump_ADJ, react_NOUN)",3.197205,0.564675,
4,2,dept. of justice cites 'national security' in ...,"(12, 13)","(trump_ADJ, immigration_NOUN)",2.127145,"(trump_ADJ, mount_NOUN)",2.617081,0.564639,


In [55]:
len(df.sent_id.unique())

448

In [57]:
len(df)

752

In [26]:
!pip install openpyxl

You should consider upgrading via the 'c:\users\k1l77\desktop\182e~1\term_p~1\scripts\python.exe -m pip install --upgrade pip' command.


Collecting openpyxl
  Downloading openpyxl-3.0.6-py2.py3-none-any.whl (242 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.0.1-cp37-none-any.whl
Collecting jdcal
  Using cached jdcal-1.4.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: jdcal, et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.0.1 jdcal-1.4.1 openpyxl-3.0.6


In [58]:
df.to_excel("Changed_collocations.xlsx")

Немного изменим концепт - будем заменять не один коллокат на другой, а второе слово в любой паре слов на семантически далёкий коллокат

In [37]:
def replace_words(s, cm, wv_model, dist_thresh=0.65, colloc_thresh=2.0, tokenizer=tokenizer):
    output = []
    sents = tokenizer(s)
    for tokens in sents:
        word_pairs = [((i,j), (tokens[i],tokens[j]), cm.collocation_strength(tokens[i],tokens[j])) for i,
                      j in zip(list(range(len(tokens))),list(range(1,len(tokens))))]

        for colloc in word_pairs:
            word_a, word_b = colloc[1]

            if not word_a.split('_')[-1] in ('PART','CCONJ', 'SCONJ', 'ADP','AUX','DET','PRON','PUNCT','NUM') and\
            not word_b.split('_')[-1] in ('PART','CCONJ', 'SCONJ', 'ADP', 'AUX', 'DET','PRON','PUNCT','NUM'):
                pass
            else:
                continue

            try:
                candidates = most_distant_same_pos(word_b, wv_model, thresh=dist_thresh)
            except KeyError:
                continue
            for candidate, candidate_dist in candidates:
                strength = cm.collocation_strength(word_a, candidate)
                if strength is not None:
                    if strength > colloc_thresh:
                        output.append((colloc[0], (word_a, word_b, colloc[2]), (word_a, candidate,strength),
                                      candidate_dist))
        
    return output

In [38]:
result = []

for title in tqdm_notebook(titles[:100], total=100):
    unit = replace_words(title, cm1, wv_model, dist_thresh=0.55)
    if unit:
        result.append((title, unit))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [45]:
result[0]

('betsy devos confirmed as education secretary, with pence casting historic tie-breaking vote',
 [((13, 14),
   ('break_VERB', 'vote_NOUN', 1.5859149676470743),
   ('break_VERB', 'deliveryman_NOUN', 7.5632805903755544),
   0.6047073602676392)])

In [46]:
len(result)

16

In [47]:
result

[('betsy devos confirmed as education secretary, with pence casting historic tie-breaking vote',
  [((13, 14),
    ('break_VERB', 'vote_NOUN', 1.5859149676470743),
    ('break_VERB', 'deliveryman_NOUN', 7.5632805903755544),
    0.6047073602676392)]),
 ('melania trump says white house could mean millions for brand',
  [((5, 6),
    ('could_VERB', 'mean_VERB', 3.985150443992131),
    ('could_VERB', 'resurface_VERB', 4.485685376176988),
    0.5693224668502808)]),
 ('multi-state manhunt in southeast intensifies for alleged murderer and accomplice',
  [((1, 2),
    ('-_ADJ', 'state_ADJ', 5.594271722145067),
    ('-_ADJ', 'natal_ADJ', 6.6928840108131755),
    0.5937535166740417),
   ((1, 2),
    ('-_ADJ', 'state_ADJ', 5.594271722145067),
    ('-_ADJ', 'estrogen_ADJ', 6.6928840108131755),
    0.5629760026931763)]),
 ("dept. of justice cites 'national security' in calling for trump immigration order reinstatement",
  [((10, 11),
    ('trump_ADJ', 'immigration_NOUN', 2.1271448616905015),
    ('

In [42]:
df = []

for sent_id, item in tqdm_notebook(enumerate(output), total=len(output)):
    for edit in item[1]:
        df.append({
            'sent_id': sent_id,
            'sent': item[0],
            'span': edit[0],
            'original_collocation': edit[1][:-1],
            'collocation_strength': edit[1][-1],
            'changed_collocation': edit[2][:-1],
            'changed_strength': edit[2][-1],
            'cosine_distance': edit[3],
            'Score': None
        })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=756.0), HTML(value='')))




IndexError: invalid index to scalar variable.

In [30]:
output[5]

('as trump fears fraud, gop eliminates election commission',
 [((13, 14),
   ('break_VERB', 'vote_NOUN', 1.5859149676470743),
   ('break_VERB', 'deliveryman_NOUN', 7.5632805903755544),
   0.6047073602676392),
  ('betsy devos confirmed as education secretary, with pence casting historic tie-breaking vote',
   [...]),
  ((5, 6),
   ('could_VERB', 'mean_VERB', 3.985150443992131),
   ('could_VERB', 'resurface_VERB', 4.485685376176988),
   0.5693224668502808),
  ('melania trump says white house could mean millions for brand', [...]),
  (...),
  ("appeals court to decide on challenge to trump's immigration executive order",
   [...]),
  ('at least 4 tornadoes reported in southeast louisiana', [...]),
  ('mother of backpacker slain in australia criticizes trump', [...]),
  ("trump's labor secretary pick andrew puzder admits to employing undocumented worker",
   [...]),
  ("iran's top leader mocks 'newcomer' trump", [...]),
  ('eu to britain: pay up for what you ordered before leaving', [...])

In [17]:
df = pd.DataFrame(df)

In [18]:
df.head()

Unnamed: 0,sent_id,sent,span,original_collocation,collocation_strength,changed_collocation,changed_strength,cosine_distance,Score
0,0,melania trump says white house could mean mill...,"(5, 6)","(could_VERB, mean_VERB)",3.98515,"(could_VERB, resurface_VERB)",4.485685,0.569322,
1,1,multi-state manhunt in southeast intensifies f...,"(1, 2)","(-_ADJ, state_ADJ)",5.594272,"(-_ADJ, natal_ADJ)",6.692884,0.593754,
2,1,multi-state manhunt in southeast intensifies f...,"(1, 2)","(-_ADJ, state_ADJ)",5.594272,"(-_ADJ, estrogen_ADJ)",6.692884,0.562976,
3,2,dept. of justice cites 'national security' in ...,"(12, 13)","(trump_ADJ, immigration_NOUN)",2.127145,"(trump_ADJ, react_NOUN)",3.197205,0.564675,
4,2,dept. of justice cites 'national security' in ...,"(12, 13)","(trump_ADJ, immigration_NOUN)",2.127145,"(trump_ADJ, mount_NOUN)",2.617081,0.564639,
