In [2]:
import numpy as np
import pandas as pd
import json
import pickle
import heapq
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import norm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from nltk.tokenize import TweetTokenizer
import re
import sys
import random
import heapq
import ipynb.fs  
sys.path.append("../")
from .defs.get_abstract_2 import count_shared_papers

In [5]:
with open('../MADStat-dataset-final-version/data.json') as json_file:
    data = json.load(json_file)
    
'''load list of authors'''
with open('../author_name.txt') as f:
    authors = f.readlines()
authors = [author.strip() for author in authors]

'''load papers info'''
papers = pd.read_csv("../paper.csv")

"""load list of authors having at least 30 papers"""
with open("../../authors","rb") as fp:
    author_l = pickle.load(fp)

  papers = pd.read_csv("../paper.csv")


In [26]:
def HC(pvals, gamma=0.2, thresh=0.4):
    pvals = np.sort(pvals[pvals <= thresh])
    N = len(pvals)
    hc = -1000
    i_star = 0
    for i in range(1,int(gamma*N)+1):
        if pvals[i-1] >= 1/N:
            num = np.sqrt(N)*((i/N) - pvals[i-1])
            den = np.sqrt((i/N)*(1-i/N))
            cur = num/den
            if cur > hc:
                hc = cur
                i_star = i
    return hc, i_star

In [7]:
def clean_text(data) :
        #data.text = data.text.apply(remove_hexa_symbols)
        #data.text = data.text.apply(remove_digits)
        data = data.filter(['author', 'title', 'text']).rename(columns = {'title' : 'doc_id'})
        data["len"] = data.text.apply(lambda x: len(x))
        data.text = data.text.apply(lambda x: re.sub("All rights","",x))
        data.text = data.text.apply(lambda x: re.sub("reserved","",x))
#         data.text = data.text.apply(lambda x: re.sub("[0-9]","",x))
        data.text = data.text.apply(lambda x: re.sub("[^A-Za-z ]","",x))
        data.text = data.text.apply(lambda x: re.sub("copyright","",x))
        data.text = data.text.apply(lambda x: x.lower())
        data = data.loc[data.len > 10].reset_index()
        data.drop(columns=["len"],inplace=True)
        return data
    
def topKFrequent(nums, k):
    dic=Counter(nums)
    heapmax=[[-freq,num] for num,freq in dic.items()]
    heapq.heapify(heapmax)
    list1=[]
    for i in range(k):
        poping=heapq.heappop(heapmax)
        list1.append(poping[1])
    return list1


def get_vocab(text, max_length=200):
#     clf = CountVectorizer(lowercase=True)
#     clf.fit([text])
#     vocab = list(clf.vocabulary_.keys())
#     print("vocab before = ",vocab)
    vocab = text.split()
    k = min(max_length, len(set(vocab)))
#     return heapq.nlargest(k, vocab, key=vocab.get)
#     print(vocab)
    return topKFrequent(vocab,k)

In [8]:
"""
Input:  - text is a list of strings corresponding to documents
        - vocab is the vocabulary used for the problem
"""
def doc_to_dtm(text, vocab):
    #tk = TweetTokenizer()
    vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split(),vocabulary=vocab) #tokenizer=tk.tokenize,
#     X = vectorizer.fit_transform(text)
    X = vectorizer.transform(text)
    return X.toarray()

In [9]:
def estimate_poisson(corpus):
    return np.mean(corpus,axis=0)

In [24]:
"""Return pvals using standard normal cdf"""
def get_pvals(author1,author2,show_hist=False,thresh=0.4):
    
    def replace_labels(x):
        x[x==author1] = 1
        x[x==author2] = 0
        return x

    author_1 = pd.read_csv(f'../Data/{author1}.csv').filter(['author', 'title', 'text'])
    author_2 = pd.read_csv(f'../Data/{author2}.csv').filter(['author', 'title', 'text'])
    n, m = author_1.shape[0], author_2.shape[0]
    if author1 != author2 and count_shared_papers(author1,author2,authors,data)==0 and min(n/m, m/n) >= 1/2:   
        data_ = pd.concat([clean_text(author_1),
                                          clean_text(author_2)], ignore_index=True)

        data_train = data_.sample(frac=0.7)
        data_test = data_.drop(data_train.index)
        vocab = get_vocab(''.join([doc + " " for doc in list(data_train["text"])]), max_length=400)


        text1 = data_train[data_train["author"]==author1]
        text2 = data_train[data_train["author"]==author2]

        corpus1 = doc_to_dtm(list(text1.text),vocab=vocab)
        corpus2 = doc_to_dtm(list(text2.text),vocab=vocab)
        corpus_test = doc_to_dtm(list(data_test.text),vocab=vocab)
        
        lam_1 = estimate_poisson(corpus1)
        lam_2 = estimate_poisson(corpus2)
        
        sx = np.std(corpus1,axis=0)
        sy = np.std(corpus2,axis=0)
        z = (lam_1 - lam_2)/np.sqrt((sx**2/corpus1.shape[0]) + (sy**2/corpus2.shape[0]))
        z_n = (z - np.mean(z))/np.std(z)
        if show_hist:
            plt.hist(z_n)
            plt.title(f"Normalized z-counts for {author1} and {author2}")
            plt.show()
        pvals = 1 - norm.cdf(z_n)
        hc, i_star = HC(pvals,thresh=thresh)
        
        # Prediction on test set
        c1_hc = corpus1[:,pvals <= np.sort(pvals)[i_star]]
        c2_hc = corpus2[:,pvals <= np.sort(pvals)[i_star]]
        ct_hc = corpus_test[:,pvals <= np.sort(pvals)[i_star]]
        
        Z = evaluate(ct_hc,c1_hc,c2_hc)
        y_preds = predict(Z)
        y_true = replace_labels(np.array(data_test.author))   # 1 = author1, 0 = author2
        print(f"Accuracy on test set = {accuracy(y_preds,y_true)}")
        return y_preds, y_true
    else:
        return "One author has more than twice the number of papers as the other one !!!"
        

In [4]:
a = np.array([1,2,4])
b = np.array([4,1,9])
abs(a - b)

array([3, 1, 5])

In [69]:
author1="John Kent"
author2 = "Iain Johnstone"
count=0
for author in author_l:
    if (author != author1) & (author!=author2):
        c1 = count_shared_papers(author1,author,authors,data)
        c2 = count_shared_papers(author2,author,authors,data)
        if min(c1,c2) > 0:
            count+=min(c1,c2)
            
count

0

In [11]:
def evaluate(new_data,c1,c2):
    return new_data - ((c1.sum(axis=0) + c2.sum(axis=0))/(c1.shape[0]+c2.shape[0]))

def predict(z):
    return np.where(z.sum(axis=1) > 0, 1, 0)

def accuracy(y_preds,y_true):
    return np.mean(y_preds==y_true)

In [51]:
print(get_pvals("John Kent","Iain Johnstone"))

Accuracy on test set = 0.72
(array([0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1]), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=object))


In [229]:
df_ = pd.DataFrame()
for i,pairs in enumerate(hard_pairs):
    author1, author2 = pairs
    y_true, y_preds = get_pvals(author1,author2)
    acc = accuracy(y_true,y_preds)
    f1 = f1_score(list(y_preds), list(y_true))
    df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
df_

  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFrame({"Author 1":author1,"Author 2":author2, "Accuracy":acc,"F1":f1},index=[i]))
  df_ = df_.append(pd.DataFr

Unnamed: 0,Author 1,Author 2,Accuracy,F1
0,Han-ying Liang,Ngai Hang Chan,0.863636,0.842105
1,Louise Ryan,Kerrie Mengersen,0.666667,0.7
2,Jerzy K. Baksalary,Zehua Chen,0.894737,0.888889
3,Enrique Schisterman,Paddy Farrington,0.857143,0.666667
4,Brian Caffo,Lyle Broemeling,0.7,0.7
5,Myles Hollander,Victor De Gruttola,0.681818,0.631579
6,Paul Gustafson,Noel Cressie,0.894737,0.857143
7,Atanu Biswas,Noël Veraverbeke,0.965517,0.956522
8,Robert Kohn,Paul Janssen,0.714286,0.692308
9,David Schoenfeld,Stuart J. Pocock,0.9,0.875


In [8]:
author1 = "Boxin Tang"
author_1 = pd.read_csv(f'../Data/{author1}.csv').filter(['author', 'title', 'text'])

In [15]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [18]:
for sentence, embedding in zip(list(author_1.text), embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding.shape)
    print("")

Sentence: Two-level fractional factorial designs are considered under a baseline   parameterization. The criterion of minimum aberration is formulated in   this context and optimal designs under this criterion are investigated.   The underlying theory and the concept of isomorphism turn out to be   significantly different from their counterparts under orthogonal   parameterization, and this is reflected in the optimal designs obtained.
Embedding: (384,)

Sentence: Orthogonal arrays with clear two-factor interactions provide a class of   designs that are robust to nonnegligible effects. If certain prior   knowledge is available, then robust designs allow additional factors to   be studied. This is done through partially clear two-factor   interactions. We study the existence and construction of such robust   designs and present an upper bound on the maximum number of clear   two-factor interactions.
Embedding: (384,)

Sentence: We introduce a method for constructing a rich class of desi

In [16]:
embeddings = model.encode(list(author_1.text))

In [13]:
len(list(author_1.text))

37

In [231]:
df_.to_csv("Hard_pairs_PHC.csv")

In [230]:
df.to_csv("Hard_pairs_chi2.csv")

In [193]:
df

Unnamed: 0,Author 1,Author 2,Accuracy,F1
0,Håvard Rue,Ross Prentice,0.921053,0.941176
0,Douglas A. Wolfe,Ian White,0.8,0.75
0,Chunsheng Ma,Els Goetghebeur,0.863636,0.88
0,M. Elizabeth Halloran,Susan Lewis,0.95,0.962963
0,Gérard Letac,Clarice R. Weinberg,1.0,1.0
0,Wolfgang Härdle,Rahul Mukerjee,0.830508,0.875
0,Friedrich Götze,Bimal Sinha,0.827586,0.871795
0,Kung-jong Lui,Hu 1 Yang,0.814815,0.814815
0,Omer Ozturk,Randy Sitter,0.913043,0.909091
0,Nicholas I. Fisher,Wolfgang Wefelmeyer,0.92,0.916667


In [228]:
if __name__ == "__main__":
#     author1 = "Aiyi Liu"
#     author2 = "David Cox"
    df = pd.DataFrame()
    hard_pairs = []
    while len(df) < 10:
        author1 = random.choice(author_l)
        author2 = random.choice(author_l)
        if author1!=author2 and count_shared_papers(author1,author2,authors,data)==0:

            author_1 = pd.read_csv(f'../Data/{author1}.csv').filter(['author', 'title', 'text'])
            author_2 = pd.read_csv(f'../Data/{author2}.csv').filter(['author', 'title', 'text'])
            n, m = author_1.shape[0], author_2.shape[0]
            if min(n/m, m/n) >= 1/2:
                data_ = pd.concat([clean_text(author_1),
                              clean_text(author_2)], ignore_index=True)

                data_train = data_.sample(frac=0.7)
                data_test = data_.drop(data_train.index)
                vocab = get_vocab(''.join([doc for doc in list(data_train["text"])]))

                text1 = data_train[data_train["author"]==author1]
                text2 = data_train[data_train["author"]==author2]

                text1
                #corpus1 = doc_to_dtm(["".join(list(text1.text))],vocab=vocab)
                corpus1 = doc_to_dtm(list(text1.text),vocab=vocab)
                corpus2 = doc_to_dtm(list(text2.text),vocab=vocab)

                lam_1 = estimate_poisson(corpus1)
                lam_2 = estimate_poisson(corpus2)
                y_pred = []
                for doc in list(data_test["text"]):
                    dtm = doc_to_dtm([doc],vocab=vocab)
                    if np.sum((dtm - lam_1)**2) < np.sum((dtm - lam_2)**2):
                        y_pred.append(author1)
                    else:
                        y_pred.append(author2)


                """Accuracy and F1 score on test set"""
                y_true = list(data_test["author"])
                y_pred = [0 if item==author1 else 1 for item in y_pred]
                y_true = [0 if item==author1 else 1 for item in y_true]
                acc = np.mean(np.array(y_pred)==np.array(y_true))
                f1 = f1_score(y_pred, y_true)
                if acc <= 0.6 and f1 <= 0.6:
                    hard_pairs.append((author1,author2))
                    print(f"TESTING {author1} AGAINST {author2}")
                    print("Accuracy on test set = ",np.mean(np.array(y_pred)==np.array(y_true)))
                    print("f1 score = ",f1_score(y_pred, y_true))
                    print("-----------------------------------------------------------------")
                    df1 = pd.DataFrame({"Author 1":author1,"Author 2":author2,"Accuracy":acc,"F1":f1},index=[0])
                    df = df.append(df1)
                    print(df)

TESTING Han-ying Liang AGAINST Ngai Hang Chan
Accuracy on test set =  0.5454545454545454
f1 score =  0.5454545454545454
-----------------------------------------------------------------
         Author 1        Author 2  Accuracy        F1
0  Han-ying Liang  Ngai Hang Chan  0.545455  0.545455
TESTING Louise Ryan AGAINST Kerrie Mengersen
Accuracy on test set =  0.5
f1 score =  0.43749999999999994
-----------------------------------------------------------------
         Author 1          Author 2  Accuracy        F1
0  Han-ying Liang    Ngai Hang Chan  0.545455  0.545455
0     Louise Ryan  Kerrie Mengersen  0.500000  0.437500
TESTING Jerzy K. Baksalary AGAINST Zehua Chen
Accuracy on test set =  0.5789473684210527
f1 score =  0.6
-----------------------------------------------------------------
             Author 1          Author 2  Accuracy        F1
0      Han-ying Liang    Ngai Hang Chan  0.545455  0.545455
0         Louise Ryan  Kerrie Mengersen  0.500000  0.437500
0  Jerzy K. Baks

  df = df.append(df1)
  df = df.append(df1)
  df = df.append(df1)


TESTING Enrique Schisterman AGAINST Paddy Farrington
Accuracy on test set =  0.5238095238095238
f1 score =  0.5454545454545454
-----------------------------------------------------------------
              Author 1          Author 2  Accuracy        F1
0       Han-ying Liang    Ngai Hang Chan  0.545455  0.545455
0          Louise Ryan  Kerrie Mengersen  0.500000  0.437500
0   Jerzy K. Baksalary        Zehua Chen  0.578947  0.600000
0  Enrique Schisterman  Paddy Farrington  0.523810  0.545455


  df = df.append(df1)


TESTING Brian Caffo AGAINST Lyle Broemeling
Accuracy on test set =  0.55
f1 score =  0.5263157894736842
-----------------------------------------------------------------
              Author 1          Author 2  Accuracy        F1
0       Han-ying Liang    Ngai Hang Chan  0.545455  0.545455
0          Louise Ryan  Kerrie Mengersen  0.500000  0.437500
0   Jerzy K. Baksalary        Zehua Chen  0.578947  0.600000
0  Enrique Schisterman  Paddy Farrington  0.523810  0.545455
0          Brian Caffo   Lyle Broemeling  0.550000  0.526316


  df = df.append(df1)


TESTING Myles Hollander AGAINST Victor De Gruttola
Accuracy on test set =  0.5
f1 score =  0.56
-----------------------------------------------------------------
              Author 1            Author 2  Accuracy        F1
0       Han-ying Liang      Ngai Hang Chan  0.545455  0.545455
0          Louise Ryan    Kerrie Mengersen  0.500000  0.437500
0   Jerzy K. Baksalary          Zehua Chen  0.578947  0.600000
0  Enrique Schisterman    Paddy Farrington  0.523810  0.545455
0          Brian Caffo     Lyle Broemeling  0.550000  0.526316
0      Myles Hollander  Victor De Gruttola  0.500000  0.560000
TESTING Paul Gustafson AGAINST Noel Cressie
Accuracy on test set =  0.5
f1 score =  0.4864864864864865
-----------------------------------------------------------------
              Author 1            Author 2  Accuracy        F1
0       Han-ying Liang      Ngai Hang Chan  0.545455  0.545455
0          Louise Ryan    Kerrie Mengersen  0.500000  0.437500
0   Jerzy K. Baksalary          Zehua C

  df = df.append(df1)
  df = df.append(df1)
  df = df.append(df1)


TESTING Robert Kohn AGAINST Paul Janssen
Accuracy on test set =  0.5714285714285714
f1 score =  0.5714285714285714
-----------------------------------------------------------------
              Author 1            Author 2  Accuracy        F1
0       Han-ying Liang      Ngai Hang Chan  0.545455  0.545455
0          Louise Ryan    Kerrie Mengersen  0.500000  0.437500
0   Jerzy K. Baksalary          Zehua Chen  0.578947  0.600000
0  Enrique Schisterman    Paddy Farrington  0.523810  0.545455
0          Brian Caffo     Lyle Broemeling  0.550000  0.526316
0      Myles Hollander  Victor De Gruttola  0.500000  0.560000
0       Paul Gustafson        Noel Cressie  0.500000  0.486486
0         Atanu Biswas    Noël Veraverbeke  0.586207  0.571429
0          Robert Kohn        Paul Janssen  0.571429  0.571429
TESTING David Schoenfeld AGAINST Stuart J. Pocock
Accuracy on test set =  0.6
f1 score =  0.5555555555555556
-----------------------------------------------------------------
              

  df = df.append(df1)
  df = df.append(df1)


In [23]:
hard_pairs

NameError: name 'hard_pairs' is not defined

In [25]:
pvals = 1 - norm.cdf(z_n)
HC = HC(list(pvals))
np.array(vocab)[pvals <= HC]

array(['the', 'of', 'a', 'to', 'and', 'is', 'in', 'for', 'we', 'are',
       'with', 'data', 'model', 'that', 'as', 'models', 'this', 'on',
       'be', 'an', 'spatial', 'by', 'process', 'from', 'using',
       'approach', 'which', 'at', 'bayesian', 'can', 'such',
       'distribution', 'two', 'or', 'random', 'mixture', 'each', 'our',
       'effects', 'it', 'modeling', 'analysis', 'these', 'given',
       'regression', 'used', 'algorithm', 'set', 'hierarchical', 'one',
       'posterior', 'distributions', 'inference', 'sample', 'fitting',
       'not', 'problem', 'species', 'have', 'where', 'level', 'also',
       'when', 'information', 'methods', 'more', 'proposed', 'error',
       'results', 'based', 'observed', 'performance', 'been', 'function',
       'interest', 'number', 'over', 'well', 'class', 'functions',
       'population', 'provide', 'sampling', 'through', 'both', 'case',
       'conditional', 'estimation', 'has', 'illustrate', 'paper',
       'probability', 'test', 'varia

In [162]:
len(author_l)

729