In [1]:
import numpy as np
import pandas as pd
from os import getcwd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import os
import pickle
from collections import Counter
from sklearn import  naive_bayes, pipeline
from sklearn.metrics import accuracy_score

In [2]:

def set_path():
    abspath = getcwd()
    dname = os.path.dirname(abspath)
    os.chdir(dname)
set_path()

In [3]:
df_train = pd.read_pickle('data/processed/train_clean_df.pkl')
df_val = pd.read_pickle('data/processed/val_clean_df.pkl')


#### Bag of words:

Get x common words out of each author's corpus

In [4]:
# checking number of unique words in each author's corpus 
def get_author_unique_corpus(df):
   listauthor = df['authId_enc'].unique()
   corpus_auth = {}
   for au in listauthor:
      totalword = []
      auth_content = df.loc[df['authId_enc'] == au, 'content']
      for art in auth_content:
         wordlist = art.split()
         totalword += wordlist
      corpus_auth[au] = list(set(totalword))
   return corpus_auth
corpus_by_author = get_author_unique_corpus(df_train)

num_corpus = {k:len(v) for k, v in corpus_by_author.items() }
max_uniq_w = max(num_corpus, key = num_corpus.get)
print(f'author {max_uniq_w} has maximum unique words: {num_corpus[max_uniq_w]}' )
min_uniq_w = min(num_corpus, key = num_corpus.get)
print(f'author {min_uniq_w} has maximum unique words: {num_corpus[min_uniq_w]}' )
print(f'average number of unique words per author is {np.mean(list(num_corpus.values()))}' )




author 198 has maximum unique words: 465
author 1407 has maximum unique words: 12
average number of unique words per author is 91.06595555555556


In [5]:
df_train.head()

Unnamed: 0,content,authId_enc
0,factual effici integr relev fact visual questi...,4134
1,limitbert linguist inform multitask bert paper...,2444
2,dataeffici languag shape fewshot imag classif ...,4533
3,querydriven topic model topic model unsupervis...,5178
4,extract abstract explan factcheck evalu news p...,4884


In [6]:
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer(max_df=0.5,
                              max_features=3000,
                              norm=None)
classifierNB = naive_bayes.MultinomialNB(fit_prior=False,alpha=0.5)
modelNB_3000= pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

modelNB_3000.fit(Xtrain, ytrain)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=3000, norm=None)),
                ('classifier', MultinomialNB(alpha=0.5, fit_prior=False))])

In [7]:
Xval = df_val['content'].values
yval= df_val['authId_enc'].values
predicted = modelNB_3000.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.17332549941245592

##### Increasing the size of corpus

In [8]:
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer(max_df=0.5,
                              max_features=4000,
                              norm=None)
classifierNB = naive_bayes.MultinomialNB(fit_prior=False,alpha=0.5)
modelNB_4000= pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

modelNB_4000.fit(Xtrain, ytrain)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=4000, norm=None)),
                ('classifier', MultinomialNB(alpha=0.5, fit_prior=False))])

In [9]:
Xval = df_val['content'].values
yval= df_val['authId_enc'].values
predicted = modelNB_4000.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.17508813160987075

##### EXTENDING CORPUS

In [10]:
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer(max_df=0.5,
                              max_features=5000,
                              norm=None)
classifierNB = naive_bayes.MultinomialNB(fit_prior=False,alpha=0.5)
modelNB_5000= pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

modelNB_5000.fit(Xtrain, ytrain)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=5000, norm=None)),
                ('classifier', MultinomialNB(alpha=0.5, fit_prior=False))])

In [11]:
Xval = df_val['content'].values
yval= df_val['authId_enc'].values
predicted = modelNB_5000.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.17391304347826086

#### STACKING DATA FOR REFIT CHOSEN MODEL

In [12]:
df_combined = pd.concat([df_train, df_val], ignore_index=True, axis=0)
df_combined.shape


(12119, 2)

In [13]:
Xcombined = df_combined['content'].values
ycombined = df_combined['authId_enc'].values
modelNB_4000.fit(Xcombined, ycombined)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=4000, norm=None)),
                ('classifier', MultinomialNB(alpha=0.5, fit_prior=False))])

In [14]:
df_test = pd.read_pickle('data/processed/test_clean_df.pkl')

# get lable encoder of author
with open('code/authorIdlabel.pkl', 'rb') as f:
   authorId_encoder = pickle.load(file = f)


In [15]:
Xtest = df_test['content'].values
predicted = modelNB_4000.predict(Xtest)
predictauthorId = authorId_encoder.inverse_transform(predicted)
len(predictauthorId)


6531

In [16]:
predictauthorId = list(map(str, predictauthorId))
df_test['authorId']= predictauthorId
df_test = df_test[['paperId', 'authorId']].copy()
df_test.head()

Unnamed: 0,paperId,authorId
0,86e1aaa0c47659e08a896e9889384eb1e5401e6a,1821892
1,8d3076c38f56df22052567f4783c670d8e860f09,1916865
2,7c400ee676d427eeda1aad5c1c54c316f0b9773d,1390037280
3,185e7d2a761594451b02ace240356dadad2aef78,51518773
4,e4363d077a890c8d5c5e66b82fe69a1bbbdd5c80,49889487


In [17]:
df_test.to_json('data/processed/predicted_full.json', orient="records")

In [6]:
# feature selection by chi2
Xnames = vectorizer.get_feature_names_out() ## original full corpus
p_value_limit = 0.95
df_features = pd.DataFrame()
for author in np.unique(df_train['authId_enc']):
   X= Xtrain.toarray()
   Y = df_train['authId_enc'] ==author
   chi, p = chi2(X, Y)
   df_features = pd.concat([df_features, 
   pd.DataFrame({'feature': Xnames, 'score' : 1-p, 'authId_enc': author})])

   df_features = df_features[df_features['score'] > p_value_limit]
df_features = df_features.sort_values(['authId_enc', 'score'],
                                          ascending = [True, False])
Xnames = df_features['feature'].unique().tolist() # selected words for new corpus from feature selection

In [None]:
# PRINT out selected feature by author
for author in np.unique(df_train['authId_enc'])[:10]:
   print("# {}:".format(author))
   print("  . selected features:",
         len(df_features[df_features["authId_enc"]==author]))
   print("  . top features:", ",".join(
df_features[df_features["authId_enc"]==author]["feature"].values[:10]))
   print(" ")

# 12:
  . selected features: 11
  . top features: pars,grammar,algorithm,contextfre grammar,contextfre,tabular,pars algorithm,tabular pars,prefix,argument
 
# 241:
  . selected features: 12
  . top features: vector,crosslingu,vector space,space,word vector,specialis,lexic,monolingu,lexic entail,postprocess
 
# 257:
  . selected features: 4
  . top features: morpholog,segment,morpholog segment,morpholog tag
 
# 1307:
  . selected features: 10
  . top features: nli,nativ languag,ensembl,languag identif,nativ,hash,classifi,crosscorpus,identif,metaclassifi
 
# 1776:
  . selected features: 7
  . top features: lowresourc,encoderdecod,paradigm,characterbas,set,develop set,morpholog reinflect
 
# 2204:
  . selected features: 13
  . top features: news,user,metaphor,recommend,behavior,news recommend,emoji,multihead,multihead selfattent,selfattent
 
# 2905:
  . selected features: 10
  . top features: convolut,paraphras,paraphras identif,phrase,minibatch,0shottc,chunk,multigrancnn,convolut neural,

In [7]:
len(Xnames)

4985

### Fit vectorizer in the new corpus

In [8]:
vectorizer = TfidfVectorizer(vocabulary=Xnames)
vectorizer.fit(corpus)
Xtrain_new = vectorizer.transform(corpus)
ytrain = df_train['authId_enc'].values
yval= df_val['authId_enc'].values

In [9]:
classifierNB = naive_bayes.MultinomialNB()
## pipeline
modelNB = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])
## train classifier
modelNB["classifier"].fit(Xtrain_new, ytrain)
## test
Xval = df_val['content'].values
predicted = modelNB.predict(Xval)
predicted_prob = modelNB.predict_proba(Xval)

In [11]:
accuracy = accuracy_score(yval, predicted)


In [12]:
accuracy

0.022