## In this file:
- Using data of 1702 frequent authors (having 3 or more than 3 papers in the data sets)
- Model: NB


In [1]:
import numpy as np
import pandas as pd
from os import getcwd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import os
import pickle
from collections import Counter
from sklearn import  naive_bayes, pipeline
from sklearn.metrics import accuracy_score

In [2]:

def set_path():
    abspath = getcwd()
    dname = os.path.dirname(abspath)
    os.chdir(dname)
set_path()

In [3]:
df_train = pd.read_pickle('data/processed/train_3pp_df.pkl')
df_val = pd.read_pickle('data/processed/val_3pp_df.pkl')
df_train.shape, df_val.shape


((5984, 2), (500, 2))

#### Bag of words:

Get x common words out of each author's corpus

In [4]:
# checking number of unique words in each author's corpus 
def get_author_unique_corpus(df):
   listauthor = df['authId_enc'].unique()
   corpus_auth = {}
   for au in listauthor:
      totalword = []
      auth_content = df.loc[df['authId_enc'] == au, 'content']
      for art in auth_content:
         wordlist = art.split()
         totalword += wordlist
      corpus_auth[au] = list(set(totalword))
   return corpus_auth
corpus_by_author = get_author_unique_corpus(df_train)

num_corpus = {k:len(v) for k, v in corpus_by_author.items() }
max_uniq_w = max(num_corpus, key = num_corpus.get)
print(f'author {max_uniq_w} has maximum unique words: {num_corpus[max_uniq_w]}' )
min_uniq_w = min(num_corpus, key = num_corpus.get)
print(f'author {min_uniq_w} has maximum unique words: {num_corpus[min_uniq_w]}' )
print(f'average number of unique words per author is {np.mean(list(num_corpus.values()))}' )




author 4261 has maximum unique words: 479
author 849 has maximum unique words: 29
average number of unique words per author is 152.2855464159812


### Feature engineering and selection with Tfidf


In [5]:

vectorizer = TfidfVectorizer(max_features=3000, )
corpus = df_train['content']
vectorizer.fit(corpus)
Xtrain = vectorizer.transform(corpus)


In [6]:
# feature selection by chi2
Xnames = vectorizer.get_feature_names_out() ## original full corpus
p_value_limit = 0.95
df_features = pd.DataFrame()
for author in np.unique(df_train['authId_enc']):
   X= Xtrain.toarray()
   Y = df_train['authId_enc'] ==author
   chi, p = chi2(X, Y)
   df_features = pd.concat([df_features, 
   pd.DataFrame({'feature': Xnames, 'score' : 1-p, 'authId_enc': author})])

   df_features = df_features[df_features['score'] > p_value_limit]
df_features = df_features.sort_values(['authId_enc', 'score'],
                                          ascending = [True, False])
Xnames = df_features['feature'].unique().tolist() # selected words for new corpus from feature selection

In [7]:
count_uniq_w_author = Counter(df_features["authId_enc"])
min_auth = min(count_uniq_w_author, key = count_uniq_w_author.get)
print(f'author {min_auth} has the least unique chosen words {count_uniq_w_author[min_auth]} ')

author 904 has the least unique chosen words 9 


In [8]:
# PRINT out selected feature by author
for author in np.unique(df_train['authId_enc'])[:10]:
   print("# {}:".format(author))
   print("  . selected features:",
         len(df_features[df_features["authId_enc"]==author]))
   print("  . top features:", ",".join(
df_features[df_features["authId_enc"]==author]["feature"].values[:10]))
   print(" ")     

# 0:
  . selected features: 26
  . top features: instanti,situat,distanc,googl,anaphor,parametr,bridg,violat,largest,pronomin
 
# 1:
  . selected features: 21
  . top features: monitor,environ,familiar,rise,finnish,morpholog,thank,rich,modif,inflect
 
# 3:
  . selected features: 27
  . top features: cohes,electron,themat,thesaurus,criterion,index,late,dictionari,colloc,brought
 
# 5:
  . selected features: 41
  . top features: agenda,selfreport,discoveri,diagnosi,ptsd,impract,clpsych,englishfrench,semiautomat,quot
 
# 9:
  . selected features: 31
  . top features: framesemant,meme,fake,timelin,texton,first,lowrank,probabilist,arc,reinforc
 
# 11:
  . selected features: 32
  . top features: explain,spectrum,expos,arguabl,proofofconcept,burden,countbas,diagnos,end,expens
 
# 12:
  . selected features: 34
  . top features: contextfre,tabular,deduct,bilex,grammar,latentvari,prefix,algorithm,pars,termin
 
# 14:
  . selected features: 29
  . top features: download,nonparallel,mix,queri,secon

In [9]:
len(Xnames)

2964

### Fit vectorizer in the new corpus

In [10]:
vectorizer = TfidfVectorizer(vocabulary=Xnames)
vectorizer.fit(corpus)
Xtrain_new = vectorizer.transform(corpus)
ytrain = df_train['authId_enc'].values
yval= df_val['authId_enc'].values

In [11]:
classifierNB = naive_bayes.MultinomialNB()
## pipeline
modelNB = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])
## train classifier
modelNB["classifier"].fit(Xtrain_new, ytrain)
## test
Xval = df_val['content'].values
predicted = modelNB.predict(Xval)
predicted_prob = modelNB.predict_proba(Xval)

In [12]:
accuracy = accuracy_score(yval, predicted)


In [13]:
accuracy

0.01

### Fine tune MultiNomial NB model with Gridsearch

In [14]:
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer()
classifierNB = naive_bayes.MultinomialNB()
modelNB = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

In [15]:
from sklearn.model_selection import GridSearchCV
grid_params = {
  'classifier__alpha': np.linspace(0.5, 1.5, 3),
  'classifier__fit_prior': [True, False],
  'vectorizer__max_df': [0.1, 0.5, 1],
  'vectorizer__binary': [True, False],
  'vectorizer__norm': [None, 'l1', 'l2'], 
  'vectorizer__max_features': [1000, 1500, 2000, 2500, 3000],
}
clf = GridSearchCV(modelNB, grid_params, cv = 2, scoring='accuracy')
clf.fit(Xtrain, ytrain)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('classifier', MultinomialNB())]),
             param_grid={'classifier__alpha': array([0.5, 1. , 1.5]),
                         'classifier__fit_prior': [True, False],
                         'vectorizer__binary': [True, False],
                         'vectorizer__max_df': [0.1, 0.5, 1],
                         'vectorizer__max_features': [1000, 1500, 2000, 2500,
                                                      3000],
                         'vectorizer__norm': [None, 'l1', 'l2']},
             scoring='accuracy')

In [16]:
clf.best_params_, clf.best_score_

({'classifier__alpha': 0.5,
  'classifier__fit_prior': False,
  'vectorizer__binary': False,
  'vectorizer__max_df': 0.5,
  'vectorizer__max_features': 3000,
  'vectorizer__norm': None},
 0.1602606951871658)

In [17]:
Xval = df_val['content'].values
predicted = clf.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.214

In [18]:
# Trail with max_feature = 2500
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer()
classifierNB = naive_bayes.MultinomialNB()
modelNB = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])
from sklearn.model_selection import GridSearchCV
grid_params = {
  'classifier__alpha': np.linspace(0.5, 1.5, 3),
  'classifier__fit_prior': [True, False],
  'vectorizer__max_df': [0.1, 0.5, 1],
  'vectorizer__binary': [True, False],
  'vectorizer__norm': [None, 'l1', 'l2'], 
  'vectorizer__max_features': [1000, 1500, 2000, 2500,]
}
clf = GridSearchCV(modelNB, grid_params, cv = 2, scoring='accuracy')
clf.fit(Xtrain, ytrain)
print('Best parameter\n', clf.best_params_,'\nBest score\n', clf.best_score_)
Xval = df_val['content'].values
predicted = clf.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy


Best parameter
 {'classifier__alpha': 0.5, 'classifier__fit_prior': False, 'vectorizer__binary': False, 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2500, 'vectorizer__norm': None} 
Best score
 0.15925802139037432


0.208

In [19]:
# Trail with max_feature = 3500
Xtrain = df_train['content']
ytrain = df_train['authId_enc'].values
vectorizer = TfidfVectorizer()
classifierNB = naive_bayes.MultinomialNB()
modelNB = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])
from sklearn.model_selection import GridSearchCV
grid_params = {
  'classifier__alpha': np.linspace(0.5, 1.5, 3),
  'classifier__fit_prior': [True, False],
  'vectorizer__max_df': [0.1, 0.5, 1],
  'vectorizer__binary': [True, False],
  'vectorizer__norm': [None, 'l1', 'l2'], 
  'vectorizer__max_features': [2500,3000, 3500]
}
clf = GridSearchCV(modelNB, grid_params, cv = 2, scoring='accuracy')
clf.fit(Xtrain, ytrain)




GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('classifier', MultinomialNB())]),
             param_grid={'classifier__alpha': array([0.5, 1. , 1.5]),
                         'classifier__fit_prior': [True, False],
                         'vectorizer__binary': [True, False],
                         'vectorizer__max_df': [0.1, 0.5, 1],
                         'vectorizer__max_features': [2500, 3000, 3500],
                         'vectorizer__norm': [None, 'l1', 'l2']},
             scoring='accuracy')

In [30]:
print('Best parameter')
clf.best_params_

Best parameter


{'classifier__alpha': 0.5,
 'classifier__fit_prior': False,
 'vectorizer__binary': False,
 'vectorizer__max_df': 0.5,
 'vectorizer__max_features': 3500,
 'vectorizer__norm': None}

In [21]:
print('Best score')
clf.best_score_

Best score


0.16059491978609625

In [22]:

Xval = df_val['content'].values
predicted = clf.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.202

#### chosen model 

In [23]:
df_combined = pd.concat([df_train, df_val], ignore_index=True, axis=0)
df_combined.shape


(6484, 2)

In [24]:
Xcombined = df_combined['content']
ycombined = df_combined['authId_enc'].values
vectorizer = TfidfVectorizer(max_df=0.5,
                              max_features=3000,
                              norm=None)
classifierNB = naive_bayes.MultinomialNB(fit_prior=False,alpha=0.5)
modelNB_final= pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

modelNB_final.fit(Xcombined, ycombined)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=3000, norm=None)),
                ('classifier', MultinomialNB(alpha=0.5, fit_prior=False))])

In [25]:
df_test = pd.read_pickle('data/processed/test_clean_df.pkl')

# get lable encoder of author
with open('code/authorIdlabel_3papers.pkl', 'rb') as f:
   authorId_encoder = pickle.load(file = f)


In [26]:
Xtest = df_test['content'].values
predicted = modelNB_final.predict(Xtest)
predictauthorId = authorId_encoder.inverse_transform(predicted)
len(predictauthorId)


6531

In [27]:
predictauthorId

array([   1821892,   33464127, 1390037280, ...,    2814303,    1763912,
         40895369], dtype=int64)

array([  33524946,   33464127, 1390037280, ...,    2814303,    1763912,
         67284811], dtype=int64)

In [28]:
predictauthorId = list(map(str, predictauthorId))
df_test['authorId']= predictauthorId
df_test = df_test[['paperId', 'authorId']].copy()
df_test.head()

Unnamed: 0,paperId,authorId
0,86e1aaa0c47659e08a896e9889384eb1e5401e6a,1821892
1,8d3076c38f56df22052567f4783c670d8e860f09,33464127
2,7c400ee676d427eeda1aad5c1c54c316f0b9773d,1390037280
3,185e7d2a761594451b02ace240356dadad2aef78,3017695
4,e4363d077a890c8d5c5e66b82fe69a1bbbdd5c80,49889487


In [29]:
df_test.to_json('data/processed/predicted.json', orient="records")

In [31]:
# Testing final model 
Xtrain, Xval = df_train['content'].values, df_val['content'].values
ytrain, yval = df_train['authId_enc'].values, df_val['authId_enc'].values

vectorizer = TfidfVectorizer(max_df=0.5,
                              max_features=3000,
                              norm=None)
classifierNB = naive_bayes.MultinomialNB(fit_prior=False,alpha=0.5)
modelNB_final= pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifierNB)])

modelNB_final.fit(Xtrain, ytrain)


predicted = modelNB_final.predict(Xval)
accuracy = accuracy_score(yval, predicted)
accuracy

0.214