In [18]:
import pandas as pd
import numpy as np 
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import HTML
import re

In [19]:
!ls

Collaboration_trends.ipynb            data-import-NCBI.ipynb
Data_Analysis_and_Visualization.ipynb data-import-visualization.ipynb
Data_Processing.ipynb                 test2_file.txt
data-import-NCBI-entrez.ipynb


In [20]:
!cat test2_file.txt | head -10

Id	ArticleIds	AuthorList	DOI	EPubDate	FullJournalName	HasAbstract	LastAuthor	NlmUniqueID	PubDate	PubTypeList	RecordStatus	Source	Title
31734938	DictElement({'pubmed': ['31734938'], 'medline': [], 'doi': '10.1007/s13555-019-00342-1', 'pii': '10.1007/s13555-019-00342-1', 'rid': '31734938', 'eid': '31734938'}, attributes={})	['Ly K', 'Chang AY', 'Kiprono SK', 'Jose M', 'Smith MP', 'Beck K', 'Sekhon S', 'Muraguri I', 'Mungai M', 'Coates SJ', 'Thibodeaux Q', 'Hulse S', 'Gualberto M', 'Jeon C', 'Nakamura M', 'Bhutani T', 'Maurer T', 'Liao W']	10.1007/s13555-019-00342-1	2019 Nov 16	Dermatology and therapy	1	Liao W	101590450	2019 Nov 16	['Journal Article']	PubMed - as supplied by publisher	Dermatol Ther (Heidelb)	Implementation of an Ultraviolet Phototherapy Service at a National Referral Hospital in Western Kenya: Reflections on Challenges and Lessons Learned.
31734704	DictElement({'pubmed': ['31734704'], 'medline': [], 'pii': '5627697', 'doi': '10.1093/cid/ciz1104', 'rid': '31734704', 'eid

In [21]:
!pwd

/Users/aoi-rain/Desktop/Data-mining/Notebooks


In [22]:
# import a tab separated file
papers = pd.read_csv("/Users/aoi-rain/Desktop/Data-mining/Data/kenyan_papers_details.txt", sep = "\t")

In [23]:
#profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})

In [24]:
#profile.to_widgets()

In [25]:
# See all the columns in the dataframe 
papers.columns

Index(['Id', 'AuthorList', 'DOI', 'EPubDate', 'FullJournalName', 'HasAbstract',
       'LastAuthor', 'NlmUniqueID', 'PubDate', 'PubTypeList', 'RecordStatus',
       'Source', 'Title'],
      dtype='object')

In [26]:
# See the first 20 items in the column author list
papers.AuthorList[:20]

0                            ['Achieng L', 'Riedel DJ']
1     ['Letizia A', 'Eller MA', 'Polyak C', 'Eller L...
2     ['Lalani T', 'Tisdale MD', 'Liu J', 'Mitra I',...
3     ['Ayieko J', 'Brown L', 'Anthierens S', 'Van R...
4     ['Golicha Q', 'Shetty S', 'Nasiblov O', 'Husse...
5     ['Marshall F', 'Reid REB', 'Goldstein S', 'Sto...
6     ['Ahlberg S', 'Grace D', 'Kiarie G', 'Kirino Y...
7     ['Afulani PA', 'Diamond-Smith N', 'Phillips B'...
8     ['Bitta MA', 'Bakolis I', 'Kariuki SM', 'Nyutu...
9     ['Odero I', "Ondeng'e K", 'Mudhune V', 'Okola ...
10    ['Akombi BJ', 'Ghimire PR', 'Agho KE', 'Renzah...
11    ["O'Brien TG", 'Kinnaird MF', 'Ekwanga S', 'Wi...
12                                                   []
13            ['Muchene KW', 'Mageto IG', 'Cheptum JJ']
14    ['Ogali IN', 'Wamuyu LW', 'Lichoti JK', 'Mungu...
15                 ['Chiyo PI', 'Obanda V', 'Korir DK']
16         ['Barreiro G', 'Mellin-Olsen J', 'Litswa L']
17    ['Kumar N', 'Gupta R', 'Sayed S', 'Moloo Z

In [27]:
# remove any weird characters in the column for author 
remove_quote = papers.AuthorList.str.replace("'", "")
remove_brac = remove_quote.str.replace("[", "")
remove_close = remove_brac.str.replace("]", "")

# convert it to a list for other steps 
author_collab = list(remove_close)

In [28]:
collab = author_collab.__len__()

How this method works: The Term frequency inverse document frequency(Tf-idf) works by finding the 

$$ term frequency/length of document$$ 

It can answer the question what's the probability of observing a piece of text. If the probability is close to one then the term is frequent and the next step could be to look into the author name's institution in the paper they collaborated writing thus finding collaboration trends and inter-institutional collaborations.

In [29]:
# Initialize TFIdf model with min_df set to 1
vectorizer = TfidfVectorizer(min_df=1)

# fit and transform the data
model = vectorizer.fit_transform(remove_close)

# return a numpy matrix with spaces filled with zeros done by the .todense method
# change all our numbers to 2 dp
print(model.todense().round(2))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
# find the mean across each nested array and convert it to a list
weights = np.asarray(model.mean(axis=0)).ravel().tolist()

# Take the output of the Tfidf vectorizer and the weights and turn it into a dataframe
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'weight': weights})

# take the weights_df and sort by weight in descending order and return the first 20 values
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
4428,jm,0.008752
4303,ja,0.007905
10565,van,0.007765
2194,de,0.007548
9715,sm,0.007435
380,am,0.007192
2542,ea,0.006304
10827,wang,0.005687
5799,ma,0.005606
2656,em,0.004871


In [31]:
# see all the output of the rows of the weight df column
HTML(weights_df.sort_values(by='weight', ascending=False).to_html())

Unnamed: 0,term,weight
4428,jm,0.008752
4303,ja,0.007905
10565,van,0.007765
2194,de,0.007548
9715,sm,0.007435
380,am,0.007192
2542,ea,0.006304
10827,wang,0.005687
5799,ma,0.005606
2656,em,0.004871


In [49]:
# subset list to contain what's in the string
result = filter(lambda x: "Wambua J" in x , author_collab)
print(list(result))

['Gichangi A, Wambua J, Mutwiwa S, Njogu R, Bazant E, Wamicwe J, Wafula R, Vrana CJ, Stevens DR, Mudany M, Korte JE', 'Brent AJ, Nyundo C, Langat J, Mulunda C, Wambua J, Bauni E, Sande J, Park K, Williams TN, Newton CRJ, Levin M, Scott JAG, KIDS TB Study Group.', 'Payne RO, Silk SE, Elias SC, Miura K, Diouf A, Galaway F, de Graaf H, Brendish NJ, Poulton ID, Griffiths OJ, Edwards NJ, Jin J, Labbé GM, Alanine DG, Siani L, Di Marco S, Roberts R, Green N, Berrie E, Ishizuka AS, Nielsen CM, Bardelli M, Partey FD, Ofori MF, Barfod L, Wambua J, Murungi LM, Osier FH, Biswas S, McCarthy JS, Minassian AM, Ashfield R, Viebig NK, Nugent FL, Douglas AD, Vekemans J, Wright GJ, Faust SN, Hill AV, Long CA, Lawrie AM, Draper SJ', 'Mogeni P, Williams TN, Omedo I, Kimani D, Ngoi JM, Mwacharo J, Morter R, Nyundo C, Wambua J, Nyangweso G, Kapulu M, Fegan G, Bejon P']


In [46]:
# confirmation of the word
# aim is to get to index 9162 sa 0.003904
author_collab

['Achieng L, Riedel DJ',
 'Letizia A, Eller MA, Polyak C, Eller LA, Creegan M, Dawson P, Bryant C, Dohoon K, Crowell TA, Lombardi K, Rono E, Robb ML, Michael NL, Maswai J, Ake JA',
 'Lalani T, Tisdale MD, Liu J, Mitra I, Philip C, Odundo E, Reyes F, Simons MP, Fraser JA, Hutley E, Connor P, Swierczewski BE, Houpt E, Tribble DR, Riddle MS',
 'Ayieko J, Brown L, Anthierens S, Van Rie A, Getahun M, Charlebois ED, Petersen ML, Clark TD, Kamya MR, Cohen CR, Bukusi EA, Havlir DV, Camlin CS',
 'Golicha Q, Shetty S, Nasiblov O, Hussein A, Wainaina E, Obonyo M, Macharia D, Musyoka RN, Abdille H, Ope M, Joseph R, Kabugi W, Kiogora J, Said M, Boru W, Galgalo T, Lowther SA, Juma B, Mugoh R, Wamola N, Onyango C, Gura Z, Widdowson MA, DeCock KM, Burton JW',
 'Marshall F, Reid REB, Goldstein S, Storozum M, Wreschnig A, Hu L, Kiura P, Shahack-Gross R, Ambrose SH',
 'Ahlberg S, Grace D, Kiarie G, Kirino Y, Lindahl J',
 'Afulani PA, Diamond-Smith N, Phillips B, Singhal S, Sudhinaraset M',
 'Bitta MA, Ba

# Common or collaborative authors
Jm: Castellote JM

Ja: Rajasekaran S, Hajjaj-Hassouni N, Berkley JA, Scott JAG, Mott JA, Otieno JA

van: Van der Horst, Vanlauwe B, van Baar A, van der Straten A

de: Haldeman S, [Mogunde J & Burmen BK], Deino AL

sm: Mesman J, Behrensmeyer AK

am: Noor AM, Wambua J