In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
# set up directory paths
RAW_DATA_DIR = '../data/raw/'
PROCESSED_DATA_DIR = '../data/processed/'

aact_data_path = 'aact_folder/aact_data'
author_publications = 'scholarly/authors_published_03.csv'

In [7]:
aact_df = pd.read_csv(os.path.join(RAW_DATA_DIR, aact_data_path))

In [8]:
aact_df.head()

Unnamed: 0,nct_id,first_date,facility_name,city,investigator,investigator_role,sponsor_name,lead_or_collaborator,downcase_name,description
0,NCT06165614,2023-11-21,Lumumba Sub-County Hospital,Kisumu,"Jackton Omoto, MD",Principal Investigator,UNC Lineberger Comprehensive Cancer Center,lead,cervix cancer,"(37275751,NCT06165614,""Despite being preventab..."
1,NCT06165614,2023-11-21,Lumumba Sub-County Hospital,Kisumu,"Jackton Omoto, MD",Principal Investigator,Gilead Sciences,collaborator,cervical precancer,"(37275751,NCT06165614,""Despite being preventab..."
2,NCT06165614,2023-11-21,Lumumba Sub-County Hospital,Kisumu,"Jackton Omoto, MD",Principal Investigator,Gilead Sciences,collaborator,precancerous conditions,"(37275751,NCT06165614,""Despite being preventab..."
3,NCT06165614,2023-11-21,Lumumba Sub-County Hospital,Kisumu,"Jackton Omoto, MD",Principal Investigator,Gilead Sciences,collaborator,cervix neoplasm,"(37275751,NCT06165614,""Despite being preventab..."
4,NCT06165614,2023-11-21,Lumumba Sub-County Hospital,Kisumu,"Jackton Omoto, MD",Principal Investigator,Gilead Sciences,collaborator,cervix cancer,"(37275751,NCT06165614,""Despite being preventab..."


In [4]:
# Retrive investigator details
investigator = aact_df.groupby('investigator')[['nct_id','sponsor_name']].first()
investigator_df = investigator.reset_index()

investigator_df = investigator_df.sort_values(by='nct_id')

In [5]:
## CLEAN INVESTIGATOR COLUMN

# convert to lower case
investigator_df['investigator'] = investigator_df['investigator'].str.lower()

# remove periods
investigator_df['investigator'] = investigator_df['investigator'].str.replace(r'[.,]', '', regex=True)

# reomve titles
title_strings = ['md', 'phd', 'mmed', 'dph', 'md','prof', 'dnb','msc','mph','mbchb','mbbs','frcp']
regex_pattern = '|'.join(title_strings)

investigator_df['investigator']  = investigator_df['investigator'] .str.replace(regex_pattern, '', regex=True).str.strip()

In [4]:
# Save Processed Data

def save_processed_data(data, data_path):
    #create a dir if it does not exist
    path = os.path.join(PROCESSED_DATA_DIR, data_path.split('/')[0])
    if not os.path.exists(path):
        os.makedirs(path)
    saved_path = os.path.join(PROCESSED_DATA_DIR, f'{data_path}.csv')
    data.to_csv(saved_path, index=False)
    
    # Get the absolute path
    abs_path = os.path.abspath(saved_path)

    print("Saved to ✅:", abs_path)

In [5]:
save_processed_data(investigator_df, 'authors/author')

NameError: name 'investigator_df' is not defined

In [4]:
## testing data extraction
from scholarly import scholarly

search = scholarly.search_author('Jackton Omoto')
author = next(search)

# print(author)

authors = scholarly.fill(author)
for publication in authors['publications']:
    print(publication)

{'container_type': 'Publication', 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>, 'bib': {'title': 'Menstrual cups and sanitary pads to reduce school attrition, and sexually transmitted and reproductive tract infections: a cluster randomised controlled feasibility study in …', 'pub_year': '2016', 'citation': 'BMJ open 6 (11), e013229, 2016'}, 'filled': False, 'author_pub_id': 'Gp1EC6cAAAAJ:u5HHmVD_uO8C', 'num_citations': 188, 'citedby_url': 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=731280057985025706', 'cites_id': ['731280057985025706']}
{'container_type': 'Publication', 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>, 'bib': {'title': "Adolescent schoolgirls' experiences of menstrual cups and pads in rural western Kenya: a qualitative study", 'pub_year': '2015', 'citation': 'Waterlines, 15-30, 2015'}, 'filled': False, 'author_pub_id': 'Gp1EC6cAAAAJ:9yKSN-GCB0IC', 'num_citations': 131, 'citedby_url': '

# Transform Author, Publication and Journal Data

In [6]:
# get author publications data from RAW folder
author_publications_df = pd.read_csv(os.path.join(RAW_DATA_DIR, author_publications))

In [7]:
publications = author_publications_df.loc[:,['investigator','bib','num_citations']]
publications['bib'].head()

0    {'title': 'Guillain-Barré syndrome related to ...
1    {'title': 'The chemotherapy-induced peripheral...
2    {'title': 'Chemotherapy-induced peripheral neu...
3    {'title': 'Clinical pattern and associations o...
4    {'title': 'Understanding the quality of life (...
Name: bib, dtype: object

In [8]:
# extract data from the json format in bib column
import ast
publications['bib'] = publications['bib'] .apply(ast.literal_eval)

info_df = pd.json_normalize(publications['bib'])

publication_details = pd.concat([publications.drop(columns='bib'),info_df], axis=1)

# Create the following tables with the following columns:

- Authors  
*id*  
*name*  
*author_title*  
*instituition*

- Journal  
*journal_name*  
*country*  
*year*  
*impactor_factor*

- Publication  
*publication_title*  
*date*  
*citations*


In [9]:
## extract unique authors from the investigator column
author_list = publication_details['investigator'].unique().tolist()
print(author_list)

['paola alberti', 'kishal lukhna']


In [10]:
## Create authors data frame
Authors = pd.DataFrame({
    'name':author_list,
    'author_title': "nil",
    'institution': "nil"
})

In [31]:
# create journals data frame
## extract journal 
Journals = publication_details.loc[:,['investigator','citation']]
Journals = Journals.assign(country = None, year = None, impact_factor = None)

# rename journal names
Journals.rename(columns={
    'investigator':'author',
    'citations':'journal_name'}, 
    inplace= True)

In [33]:
# remove everything remain with letters. (will have to review this bit though)
Journals['citation'] = Journals['citation'].str.replace(r'[0-9]', '', regex=True)
Journals['citation'] = Journals['citation'].str.replace(r'[.,]', '', regex=True)
Journals['citation'] = Journals['citation'].str.replace(r'[-]', '', regex=True)
Journals['citation'] = Journals['citation'].str.replace(r'[:,\(\)]', '', regex=True)
Journals['citation'].replace(r'^\s*$', np.nan, regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Journals['citation'].replace(r'^\s*$', np.nan, regex=True, inplace=True)


In [35]:
# Fill missing values in the Journals table
Journals.fillna({'citation':'Data_not_available'}, inplace=True)
Journals.fillna({'impact_factor':0}, inplace=True)
Journals.fillna({'year':0}, inplace=True)

In [22]:
Journals['impact_factor'] = Journals['impact_factor'].astype('int64')
Journals['year'] = Journals['year'].astype('int64')


In [19]:
# create publication data frame
Publications = publication_details.loc[:,['investigator','title','pub_year','num_citations']]

# rename Publication names
Publications.rename(columns={
    'investigator':'author',
    'title' : 'publication_title',
    'pub_year':'year',
    'num_citations':'citations'}, 
    inplace= True)

In [20]:
# remove periods
Publications['publication_title'] = Publications['publication_title'].str.replace(r'[0-9]', '', regex=True)
Publications['publication_title'] = Publications['publication_title'].str.replace(r'[.,]', '', regex=True)
Publications['publication_title'] = Publications['publication_title'].str.replace(r'[-]', '', regex=True)


In [39]:
# def truncate_to_five_words(text):
#     words = text.split()
#     return ' '.join(words[:5])

# # Apply this function to the desired column
# # Replace 'your_column' with the name of the column you want to process
# Publications['publication_title'] = Publications['publication_title'].apply(truncate_to_five_words)

In [21]:
Publications.fillna({'year': 0 }, inplace=True)
Publications['year'] = Publications['year'].astype('int64')

Publications['year'] = Publications['year'].astype('int64')
Publications.dtypes

author               object
publication_title    object
year                  int64
citations             int64
dtype: object

In [36]:
## Save to Processed folder
save_processed_data(Authors, 'model_data/authors')
save_processed_data(Publications, 'model_data/publications')
save_processed_data(Journals, 'model_data/journals')

Saved to ✅: c:\Users\Hp\OneDrive\OLDFILES\Documents\PROJECTS\trials_dashboard\data\processed\model_data\authors.csv
Saved to ✅: c:\Users\Hp\OneDrive\OLDFILES\Documents\PROJECTS\trials_dashboard\data\processed\model_data\publications.csv
Saved to ✅: c:\Users\Hp\OneDrive\OLDFILES\Documents\PROJECTS\trials_dashboard\data\processed\model_data\journals.csv


VISUALIZE TABLES

Journals