In [68]:
import pandas as pd
import numpy as np
import sqlite3
import torch

In [3]:
# Load dataset
# Read every row and column of the `paper_data` table from the SQLite file
# into the `papers` DataFrame.
con = sqlite3.connect('../paper_data.sqlite3')
try:
    papers = pd.read_sql_query("SELECT * FROM paper_data", con)
finally:
    # Always release the database handle, even when the query raises;
    # using the connection as a context manager would only manage the
    # transaction and would not close it.
    con.close()

In [4]:
# Disabled step: dropping the file-name column is currently commented out.
# NOTE(review): the commented statement refers to `paper` and 'file_name',
# whereas the loaded frame is named `papers` and the displayed column is
# `filename` — both names need fixing before this line is re-enabled.
# paper.drop(columns=['file_name'], inplace=True)
# Display the loaded frame for a visual sanity check.
papers

Unnamed: 0,title,description,year,coverdate,publication_name,citation_count,subject_areas,author_names,affiliations,countries,filename
0,Association between leukocyte telomere length ...,Aim: The aims of this study were to compare le...,2018,2018-01-01,International Journal of Rheumatic Diseases,21,Rheumatology,"Poonpet T.,Saetan N.,Tanavalee A.,Wilairatana ...",Chulalongkorn University,Thailand,201802787.json
1,Molecular insights into inclusion complexes of...,The structural dynamics and stability of inclu...,2018,2018-01-01,Journal of Molecular Graphics and Modelling,18,"Spectroscopy,Physical and Theoretical Chemistr...","Mahalapbutr P.,Nutho B.,Wolschann P.,Chavasiri...","Chulalongkorn University,Universität Wien,Chia...","Austria,Thailand",201802729.json
2,Characterization and enumeration of complement...,Abelian codes and complementary dual codes for...,2018,2018-10-01,Journal of Applied Mathematics and Computing,9,"Computational Mathematics,Applied Mathematics","Boripan A.,Jitman S.,Udomkavanich P.","Chulalongkorn University,Silpakorn University",Thailand,201800578.json
3,An environmental-semantic computing system for...,Environmental-semantic space integration is a ...,2018,2018-01-01,Frontiers in Artificial Intelligence and Appli...,3,Artificial Intelligence,"Kiyoki Y.,Chen X.,Veesommai C.,Sasaki S.,Uraki...","Chulalongkorn University,Keio University,Kanag...","Japan,Thailand",201802213.json
4,"Crystallisation of CH3NH3PbX3 (X = I, Br, and ...","In this work, the effects of halides on the pr...",2018,2018-04-01,Micro and Nano Letters,5,"Bioengineering,Biomedical Engineering,Material...","Loryuenyong V.,Khiaokaeo N.,Koomsin W.,Thongch...","Chulalongkorn University,Silpakorn University",Thailand,201801616.json
...,...,...,...,...,...,...,...,...,...,...,...
20211,Titania nanotube architectures synthesized on ...,The aim of this study is to synthesize Titania...,2021,2021-11-01,Materials,5,"Materials Science (all),Condensed Matter Physics","Chunate H.-T.,Khamwannah J.,Aliyu A.A.A.,Tanta...","Chulalongkorn University,Meticuly Co. Ltd.",Thailand,202100632.json
20212,Effect of molecular weight on mechanical prope...,Molecular weight (MW) is an important factor t...,2021,2021-01-01,Materials Today: Proceedings,9,Materials Science (all),"Promnil S.,Numpaisal P.-O.,Ruksakulpiwat Y.","Suranaree University of Technology,Center of E...",Thailand,202103246.json
20213,Development and validation of HERWIG 7 tunes f...,This paper presents new sets of parameters (“t...,2021,2021-04-01,European Physical Journal C,10,"Engineering (miscellaneous),Physics and Astron...","Sirunyan A.M.,Tumasyan A.,Adam W.,Ambrogi F.,B...",Rheinisch-Westfälische Technische Hochschule A...,"Uzbekistan,Turkey,Taiwan,Poland,Estonia,Belaru...",202102288.json
20214,Peak-dose ballism associated with declining im...,,2021,2021-01-01,Journal of Movement Disorders,0,"Neurology (clinical),Neurology","Jen-Rei D.C.,Thien L.T.,Keong L.H.,Leng H.W.,P...","Hospital Pulau Pinang,Penang Hospital,King Chu...","Thailand,Malaysia",202103378.json


In [10]:
# Treat blank text as missing data: in every object-dtype (string) column,
# swap the empty string '' for NaN so that isna()/dropna() can see it.
str_cols = papers.select_dtypes(include=['object']).columns

# One multi-column assignment instead of a per-column loop.
papers[str_cols] = papers[str_cols].replace('', np.nan)

# Per-column count of missing values after the replacement.
papers.isna().sum()

title                 1
description         665
year                  0
coverdate             0
publication_name      0
citation_count        0
subject_areas         0
author_names          0
affiliations          0
countries             0
filename              0
dtype: int64

In [17]:
# Discard papers that lack a title or a description. The frame is modified
# in place, so re-running this cell is a no-op once the rows are gone.
required_text_cols = ['title', 'description']
papers.dropna(subset=required_text_cols, inplace=True)

In [None]:
# Convert citation_count to a numeric column: the literal text 'None' is
# blanked out to NaN first, then anything else that cannot be parsed as a
# number is coerced to NaN as well.
papers['citation_count'] = pd.to_numeric(
    papers['citation_count'].mask(papers['citation_count'] == 'None'),
    errors='coerce',
)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing citation counts with the median of the observed values.
# The imputer works on 2-D input, hence the single-column DataFrame.
imputer = SimpleImputer(strategy='median')
citation_frame = papers[['citation_count']]
papers['citation_count'] = imputer.fit_transform(citation_frame)

# Parse cover dates into datetimes; values that cannot be parsed become NaT.
papers['coverdate'] = pd.to_datetime(
    papers['coverdate'],
    errors='coerce',
)


array(['Rheumatology',
       'Spectroscopy,Physical and Theoretical Chemistry,Computer Graphics and Computer-Aided Design,Materials Chemistry',
       'Computational Mathematics,Applied Mathematics', ...,
       'Immunology and Allergy,Immunology,Toxicology,Pharmacology',
       'Computer Networks and Communications,Instrumentation',
       'Dermatology,Obstetrics and Gynecology,Infectious Diseases'],
      shape=(3063,), dtype=object)

In [None]:
# Distinct subject areas across all papers: split each comma-separated
# entry, flatten to one area per row, trim whitespace, de-duplicate.
subject_areas = (
    papers['subject_areas']
    .str.split(',')
    .explode()
    .str.strip()
    .unique()
    .tolist()
)


def _n_items(parts):
    """Return the number of entries in a split list, or 0 for a non-list (e.g. NaN)."""
    return len(parts) if isinstance(parts, list) else 0


# Number of comma-separated entries per paper for authors, affiliations and
# countries. A name that itself contains a comma is counted as two entries.
# NOTE(review): the recorded output also shows an `author_counts` column that
# no visible cell creates — possibly stale kernel state; confirm with a
# Restart & Run All.
for count_col, source_col in [
    ('author_count', 'author_names'),
    ('affil_count', 'affiliations'),
    ('country_count', 'countries'),
]:
    papers[count_col] = papers[source_col].str.split(',').apply(_n_items)



Unnamed: 0,title,description,year,coverdate,publication_name,citation_count,subject_areas,author_names,affiliations,countries,filename,author_counts,author_count,affil_count,country_count
0,Association between leukocyte telomere length ...,Aim: The aims of this study were to compare le...,2018,2018-01-01,International Journal of Rheumatic Diseases,21.0,Rheumatology,"Poonpet T.,Saetan N.,Tanavalee A.,Wilairatana ...",Chulalongkorn University,Thailand,201802787.json,6,6,1,1
1,Molecular insights into inclusion complexes of...,The structural dynamics and stability of inclu...,2018,2018-01-01,Journal of Molecular Graphics and Modelling,18.0,"Spectroscopy,Physical and Theoretical Chemistr...","Mahalapbutr P.,Nutho B.,Wolschann P.,Chavasiri...","Chulalongkorn University,Universität Wien,Chia...","Austria,Thailand",201802729.json,6,6,3,2
2,Characterization and enumeration of complement...,Abelian codes and complementary dual codes for...,2018,2018-10-01,Journal of Applied Mathematics and Computing,9.0,"Computational Mathematics,Applied Mathematics","Boripan A.,Jitman S.,Udomkavanich P.","Chulalongkorn University,Silpakorn University",Thailand,201800578.json,3,3,2,1
3,An environmental-semantic computing system for...,Environmental-semantic space integration is a ...,2018,2018-01-01,Frontiers in Artificial Intelligence and Appli...,3.0,Artificial Intelligence,"Kiyoki Y.,Chen X.,Veesommai C.,Sasaki S.,Uraki...","Chulalongkorn University,Keio University,Kanag...","Japan,Thailand",201802213.json,8,8,3,2
4,"Crystallisation of CH3NH3PbX3 (X = I, Br, and ...","In this work, the effects of halides on the pr...",2018,2018-04-01,Micro and Nano Letters,5.0,"Bioengineering,Biomedical Engineering,Material...","Loryuenyong V.,Khiaokaeo N.,Koomsin W.,Thongch...","Chulalongkorn University,Silpakorn University",Thailand,201801616.json,5,5,2,1
