## Data import and prep

This notebook loads the arXiv metadata snapshot and prepares a DataFrame for analysis in `arxiv-network-analysis.ipynb`.


In [None]:
import pandas as pd 
import numpy as np 
from datetime import datetime

import json
import dask.bag as db

from itertools import combinations

In [None]:
# arXiv archives (or full category names) to keep; 'cond-mat' is condensed matter physics.
category_list = ['cond-mat']


def _category_matches(category, wanted):
    """Return True if a single category token equals `wanted` or lies in the `wanted` archive.

    'cond-mat' matches both 'cond-mat' and 'cond-mat.str-el'. A plain substring
    test is deliberately avoided: it would let e.g. 'stat' match
    'cond-mat.stat-mech' or 'math' match 'math-ph'.
    """
    return category == wanted or category.startswith(wanted + '.')


def _in_wanted_categories(record):
    """Return True if any category listed on the record matches an entry of `category_list`.

    `record['categories']` is a whitespace-separated string of category names
    (e.g. 'cond-mat.str-el cond-mat.stat-mech'), so it is split into tokens
    before matching.
    """
    return any(_category_matches(category, wanted)
               for category in record['categories'].split()
               for wanted in category_list)


def get_metadata(x):
    """Reduce one raw metadata record (a dict) to the fields used downstream.

    'version' holds the 'created' timestamp of the LAST entry in the record's
    'versions' list (presumably the most recent revision -- confirm against the
    dataset description), so later date filtering acts on that timestamp rather
    than on the first submission.
    """
    return {'id': x['id'],
            'title': x['title'],
            'category': x['categories'],
            'abstract': x['abstract'],
            'version': x['versions'][-1]['created'],
            'doi': x["doi"],
            'authors_parsed': x['authors_parsed']}


# The snapshot is read as one JSON document per line and parsed lazily by dask.
records = db.read_text("data/arxiv-metadata-oai-snapshot.json").map(json.loads)
docs = records.filter(_in_wanted_categories)

# Materialise the filtered records as an in-memory pandas DataFrame.
data = docs.map(get_metadata).to_dataframe().compute()

In [None]:
def _author_name(author):
    """Build one display name from a single `authors_parsed` entry.

    Only the first three fields are used and empty ones are skipped. In the
    sample rows these are last name, first name(s) and an (often empty) suffix;
    any further fields look like affiliation markers (e.g. '1 and 2') and would
    otherwise give the same person different names on different papers.
    NOTE(review): field layout inferred from the sample output -- confirm
    against the dataset documentation.
    """
    name_fields = (field.strip() for field in author[:3])
    return " ".join(field for field in name_fields if field)


# Number of authors listed on each paper.
data['num_authors'] = data['authors_parsed'].apply(len)

# One "Last First [Suffix]" string per author, in the order given on the paper.
data['authors'] = data['authors_parsed'].apply(
    lambda authors: [_author_name(author) for author in authors]
)
data.head()

Unnamed: 0,id,title,category,abstract,version,doi,authors_parsed,num_authors,authors
0,704.0006,Bosonic characters of atomic Cooper pairs acro...,cond-mat.mes-hall,We study the two-particle wave function of p...,"Sat, 31 Mar 2007 04:24:59 GMT",10.1103/PhysRevA.75.043613,"[[Pong, Y. H., ], [Law, C. K., ]]",2,"[Pong Y. H., Law C. K.]"
1,704.0008,Numerical solution of shock and ramp compressi...,cond-mat.mtrl-sci,A general formulation was developed to repre...,"Tue, 1 Jul 2008 18:54:28 GMT",10.1063/1.2975338,"[[Swift, Damian C., ]]",1,[Swift Damian C.]
2,704.0025,Spectroscopic Properties of Polarons in Strong...,cond-mat.str-el cond-mat.stat-mech,We present recent advances in understanding ...,"Mon, 2 Apr 2007 12:02:36 GMT",10.1007/978-1-4020-6348-0_12,"[[Mishchenko, A. S., , 1 and 2], [Nagaosa, N.,...",2,"[Mishchenko A. S. 1 and 2, Nagaosa N. 1 and 3]"
3,704.0027,Filling-Factor-Dependent Magnetophonon Resonan...,cond-mat.mes-hall,We describe a peculiar fine structure acquir...,"Thu, 24 Sep 2009 12:40:18 GMT",10.1103/PhysRevLett.99.087402,"[[Goerbig, M. O., ], [Fuchs, J. -N., ], [Keche...",4,"[Goerbig M. O., Fuchs J. -N., Kechedzhi K., Fa..."
4,704.003,Tuning correlation effects with electron-phono...,cond-mat.str-el,We investigate the effect of tuning the phon...,"Sat, 31 Mar 2007 14:14:18 GMT",10.1007/s10909-005-6013-6,"[[Hague, J. P., ], [d'Ambrumenil, N., ]]",2,"[Hague J. P., d'Ambrumenil N.]"


In [None]:
# Parse the version timestamp strings once, then derive the calendar parts from the parsed values.
version_timestamps = pd.to_datetime(data['version'])
data['DateTime'] = version_timestamps
for column_name, date_part in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    data[column_name] = getattr(version_timestamps.dt, date_part)

In [None]:
# Keep only rows whose 'Year' is 2020 or 2021 (`between` is inclusive on both ends).
# .copy() makes the result an independent DataFrame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.

data = data[data['Year'].between(2020, 2021)].copy()

In [None]:
def _coauthor_pairs(author_names):
    """Return every 2-combination of the given names as a list of tuples (empty for fewer than two names)."""
    return list(combinations(author_names, 2))


# Each pair keeps the order in which the two authors appear on the paper.
data['author_pairs'] = data['authors'].apply(_coauthor_pairs)
data.head()

Unnamed: 0,id,title,category,abstract,version,doi,authors_parsed,num_authors,authors,DateTime,Day,Month,Year,author_pairs
4021,708.2697,Epsilon Expansion for Multicritical Fixed Poin...,hep-th cond-mat.other,The Polchinski version of the exact renormal...,"Fri, 29 May 2020 14:30:46 GMT",10.1016/j.aop.2007.10.005,"[[O'Dwyer, J., ], [Osborn, H., ]]",2,"[O'Dwyer J., Osborn H.]",2020-05-29 14:30:46+00:00,29,5,2020,"[(O'Dwyer J., Osborn H.)]"
5434,710.1214,Shot-noise of quantum chaotic systems in the c...,cond-mat.mes-hall nlin.CD,Semiclassical methods can now explain many m...,"Fri, 18 Dec 2020 13:17:37 GMT",10.1117/12.724670,"[[Whitney, Robert S., ]]",1,[Whitney Robert S.],2020-12-18 13:17:37+00:00,18,12,2020,[]
6316,711.0637,Symmetry breaking and quantum correlations in ...,cond-mat.mes-hall cond-mat.str-el nucl-th phys...,Investigations of emergent symmetry breaking...,"Wed, 22 Dec 2021 21:51:53 GMT",10.1088/0034-4885/70/12/R02,"[[Yannouleas, Constantine, ], [Landman, Uzi, ]]",2,"[Yannouleas Constantine, Landman Uzi]",2021-12-22 21:51:53+00:00,22,12,2021,"[(Yannouleas Constantine, Landman Uzi)]"
7807,712.428,Dynamics of emergent Cooper pairing at finite ...,cond-mat.supr-con cond-mat.mes-hall,We study the time evolution of a system of f...,"Sun, 18 Jul 2021 04:02:26 GMT",10.1103/PhysRevB.79.132504,"[[Yuzbashyan, Emil A., ], [Tsyplyatyev, Oleksa...",2,"[Yuzbashyan Emil A., Tsyplyatyev Oleksandr]",2021-07-18 04:02:26+00:00,18,7,2021,"[(Yuzbashyan Emil A., Tsyplyatyev Oleksandr)]"
13137,806.3953,Short-distance thermal correlations in the XXZ...,cond-mat.str-el cond-mat.stat-mech hep-th,Recent studies have revealed much of the mat...,"Mon, 29 Nov 2021 15:11:59 GMT",10.1088/1742-5468/2008/08/P08010,"[[Boos, Herman E., ], [Damerau, Jens, ], [Göhm...",6,"[Boos Herman E., Damerau Jens, Göhmann Frank, ...",2021-11-29 15:11:59+00:00,29,11,2021,"[(Boos Herman E., Damerau Jens), (Boos Herman ..."


In [None]:
# Persist the prepared frame so the analysis notebook can load it without repeating the steps above.
data.to_pickle(path='data/cond-mat-2020-2021.pkl')