# Get Data
Retrieve data from the Materials Project through its API.

In [1]:
import sys
import os
# Working directory of the kernel; later cells build sys.path and the output
# file path from it. The bare expression displays it for verification.
current_dir = os.getcwd()
current_dir

'/workspaces/Bg_pre2'

In [3]:
# Make custom modules located in the project directory importable.
# The guard keeps this cell idempotent: re-running it does not stack
# duplicate entries at the front of sys.path.
if sys.path[:1] != [current_dir]:
    sys.path.insert(0, current_dir)

In [5]:
from mp_api.client import MPRester
import pandas as pd

# from ydata_profiling import ProfileReport
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Set the API key for Materials Project
# The key is read from the MP_API_KEY environment variable so it is never
# hard-coded in the notebook. os.getenv returns None when the variable is unset.
# NOTE(review): how MPRester behaves when given None is not visible here — confirm.
API_KEY = os.getenv("MP_API_KEY")

In [7]:
# Check the available attributes.
# The client is used as a context manager, so its lifetime is limited to this block.
with MPRester(API_KEY) as mpr:
    # Get the list of available attributes
    # (names that can be passed as `fields` when searching summary documents)
    available_fields = mpr.materials.summary.available_fields

In [8]:
# Display the field names retrieved in the previous cell.
available_fields

['builder_meta',
 'nsites',
 'elements',
 'nelements',
 'composition',
 'composition_reduced',
 'formula_pretty',
 'formula_anonymous',
 'chemsys',
 'volume',
 'density',
 'density_atomic',
 'symmetry',
 'property_name',
 'material_id',
 'deprecated',
 'deprecation_reasons',
 'last_updated',
 'origins',
 'structure',
 'task_ids',
 'uncorrected_energy_per_atom',
 'energy_per_atom',
 'formation_energy_per_atom',
 'energy_above_hull',
 'is_stable',
 'equilibrium_reaction_energy_per_atom',
 'decomposes_to',
 'xas',
 'grain_boundaries',
 'band_gap',
 'cbm',
 'vbm',
 'efermi',
 'is_gap_direct',
 'is_metal',
 'es_source_calc_id',
 'bandstructure',
 'dos',
 'dos_energy_up',
 'dos_energy_down',
 'is_magnetic',
 'ordering',
 'total_magnetization',
 'total_magnetization_normalized_vol',
 'total_magnetization_normalized_formula_units',
 'num_magnetic_sites',
 'num_unique_magnetic_sites',
 'types_of_magnetic_species',
 'bulk_modulus',
 'shear_modulus',
 'universal_anisotropy',
 'homogeneous_poisson

In [9]:
# Summary fields requested for every material in the search below.
need_fields = [
    # identification
    'material_id', 'formula_pretty', 'composition',
    # electronic properties
    'band_gap', 'is_gap_direct', 'efermi',
    # energies
    'energy_per_atom', 'formation_energy_per_atom', 'energy_above_hull',
    # cell volume and densities
    'volume', 'density', 'density_atomic',
    # symmetry and structure
    'symmetry', 'nsites', 'structure',
]

In [10]:
# Query non-metallic, stable materials from the Materials Project:
#   is_metal=False      -> exclude metals
#   is_stable=True      -> stable entries only (e_above_hull == 0)
#   band_gap=(0, 3)     -> band gap range from 0 to 3 eV
#   num_elements=(2, 4) -> compounds with 2 to 4 elements
# Only the fields listed in need_fields are retrieved.
with MPRester(API_KEY) as mpr:
    docs = mpr.materials.summary.search(
        num_elements=(2, 4),
        is_stable=True,
        is_metal=False,
        band_gap=(0, 3),
        fields=need_fields,
    )

Retrieving SummaryDoc documents: 100%|██████████| 10342/10342 [00:21<00:00, 482.78it/s]


In [11]:
# Inspect the first returned document to see how the requested fields look.
docs[0]

[4m[1mMPDataDoc<SummaryDoc>[0;0m[0;0m(
[1mnsites[0;0m=40,
[1mcomposition[0;0m=Composition('Ac16 S24'),
[1mformula_pretty[0;0m='Ac2S3',
[1mvolume[0;0m=1118.407852007047,
[1mdensity[0;0m=6.535149338291522,
[1mdensity_atomic[0;0m=27.960196300176175,
[1msymmetry[0;0m=SymmetryData(crystal_system=<CrystalSystem.tet: 'Tetragonal'>, symbol='I-42d', number=122, point_group='-42m', symprec=0.1, angle_tolerance=5.0, version='2.5.0'),
[1mmaterial_id[0;0m=MPID(mp-32800),
[1mstructure[0;0m=Structure Summary
Lattice
    abc : 14.997850705538436 14.997850705538436 14.997850705538436
 angles : 144.74213868231922 144.74213868231922 50.71896322272295
 volume : 1118.407852007047
      A : np.float64(-4.54211636) np.float64(4.54211636) np.float64(13.55263383)
      B : np.float64(4.54211636) np.float64(-4.54211636) np.float64(13.55263383)
      C : np.float64(4.54211636) np.float64(4.54211636) np.float64(-13.55263383)
    pbc : True True True
PeriodicSite: Ac (2.183, 1.297, 19.29) [0.

In [12]:
# Convert the data to a pandas DataFrame
doc_dict = {
    'material_id': [doc.material_id for doc in docs],
    'formula_pretty': [doc.formula_pretty for doc in docs],
    'composition': [doc.composition for doc in docs],
    'nsites': [doc.nsites for doc in docs],
    'crystal_system': [doc.symmetry.crystal_system for doc in docs],
    'space_group': [doc.symmetry.number for doc in docs],   # 空间群代表的国际编号
    # 'point_group': [doc.symmetry.point_group for doc in docs],    # not a number
    'volume': [doc.volume for doc in docs],
    'density': [doc.density for doc in docs],
    'density_atomic': [doc.density_atomic for doc in docs],
    'efermi': [doc.efermi for doc in docs],
    'energy_per_atom': [doc.energy_per_atom for doc in docs],
    'formation_energy_per_atom': [doc.formation_energy_per_atom for doc in docs],
    'is_gap_direct': [doc.is_gap_direct for doc in docs],
    'structure': [doc.structure for doc in docs], 
    'band_gap': [doc.band_gap for doc in docs]
}

raw_df = pd.DataFrame(doc_dict)

In [13]:
# Preview the first rows of the assembled table.
raw_df.head()

Unnamed: 0,material_id,formula_pretty,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,is_gap_direct,structure,band_gap
0,mp-32800,Ac2S3,"(Ac, S)",40,Tetragonal,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,False,"[[ 2.1830569 1.2966013 19.29112704] Ac, [0....",2.2962
1,mp-867311,AcAgTe2,"(Ac, Ag, Te)",4,Cubic,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,False,"[[1.9710325 1.9710325 1.9710325] Ac, [5.913097...",0.0794
2,mp-866101,AcCrO3,"(Ac, Cr, O)",5,Cubic,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,False,"[[0. 0. 0.] Ac, [1.97214345 1.97215113 1.97213...",2.0031
3,mp-861502,AcFeO3,"(Ac, Fe, O)",5,Cubic,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,False,"[[0. 0. 0.] Ac, [1.97678086 1.9767782 1.97678...",0.9888
4,mp-1183053,AcGaO3,"(Ac, Ga, O)",5,Cubic,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,False,"[[0. 0. 0.] Ac, [1.97313105 1.97313105 1.97313...",2.8959


In [14]:
# (number of rows, number of columns) of the assembled table.
raw_df.shape

(10342, 15)

In [15]:
# Remove the columns that are not written to the CSV file.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps the cell idempotent: running it a second time
# no longer raises KeyError because the columns are already gone.
raw_df = raw_df.drop(columns=['formula_pretty', 'structure'], errors='ignore')
raw_df.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,is_gap_direct,band_gap
0,mp-32800,"(Ac, S)",40,Tetragonal,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,False,2.2962
1,mp-867311,"(Ac, Ag, Te)",4,Cubic,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,False,0.0794
2,mp-866101,"(Ac, Cr, O)",5,Cubic,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,False,2.0031
3,mp-861502,"(Ac, Fe, O)",5,Cubic,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,False,0.9888
4,mp-1183053,"(Ac, Ga, O)",5,Cubic,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,False,2.8959


In [16]:
# Save the raw data to a csv file
file_path = os.path.join(current_dir, './Data/mp_raw_data.csv')
# Create the target directory if needed; to_csv does not create missing
# parent directories and would fail with OSError on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# index=False: the positional row index is not written to the file.
raw_df.to_csv(file_path, index=False)

- The query returned a total of 10342 entries from the Materials Project (MP).
- Only binary to quaternary materials (2–4 elements) are used.