## Combine, filter and save XRD data

In [11]:
#loads and combines all .csv files for XRD data into one file

import pandas as pd
import glob
import os

files = glob.glob('raw_data/rcsb_pdb_custom_report_*.csv')

if(os.path.exists('raw_data/XRD_combined.csv') and os.path.isfile('raw_data/XRD_combined.csv')):
    os.remove('raw_data/XRD_combined.csv')
else:
    pass

for i in files:
    df = pd.read_csv(i, header=1)
    df = df.fillna(method='ffill')
    df = df[df['Experimental Method'].isin(['X-RAY DIFFRACTION'])]
    df = df.drop(['EM Resolution (Å)', 'Unnamed: 22', 'Ligand','Entry Id (Polymer Entity Identifiers)'], axis=1)
    df.to_csv('raw_data/XRD_combined.csv', mode='a')

In [14]:
#filters and cleans dataframe for further use
#second dataframe is then created and saved which does not contain any duplicates based on 'Entry ID'

data = pd.read_csv('raw_data/XRD_combined.csv', encoding = 'utf8')
data = data.drop(['Unnamed: 0'], axis=1)

data_1 = data.drop_duplicates(subset=['Entry ID'])

if(os.path.exists('raw_data/XRD_combined_noduplicates.csv') and os.path.isfile('raw_data/XRD_combined_noduplicates.csv')):
    os.remove('raw_data/XRD_combined_noduplicates.csv')
else:
    data_1.to_csv('raw_data/XRD_combined_noduplicates.csv', mode='a')

### Initial data processing for analysis

In [15]:
#basic column type definitions and changes to numeric dtype

float_columns = ['Matthews Coefficient','Percent Solvent Content','pH','Temp (K)','Molecular Weight per Deposited Model',
                'Resolution (Å)','High Resolution Limit','Entity ID'] 
str_columns = ['Entry ID','Experimental Method','Crystallization Method','Crystal Growth Procedure','DOI','PDB ID',
              'Structure Title','Sequence','Source Organism','Gene Name']

for i in float_columns:
    data[i] = pd.to_numeric(data[i], errors='coerce')

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83968 entries, 0 to 83967
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Entry ID                              83968 non-null  object 
 1   Experimental Method                   83968 non-null  object 
 2   Matthews Coefficient                  83944 non-null  float64
 3   Percent Solvent Content               83944 non-null  float64
 4   Crystallization Method                83963 non-null  object 
 5   pH                                    83867 non-null  float64
 6   Crystal Growth Procedure              83965 non-null  object 
 7   Temp (K)                              83939 non-null  float64
 8   DOI                                   83659 non-null  object 
 9   PDB ID                                83968 non-null  object 
 10  Molecular Weight per Deposited Model  83944 non-null  float64
 11  Resolution (Å) 