In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [51]:
dataset1 = pd.read_csv("Datasets/pdb_data_seq.csv")
dataset2 = pd.read_csv("Datasets/pdb_data_no_dups.csv")

In [52]:
dataset1.head()

Unnamed: 0,structureId,chainId,sequence,residueCount,macromoleculeType
0,100D,A,CCGGCGCCGG,20,DNA/RNA Hybrid
1,100D,B,CCGGCGCCGG,20,DNA/RNA Hybrid
2,101D,A,CGCGAATTCGCG,24,DNA
3,101D,B,CGCGAATTCGCG,24,DNA
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154,Protein


In [53]:
dataset2.head()

Unnamed: 0,structureId,classification,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear
0,100D,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
1,101D,DNA,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0
2,101M,OXYGEN TRANSPORT,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0
3,102D,DNA,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0
4,102L,HYDROLASE(O-GLYCOSYL),X-RAY DIFFRACTION,Protein,165,1.74,18926.61,,,2.75,55.28,,,1993.0


In [54]:
dataset1.shape

(467304, 5)

In [55]:
dataset2.shape

(141401, 14)

In [56]:
len(dataset2["classification"].unique())

5051

In [57]:
dataset2["classification"].unique()

array(['DNA-RNA HYBRID', 'DNA', 'OXYGEN TRANSPORT', ...,
       'OXIDOREDUCTASE(CHNH(D)-NAD+ OR NADP+(A))',
       'ELECTRON TRANSPORT(IRON)', 'antimicrobial'], dtype=object)

In [70]:
df = dataset1.merge(dataset2, on="structureId")
df.drop(columns=["residueCount_x", "macromoleculeType_x"], inplace=True)
df.rename(columns = {'residueCount_y':'residueCount'}, inplace = True)
df.rename(columns = {'macromoleculeType_y':'macromoleculeType'}, inplace = True)

In [72]:
df.head()

Unnamed: 0,structureId,chainId,sequence,classification,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear
0,100D,A,CCGGCGCCGG,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
1,100D,B,CCGGCGCCGG,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
2,101D,A,CGCGAATTCGCG,DNA,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0
3,101D,B,CGCGAATTCGCG,DNA,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,OXYGEN TRANSPORT,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0


In [73]:
df.columns

Index(['structureId', 'chainId', 'sequence', 'classification',
       'experimentalTechnique', 'macromoleculeType', 'residueCount',
       'resolution', 'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear'],
      dtype='object')

In [74]:
df = df.iloc[:,[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 3]]

In [75]:
df.head()

Unnamed: 0,structureId,chainId,sequence,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear,classification
0,100D,A,CCGGCGCCGG,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0,DNA-RNA HYBRID
1,100D,B,CCGGCGCCGG,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0,DNA-RNA HYBRID
2,101D,A,CGCGAATTCGCG,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0,DNA
3,101D,B,CGCGAATTCGCG,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0,DNA
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0,OXYGEN TRANSPORT


In [76]:
df.isna().sum()

structureId                      0
chainId                         10
sequence                        28
experimentalTechnique            0
macromoleculeType            35770
residueCount                     0
resolution                   21663
structureMolecularWeight         0
crystallizationMethod       156395
crystallizationTempK        153452
densityMatthews              81504
densityPercentSol            81381
pdbxDetails                  85681
phValue                     130455
publicationYear              57636
classification                   4
dtype: int64

In [77]:
df.shape

(471149, 16)

In [78]:
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(columns=["index"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["index"], inplace=True)


In [79]:
df.head()

Unnamed: 0,structureId,chainId,sequence,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear,classification
0,102D,A,CGCAAATTTGCG,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0,DNA
1,102D,B,CGCAAATTTGCG,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0,DNA
2,110D,A,CGGCCG,X-RAY DIFFRACTION,DNA,6,1.9,2337.73,"VAPOR DIFFUSION, SITTING DROP",277.0,2.9,57.63,"pH 6.60, VAPOR DIFFUSION, SITTING DROP, temper...",6.6,1993.0,DNA
3,111D,A,CGCAAATTGGCG,X-RAY DIFFRACTION,DNA,24,2.25,7374.83,"VAPOR DIFFUSION, SITTING DROP",277.0,2.29,46.25,"pH 6.60, VAPOR DIFFUSION, SITTING DROP, temper...",6.6,1989.0,DNA
4,111D,B,CGCAAATTGGCG,X-RAY DIFFRACTION,DNA,24,2.25,7374.83,"VAPOR DIFFUSION, SITTING DROP",277.0,2.29,46.25,"pH 6.60, VAPOR DIFFUSION, SITTING DROP, temper...",6.6,1989.0,DNA


In [80]:
df.shape

(215142, 16)

In [81]:
df.to_csv("Datasets/proteinDataset.csv")