# Solution-2
Analyze the unit cell parameters of proteins and protein-protein complexes in the PDB.

In [None]:
from pyspark.sql import Row, SparkSession
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader
import matplotlib.pyplot as plt

#### Configure Spark Session and Spark Context

In [None]:
# Spark session used by every later cell; the application is named after this notebook
spark = (
    SparkSession.builder
    .appName("Solution-2")
    .getOrCreate()
)

#### Read a sample of the PDB

In [None]:
# Location of the sample data set, given relative to this notebook
path = "../resources/mmtf_full_sample"
# Load the structures; later cells index each record as a
# (structure id, structure) pair
pdb = mmtfReader.read_sequence_file(path)

### TODO-1 Restrict the analysis to proteins only

In [None]:
# Keep only structures whose polymer chains are all L-proteins. With the
# filter's default (exclusive=False) a structure passes as soon as it has a
# single protein chain, so protein-nucleic acid complexes would be kept,
# which contradicts the "proteins only" goal of this step.
# NOTE(review): relies on the `exclusive` flag of mmtfPyspark's
# ContainsLProteinChain; confirm against the installed version.
pdb = pdb.filter(ContainsLProteinChain(exclusive=True))

### Remove structures without unit cell data

In [None]:
# Drop structures that carry no unit cell data. `is not None` is an identity
# test, so it cannot be redirected by an overloaded `!=` operator.
pdb = pdb.filter(lambda t: t[1].unit_cell is not None)

### TODO-2 Define method to create a Row with unit cell data

In [None]:
def calcProperties(s):
    """Build a Row holding the space group and unit cell of one structure.

    Args:
        s: An indexable record whose element 0 is the structure id and whose
            element 1 is an object exposing ``space_group`` and ``unit_cell``.

    Returns:
        A ``Row`` with eight positional values: structure id, space group,
        then the six unit cell values in the order a, b, c, alpha, beta,
        gamma.

    Raises:
        ValueError: If ``unit_cell`` does not unpack into exactly six values.
    """
    entry_id, structure = s[0], s[1]
    group = structure.space_group
    # Unpack explicitly so a unit cell of the wrong length fails here
    # instead of silently producing a misaligned row.
    len_a, len_b, len_c, ang_alpha, ang_beta, ang_gamma = structure.unit_cell

    return Row(
        entry_id, group, len_a, len_b, len_c, ang_alpha, ang_beta, ang_gamma
    )

### TODO-3: Map structures to properties

In [None]:
# Turn every structure into a Row of unit cell properties. The function is
# passed directly; wrapping it in a lambda added nothing.
rows = pdb.map(calcProperties)

### TODO-4: Create a dataset from the RDD

In [None]:
# Column labels, listed in the same order as the values of each Row
col_names = [
    "structureId",
    "spaceGroup",
    "a",
    "b",
    "c",
    "alpha",
    "beta",
    "gamma",
]
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# Cache the dataset because several later cells query it
summary = summary.cache()

## Done: Show some details about this dataset

In [None]:
# Display the column names of the dataset
summary.columns

In [None]:
# Print the schema of the dataset
summary.printSchema()

In [None]:
# Preview the first ten rows. Limiting on the Spark side avoids collecting
# the entire dataset to the driver just to display a handful of rows.
summary.limit(10).toPandas()

## Group data by space group and count occurrences

In [None]:
# Count the structures per space group, most frequent first, and bring the
# result into pandas
df = (
    summary.groupBy("spaceGroup")
    .count()
    .sort("count", ascending=False)
    .toPandas()
)

# Display the ten most common space groups
df.head(10)

## Plot a bar chart of the top 10 space groups

In [None]:
# Bar chart of the ten most common space groups; the trailing semicolon
# suppresses the textual return value in the notebook output
df.head(10).plot.bar(x='spaceGroup', y='count');

In [None]:
# Stop the Spark session now that the analysis is finished
spark.stop()