In [182]:
import re

import Bio
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [183]:
# Read every record of the FASTA file into a list so it can be reused below.

records = list(SeqIO.parse("pdm2_neurogenic.fa", "fasta"))
records

[SeqRecord(seq=Seq('AAAAACAAAAAACCCATACAAAAACCCCGAAAAACCCGCGCCAAAATGTAAGAA...ATA', SingleLetterAlphabet()), id='pdm2_neurogenic|MEMB002A|-', name='pdm2_neurogenic|MEMB002A|-', description='pdm2_neurogenic|MEMB002A|-', dbxrefs=[]),
 SeqRecord(seq=Seq('AAAAACAAAAAACCCATACAAAAACCCCGAAAAAACCGCGCCAAAATATAAGAA...ATA', SingleLetterAlphabet()), id='pdm2_neurogenic|MEMB002C|-', name='pdm2_neurogenic|MEMB002C|-', description='pdm2_neurogenic|MEMB002C|-', dbxrefs=[]),
 SeqRecord(seq=Seq('AAGTAAAACCTATACAAAAATCCCGAAAAAATAGCCCGCCAAATTATAAGAAAA...ATA', SingleLetterAlphabet()), id='pdm2_neurogenic|MEMB002F|-', name='pdm2_neurogenic|MEMB002F|-', description='pdm2_neurogenic|MEMB002F|-', dbxrefs=[]),
 SeqRecord(seq=Seq('AAACACAAAAAACCAAATACAAAACCCGAAAAAATAACGCGCCAAAATATAAGA...TAT', SingleLetterAlphabet()), id='pdm2_neurogenic|MEMB003F|-', name='pdm2_neurogenic|MEMB003F|-', description='pdm2_neurogenic|MEMB003F|-', dbxrefs=[]),
 SeqRecord(seq=Seq('AAACATAAAAAAAAAACACATACAAAAACCCCTGAAAAAATAACGCGCCAAAAT..

In [201]:
# Build a DataFrame with one row per record, indexed by the record description.

values = [[rec.seq, rec.description] for rec in records]
sequences = pd.DataFrame(values, columns=["Sequence", "Description"]).set_index("Description")
sequences

Unnamed: 0_level_0,Sequence
Description,Unnamed: 1_level_1
pdm2_neurogenic|MEMB002A|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ..."
pdm2_neurogenic|MEMB002C|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ..."
pdm2_neurogenic|MEMB002F|-,"(A, A, G, T, A, A, A, A, C, C, T, A, T, A, C, ..."
pdm2_neurogenic|MEMB003F|-,"(A, A, A, C, A, C, A, A, A, A, A, A, C, C, A, ..."
pdm2_neurogenic|MEMB002D|+,"(A, A, A, C, A, T, A, A, A, A, A, A, A, A, A, ..."
pdm2_neurogenic|MEMB002E|+,"(A, A, A, C, A, C, A, A, A, A, A, A, A, A, C, ..."
pdm2_neurogenic|MEMB003B|+,"(A, A, A, A, C, A, C, A, A, A, A, A, A, A, A, ..."
pdm2_neurogenic|MEMB003C|+,"(A, A, A, A, A, A, T, A, A, A, A, A, A, A, C, ..."
pdm2_neurogenic|MEMB003D|+,"(A, A, A, C, A, C, A, A, A, G, A, A, A, A, A, ..."


In [202]:
#returns the number of times 'C' occurs in the string.

def C_count(sequence):
    """Count the uppercase 'C' characters in ``str(sequence)``.

    The argument is converted with ``str`` first, so any object with a string
    form is accepted. Matching is case-sensitive: lowercase 'c' is not counted.
    """
    text = str(sequence)
    return text.count("C")

In [203]:
#returns the number of times 'G' occurs in the string.

def G_count(sequence):
    """Count the uppercase 'G' characters in ``str(sequence)``.

    The argument is converted with ``str`` first, so any object with a string
    form is accepted. Matching is case-sensitive: lowercase 'g' is not counted.
    """
    text = str(sequence)
    return text.count("G")

In [204]:
# Add three per-record columns: number of C's, number of G's, and sequence length.

sequence_column = sequences["Sequence"]
sequences["C Count"] = sequence_column.apply(C_count)
sequences["G Count"] = sequence_column.apply(G_count)
sequences["Length"] = sequence_column.apply(len)

In [205]:
# Fraction of each sequence that is C or G: (C Count + G Count) / Length.

cg_total = sequences["C Count"] + sequences["G Count"]
sequences["CG Ratio"] = cg_total / sequences["Length"]
sequences

Unnamed: 0_level_0,Sequence,C Count,G Count,Length,CG Ratio
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pdm2_neurogenic|MEMB002A|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ...",158,145,700,0.432857
pdm2_neurogenic|MEMB002C|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ...",159,140,701,0.426534
pdm2_neurogenic|MEMB002F|-,"(A, A, G, T, A, A, A, A, C, C, T, A, T, A, C, ...",158,147,700,0.435714
pdm2_neurogenic|MEMB003F|-,"(A, A, A, C, A, C, A, A, A, A, A, A, C, C, A, ...",158,147,718,0.424791
pdm2_neurogenic|MEMB002D|+,"(A, A, A, C, A, T, A, A, A, A, A, A, A, A, A, ...",166,142,749,0.411215
pdm2_neurogenic|MEMB002E|+,"(A, A, A, C, A, C, A, A, A, A, A, A, A, A, C, ...",158,135,735,0.398639
pdm2_neurogenic|MEMB003B|+,"(A, A, A, A, C, A, C, A, A, A, A, A, A, A, A, ...",166,146,756,0.412698
pdm2_neurogenic|MEMB003C|+,"(A, A, A, A, A, A, T, A, A, A, A, A, A, A, C, ...",154,142,703,0.421053
pdm2_neurogenic|MEMB003D|+,"(A, A, A, C, A, C, A, A, A, G, A, A, A, A, A, ...",154,142,700,0.422857


In [206]:
#returns the number of matching letters between two sequences.

def matches(sequence1, sequence2):
    """Return the number of matching letters in the best global alignment.

    Both arguments are converted with ``str`` and aligned with
    ``pairwise2.align.globalxx`` (match = 1, mismatch = 0, no gap penalties),
    so the alignment score equals the number of identical aligned positions.

    Parameters
    ----------
    sequence1, sequence2 : str or any object whose ``str()`` is the sequence
        The two sequences to compare.

    Returns
    -------
    int
        The best global alignment score; 0 if either sequence is empty.
    """
    first, second = str(sequence1), str(sequence2)

    # pairwise2 yields no alignment when either sequence is empty; two
    # sequences with nothing to align share zero matching letters.
    if not first or not second:
        return 0

    # score_only=True returns the best score directly. This avoids building
    # every traceback and avoids parsing the score out of formatted text,
    # where "%g" formatting corrupts large scores (e.g. "1e+06" parses as 1).
    score = pairwise2.align.globalxx(first, second, score_only=True)
    return int(score)

In [207]:
#returns a list of lists: for each row, the number of matches between that row's sequence and every row's sequence in the table (including itself).

def pairs(table):
    """Compute the all-against-all match counts for the sequences in ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a "Sequence" column; each value is passed to ``matches``.

    Returns
    -------
    list of list of int
        One inner list per row, in row order. Entry ``[i][j]`` is
        ``matches(sequence_i, sequence_j)``, so each inner list has one entry
        per row (including the comparison of a sequence with itself). An empty
        table yields an empty list.
    """
    # Pull the values out positionally once. Indexing the Series with a bare
    # integer (``table["Sequence"][i]``) is a label lookup on integer indexes
    # and a deprecated positional fallback on other indexes, so it fails or
    # picks the wrong row whenever the index is not 0..n-1.
    all_sequences = table["Sequence"].tolist()
    return [
        [matches(row_seq, other_seq) for other_seq in all_sequences]
        for row_seq in all_sequences
    ]
                    

In [208]:
# Store, for every record, its list of match counts against all records.
pairwise_counts = pairs(sequences)
sequences["Matches"] = pairwise_counts
sequences

Unnamed: 0_level_0,Sequence,C Count,G Count,Length,CG Ratio,Matches
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pdm2_neurogenic|MEMB002A|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ...",158,145,700,0.432857,"[700, 689, 637, 646, 656, 638, 646, 683, 647]"
pdm2_neurogenic|MEMB002C|-,"(A, A, A, A, A, C, A, A, A, A, A, A, C, C, C, ...",159,140,701,0.426534,"[689, 701, 642, 650, 658, 644, 650, 687, 652]"
pdm2_neurogenic|MEMB002F|-,"(A, A, G, T, A, A, A, A, C, C, T, A, T, A, C, ...",158,147,700,0.435714,"[637, 642, 700, 644, 652, 630, 648, 645, 638]"
pdm2_neurogenic|MEMB003F|-,"(A, A, A, C, A, C, A, A, A, A, A, A, C, C, A, ...",158,147,718,0.424791,"[646, 650, 644, 718, 691, 672, 691, 650, 671]"
pdm2_neurogenic|MEMB002D|+,"(A, A, A, C, A, T, A, A, A, A, A, A, A, A, A, ...",166,142,749,0.411215,"[656, 658, 652, 691, 749, 689, 699, 659, 676]"
pdm2_neurogenic|MEMB002E|+,"(A, A, A, C, A, C, A, A, A, A, A, A, A, A, C, ...",158,135,735,0.398639,"[638, 644, 630, 672, 689, 735, 682, 643, 662]"
pdm2_neurogenic|MEMB003B|+,"(A, A, A, A, C, A, C, A, A, A, A, A, A, A, A, ...",166,146,756,0.412698,"[646, 650, 648, 691, 699, 682, 756, 650, 668]"
pdm2_neurogenic|MEMB003C|+,"(A, A, A, A, A, A, T, A, A, A, A, A, A, A, C, ...",154,142,703,0.421053,"[683, 687, 645, 650, 659, 643, 650, 703, 653]"
pdm2_neurogenic|MEMB003D|+,"(A, A, A, C, A, C, A, A, A, G, A, A, A, A, A, ...",154,142,700,0.422857,"[647, 652, 638, 671, 676, 662, 668, 653, 700]"
