In [7]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Read the tab-separated CATH superfamily list (first column becomes the index),
# discard every row with a missing field, and attach an all-NaN COMMENT column
# as the placeholder that implement_replacements() selects alongside NAME.
df = (
    pd.read_csv("./cath-superfamily-list.txt", sep='\t', index_col=0)
    .dropna()
    .assign(COMMENT=np.nan)
)

In [9]:

### AUTOMATED REPLACE

def semicolon(df):
    """Swap every semicolon in a name for a comma.

    Only rows whose ``NAME`` contains at least one ";" are returned.

    Returns
    -------
    (ret, comment)
        ``ret`` holds the rewritten names, indexed like the affected rows;
        ``comment`` is a Series named ``COMMENT`` with the code "S" for each
        of those rows.
    """
    names = df['NAME']
    has_semicolon = names.str.contains(";")
    ret = names[has_semicolon].str.replace(";", ',')
    comment = pd.Series("S", index=ret.index, name='COMMENT')
    return ret, comment

def lowercase(df):
    """Select rows by a 'Protein' name pattern; the result is discarded.

    NOTE(review): this looks like an unfinished stub. The filtered frame is
    bound to a local and never returned, so the function always returns None,
    and nothing visible in this notebook calls it. Confirm whether it should
    be completed or deleted.
    """
    # The pattern r'Protein|' ends in an empty alternative, and an empty
    # alternative matches any string, so this mask is True for every non-null
    # name instead of only the names containing "Protein".
    l = df[df['NAME'].str.contains(r'Protein|')]
def lowercase_start(df): #replace lowercase start with capital
    """Capitalise the first character of names that start in lowercase.

    Names that begin with a nucleic-acid style prefix (``m``, ``t``, ``r``,
    ``ss`` or ``ds`` followed by ``RNA``/``DNA``) or with ``cAMP`` are left
    untouched, because their leading lowercase letter is deliberate.

    Parameters
    ----------
    df : DataFrame
        Must provide a string ``NAME`` column.

    Returns
    -------
    (ret, comment)
        ``ret`` holds the capitalised names, indexed like the affected rows;
        ``comment`` is a Series named ``COMMENT`` with the code "L" for each
        of those rows.
    """
    # The prefixes have to be a group. The previous character class
    # ``[m|t|r|ss|ds]`` matched exactly ONE character, so "ssDNA..." and
    # "dsRNA..." names were not recognised and were turned into "SsDNA...".
    # A non-capturing group also avoids pandas' "match groups" warning.
    protected = r'^(?:m|t|r|ss|ds)[RD]NA|^cAMP'

    names = df['NAME']
    # ``[:1]`` (rather than ``[0]``) yields '' for an empty name instead of
    # NaN, and ``eq(True)`` maps any remaining missing value to False, so the
    # mask is strictly boolean and safe to index with.
    starts_lower = names.str[:1].str.islower().eq(True)
    st_lower = names[starts_lower]
    st_lower = st_lower[~st_lower.str.contains(protected, regex=True)]

    ret = st_lower.str[:1].str.upper() + st_lower.str[1:]
    comment = pd.Series(index=ret.index, name='COMMENT', data="L")
    return ret, comment

def trailing_stop(df): #remove trailing dots
    """Drop a single trailing ".", "," or ";" from names that end with one.

    Parameters
    ----------
    df : DataFrame
        Must provide a string ``NAME`` column.

    Returns
    -------
    (ret, comment)
        ``ret`` holds the shortened names, indexed like the affected rows;
        ``comment`` is a Series named ``COMMENT`` with the code "T" for each
        of those rows.
    """
    names = df["NAME"]
    # Raw string + character class: the former '\.$|,$|;$' literal relied on
    # the invalid escape sequence "\." in a normal string, which newer Python
    # versions warn about and will eventually reject.
    ends_with_punct = names.str.contains(r'[.,;]$')
    # Only the last character is removed, matching one punctuation mark.
    ret = names[ends_with_punct].str[:-1]
    comment = pd.Series(index=ret.index, name='COMMENT', data="T")
    return ret, comment

def other_stop(df): #replace other dots with commas
    """Turn sentence-style dots inside a name into commas.

    A name is skipped entirely when it contains a dot between two digits
    (e.g. a version or decimal number) or ends with a dot; every dot in the
    remaining names is replaced.

    Parameters
    ----------
    df : DataFrame
        Must provide a string ``NAME`` column.

    Returns
    -------
    (ret, comment)
        ``ret`` holds the rewritten names, indexed like the affected rows;
        ``comment`` is a Series named ``COMMENT`` with the code "C" for each
        of those rows.
    """
    names = df['NAME']
    # Literal search: avoids the invalid "\." escape of the old non-raw string.
    s = names[names.str.contains(".", regex=False)]
    s = s[~s.str.contains(r'\d\.\d|\.$')]
    # regex=False is spelled out because the default of ``str.replace`` differs
    # between pandas versions, and "." as a regex would match every character.
    ret = s.str.replace(".", ',', regex=False)
    comment = pd.Series(index=ret.index, name='COMMENT', data="C")
    return ret, comment



def run_rename(df):
    """Lowercase every word after the first, except acronym-like words.

    Each name is split on whitespace and re-joined with single spaces. The
    first word is kept verbatim; a later word keeps its case only when it
    matches the exception pattern (two or more capitals, C-/N-terminal,
    single capital letters, a few hard-coded proper nouns), otherwise it is
    lowercased.

    Parameters
    ----------
    df : DataFrame
        Must provide a string ``NAME`` column.

    Returns
    -------
    (ret, comment)
        ``ret`` holds only the names that actually changed, indexed like the
        affected rows; ``comment`` is a Series named ``COMMENT`` with the
        code "R" for each of those rows.
    """
    # ``[Tt]`` instead of ``[T|t]``: inside a character class "|" is a literal
    # pipe, not an alternation.
    keep_case = re.compile(
        r'\w*[A-Z]\w*[A-Z]\w*|C-[Tt]erminal|N-[Tt]erminal|^[A-Z]-\w+'
        r'|Hippel-Lindau|Willebrand|Kunitz|Enterococc|^[A-Z]\W?$|^\d*[A-Z]\d*\W?$'
    )

    def _normalise(name):
        """Return ``name`` with non-acronym words after the first lowercased."""
        words = name.split()
        if not words:
            # Empty / whitespace-only name: nothing to rename (used to raise
            # IndexError on ``l[0]``).
            return name
        tail = [w if keep_case.search(w) else w.lower() for w in words[1:]]
        return " ".join(words[:1] + tail)

    names = df['NAME']
    # Build the result in one pass; growing an empty Series label by label
    # copies the data on every insert and warns about the default dtype on
    # older pandas.
    renamed = pd.Series([_normalise(n) for n in names],
                        index=names.index, dtype=object)
    ret = renamed[renamed != names]
    comment = pd.Series(index=ret.index, name='COMMENT', data="R")
    return ret, comment

    

In [10]:
def implement_replacements(df, steps=None):
    """Run the name-cleaning rules in sequence and record which ones fired.

    Parameters
    ----------
    df : DataFrame
        Must provide ``NAME`` and ``COMMENT`` columns. It is not modified.
    steps : list of callables, optional
        Each callable takes the working frame and returns ``(names, codes)``:
        two Series indexed by the rows it changed. Defaults to
        ``[run_rename, semicolon, lowercase_start, other_stop, trailing_stop]``,
        applied in that order.

    Returns
    -------
    DataFrame
        Columns ``NAME`` (cleaned), ``COMMENT`` (codes of the rules that
        changed the row, most recently applied rule first; NaN when no rule
        fired) and ``OLD_NAME`` (the name before any rule ran). Empty strings
        anywhere in the frame are returned as NaN.
    """
    if steps is None:
        steps = [run_rename, semicolon, lowercase_start, other_stop, trailing_stop]

    def _prepend(new_code, old_codes):
        """Concatenate a new code in front of the existing ones, NaN as ''."""
        # Treat missing values as empty strings up front. Stringifying NaN and
        # deleting every "nan" afterwards would also mangle a genuine comment
        # that happens to contain those letters.
        new_part = '' if pd.isna(new_code) else str(new_code)
        old_part = '' if pd.isna(old_codes) else str(old_codes)
        return new_part + old_part

    # Explicit copy: columns are assigned below, and assigning onto a frame
    # selected from ``df`` triggers SettingWithCopyWarning.
    ret_df = df[['NAME', 'COMMENT']].copy()
    ret_df['OLD_NAME'] = ret_df['NAME']
    for step in steps:
        new_names, new_codes = step(ret_df)
        # Rows the step did not touch keep their current name.
        ret_df['NAME'] = new_names.combine_first(ret_df['NAME'])
        ret_df['COMMENT'] = new_codes.combine(ret_df['COMMENT'], _prepend)
    # Literal replacement; an empty pattern has no meaning as a regex.
    return ret_df.replace('', np.nan)

In [11]:
# Run the full replacement pipeline on the loaded superfamily table; the result
# carries the cleaned NAME, the rule codes in COMMENT and the original OLD_NAME.
t = implement_replacements(df)

In [12]:
# COMMENT is NaN for rows no rule changed, so dropping rows with any NaN
# displays only the renamed entries (assumes NAME/OLD_NAME are never null).
t.dropna()

Unnamed: 0_level_0,NAME,COMMENT,OLD_NAME
# CATH_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.10.8.160,"DNA primase S, domain 2",S,DNA primase S; domain 2
1.10.8.190,"Carbon monoxide dehydrogenase alpha subunit, c...",CR,Carbon monoxide dehydrogenase alpha subunit. C...
1.10.8.270,Putative rabgap domain of human tbc1 domain fa...,L,putative rabgap domain of human tbc1 domain fa...
1.10.8.290,Uncharacterized protein sp1917 domain,L,uncharacterized protein sp1917 domain
1.10.8.300,Putative atpase (yp_676785.1),L,putative atpase (yp_676785.1)
1.10.10.10,Winged helix-like DNA-binding domain superfami...,R,Winged helix-like DNA-binding domain superfami...
1.10.10.460,"Ribonuclease hii, domain 2",CR,Ribonuclease hii. Domain 2
1.10.10.470,"Maltooligosyl trehalose synthase, domain 4",S,Maltooligosyl trehalose synthase; domain 4
1.10.10.480,"Phosphofructokinase, domain 3",S,Phosphofructokinase; domain 3
1.10.10.520,"Ubiquitin activating enzymes (uba3), chain: B,...",CR,"Ubiquitin activating enzymes (Uba3). Chain: B,..."
