In [29]:
import os
os.chdir(os.getcwd())

In [30]:
import pandas as pd

In [62]:
def predict_operons(filepath, threshold=50):
    # Read the PTT file into a dataframe
    df = pd.read_csv(filepath, skiprows=2, sep='\t')

    # Rename columns
    df.columns = ['Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym', 'Code', 'COG', 'Product']

    # Extract start and stop positions
    start = pd.Series([int(loc.split('..')[0]) for loc in df['Location']])
    stop = pd.Series([int(loc.split('..')[1]) for loc in df['Location']])
    df['Start'] = start
    df['Stop'] = stop

    # Sort by start position
    df = df.sort_values('Start')

    # Initialize variables
    operons_list = []
    operon = [df['Gene'].iloc[0]]
    prev_stop = df['Stop'].iloc[0]

    # Iterate through genes
    for i in range(1, len(df)):
        curr_start = df['Start'].iloc[i]
        curr_stop = df['Stop'].iloc[i]
        curr_gene = df['Gene'].iloc[i]

        # Check if current gene is in same operon as previous gene and thereby setting the difference to less than 50 as mentioned in the question
        if df['Strand'].iloc[i] == df['Strand'].iloc[i-1] and curr_start - prev_stop < threshold:
            operon.append(curr_gene)
        else:
            operons_list.append(operon)
            operon = [curr_gene]

        prev_stop = curr_stop
    operons_list.append(operon)

    # Return the number of operons and a brief list of the operons
    return (len(operons_list),operons_list)


In [63]:
predict_operons('E_coli_K12_MG1655.ptt')

(2669,
 [['thrL'],
  ['thrA', 'thrB', 'thrC'],
  ['yaaX'],
  ['yaaA'],
  ['yaaJ'],
  ['talB'],
  ['mog'],
  ['yaaH'],
  ['yaaW', 'yaaI'],
  ['dnaK'],
  ['dnaJ'],
  ['insL'],
  ['hokC', 'mokC'],
  ['nhaA'],
  ['nhaR'],
  ['insB', 'insA'],
  ['rpsT'],
  ['yaaY', 'ribF', 'ileS', 'lspA'],
  ['fkpB', 'ispH'],
  ['rihC'],
  ['dapB'],
  ['carA', 'carB'],
  ['caiF'],
  ['caiE', 'caiD'],
  ['caiC'],
  ['caiB'],
  ['caiA', 'caiT'],
  ['fixA', 'fixB'],
  ['fixC', 'fixX'],
  ['yaaU'],
  ['kefF', 'kefC'],
  ['folA'],
  ['apaH', 'apaG', 'rsmA', 'pdxA', 'surA'],
  ['lptD'],
  ['djlA'],
  ['rluA', 'rapA'],
  ['polB'],
  ['araD'],
  ['araA', 'araB'],
  ['araC'],
  ['yabI'],
  ['thiQ', 'thiP', 'thiB'],
  ['sgrR'],
  ['sgrT'],
  ['setA'],
  ['leuD', 'leuC', 'leuB', 'leuA'],
  ['leuL'],
  ['leuO'],
  ['ilvI', 'ilvH'],
  ['cra'],
  ['mraZ',
   'rsmH',
   'ftsL',
   'ftsI',
   'murE',
   'murF',
   'mraY',
   'murD',
   'ftsW',
   'murG'],
  ['murC', 'ddlB', 'ftsQ', 'ftsA'],
  ['ftsZ'],
  ['lpxC'],
  ['secM

In [64]:
predict_operons('Halobacterium_NRC1.ptt')

(1464,
 [['-', 'yvrO', '-'],
  ['-'],
  ['glmS', 'graD5', 'graD2'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-', '-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-', '-'],
  ['-', '-'],
  ['-'],
  ['-', '-'],
  ['-'],
  ['-', '-'],
  ['ntp'],
  ['-', '-', '-'],
  ['ugd'],
  ['graD6'],
  ['-', '-'],
  ['rfbU1'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['lpg'],
  ['-'],
  ['lpb'],
  ['galE2'],
  ['graD3'],
  ['gmd'],
  ['-', '-'],
  ['-'],
  ['-', '-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['-'],
  ['moaE'],
  ['moeB'],
  ['moaA'],
  ['moeA2', 'moeA1'],
  ['pimT1', '-'],
  ['-'],
  ['-'],
  ['gapB'],
  ['hsp2'],
  ['rimK'],
  ['rpl10e'],
  ['cspD1'],
  ['-'],
  ['serA3'],
  ['-'],
  ['rmeM', 'rmeS'],
  ['rmeR', '-'],
  ['-'],
  ['-'],
  ['yusZ1'],
  ['-'],
  ['-', '-'],
  ['-', '-'],
  ['-'],
  ['trp1', '-'],
  ['-', '-'],
  ['-'],
  ['hsp4'],
  ['-'],
  ['-']

In [65]:
predict_operons('Synechocystis_PCC6803_uid159873.ptt')

(2521,
 [['slr0612'],
  ['slr0613'],
  ['sll0558'],
  ['sll1214'],
  ['sll1213'],
  ['rfbD'],
  ['psbA2'],
  ['speA'],
  ['ligA'],
  ['slr1315'],
  ['fecC'],
  ['fecD'],
  ['fecE', 'fecB'],
  ['iutA'],
  ['pchR'],
  ['sll1204', 'sll1203', 'sll1202'],
  ['fhuA'],
  ['pcrR'],
  ['sll1407'],
  ['fhuA'],
  ['sll1405', 'exbB'],
  ['slr1484'],
  ['slr1485'],
  ['slr1488'],
  ['pchR'],
  ['fhuA', 'fecB'],
  ['fecB', 'slr1493'],
  ['slr1494'],
  ['sll1401'],
  ['sll1400', 'ssl2733', 'sll1399', 'psbW'],
  ['slr1495'],
  ['sll1397'],
  ['sll1396'],
  ['hypD'],
  ['rfbD', 'msrA'],
  ['glgA'],
  ['sll1392'],
  ['slr1501'],
  ['slr1113'],
  ['slr1114', 'slr1115', 'slr1116'],
  ['slr1117'],
  ['sll1064'],
  ['rffM'],
  ['sll1063'],
  ['slr1119'],
  ['sll1062', 'sll1061', 'sll1060', 'ssl2069'],
  ['ssr1853'],
  ['hofD'],
  ['adk'],
  ['dapB'],
  ['trx'],
  ['purL'],
  ['slr1122'],
  ['sll1054', 'sll1053'],
  ['gmk'],
  ['gpmB'],
  ['crtX'],
  ['sll1052'],
  ['cpcF'],
  ['sll1049'],
  ['slr1127'],
  [

In [66]:
predict_operons('B_subtilis_168.ptt')

(2662,
 [['dnaA'],
  ['dnaN'],
  ['yaaA', 'recF', 'yaaB'],
  ['gyrB'],
  ['gyrA'],
  ['yaaC'],
  ['guaB'],
  ['dacA'],
  ['yaaD', 'yaaE'],
  ['serS'],
  ['dck', 'dgk'],
  ['yaaH'],
  ['yaaI'],
  ['tadA'],
  ['dnaX', 'yaaK', 'recR', 'yaaL'],
  ['bofA'],
  ['csfB'],
  ['xpaC', 'yaaN'],
  ['yaaO', 'tmk'],
  ['yaaQ', 'yaaR', 'holB', 'yaaT', 'yabA'],
  ['yabB', 'yazA', 'yabC'],
  ['abrB'],
  ['metS'],
  ['yabD'],
  ['yabE'],
  ['rnmV', 'ksgA'],
  ['yabG'],
  ['veg'],
  ['sspF'],
  ['ipk'],
  ['purR', 'yabJ'],
  ['spoVG'],
  ['glmU', 'prs'],
  ['ctc'],
  ['pth'],
  ['yabK'],
  ['mfd'],
  ['spoVT'],
  ['yabM', 'yabN', 'yabO'],
  ['yabP', 'yabQ', 'divIC'],
  ['yabR'],
  ['spoIIE'],
  ['yabS', 'yabT'],
  ['tilS', 'hprT'],
  ['ftsH'],
  ['coaX', 'hslO', 'yacD'],
  ['cysK'],
  ['pabB', 'pabA', 'pabC', 'sul', 'folB', 'folK', 'yazB', 'dusB'],
  ['lysS'],
  ['ctsR', 'mcsA', 'mcsB', 'clpC'],
  ['radA', 'yacK'],
  ['yacL', 'ispD', 'ispF'],
  ['gltX'],
  ['cysE', 'cysS', 'mrnC', 'rlmB', 'yacP'],
  ['si

In [67]:
#function to predict the operons in GFF file using the same logic as that for the PTT file
def predict_operons_gff(filepath, threshold=50):
    # Read the GFF file into a dataframe
    df = pd.read_csv(filepath, sep='\t', comment='#', header=None, usecols=[0, 3, 4, 6], names=['contig', 'start', 'stop','strand'])
    df = df.sort_values(['contig','start'])
    operons_list = []
    operon = [df['contig'].iloc[0]]
    prev_stop = df['stop'].iloc[0]
    for i in range(1, len(df)):
        curr_start = df['start'].iloc[i]
        curr_stop = df['stop'].iloc[i]
        curr_gene = df['contig'].iloc[i]
        if df['strand'].iloc[i] == df['strand'].iloc[i-1] and curr_start - prev_stop < threshold:
            operon.append(curr_gene)
        else:
            operons_list.append(operon)
            operon = [curr_gene]
        prev_stop = curr_stop
    operons_list.append(operon)
    return len(operons_list),operons_list

In [68]:
predict_operons_gff('2088090036.gff')

(12108,
 [['HCP21_1000'],
  ['HCP21_10000', 'HCP21_10001', 'HCP21_10002'],
  ['HCP21_10003', 'HCP21_10004', 'HCP21_10005'],
  ['HCP21_10006', 'HCP21_10007', 'HCP21_10007', 'HCP21_10008'],
  ['HCP21_10009',
   'HCP21_1001',
   'HCP21_10010',
   'HCP21_10010',
   'HCP21_10011',
   'HCP21_10012'],
  ['HCP21_10013', 'HCP21_10014', 'HCP21_10015', 'HCP21_10016', 'HCP21_10017'],
  ['HCP21_10018', 'HCP21_10019'],
  ['HCP21_1002'],
  ['HCP21_10020'],
  ['HCP21_10020', 'HCP21_10021', 'HCP21_10022'],
  ['HCP21_10023', 'HCP21_10024', 'HCP21_10025'],
  ['HCP21_10026', 'HCP21_10027'],
  ['HCP21_10028'],
  ['HCP21_10029'],
  ['HCP21_1003', 'HCP21_10030'],
  ['HCP21_10031'],
  ['HCP21_10032'],
  ['HCP21_10033'],
  ['HCP21_10034'],
  ['HCP21_10035'],
  ['HCP21_10035'],
  ['HCP21_10036'],
  ['HCP21_10037'],
  ['HCP21_10038', 'HCP21_10039', 'HCP21_1004', 'HCP21_10040'],
  ['HCP21_10041', 'HCP21_10042'],
  ['HCP21_10043', 'HCP21_10044', 'HCP21_10045', 'HCP21_10046'],
  ['HCP21_10046', 'HCP21_10047'],
  ['