(The spirit when programming: "*Why spend 5 days doing some work when you can spend 5 weeks automating it!*")

# Creating your own kernel / environment

Create a Python environment (where packages are installed):
- Open a terminal
- `python3 -m venv bioinfo`
- `source bioinfo/bin/activate`

Link it to a Jupyter kernel (so that your code execute with it):
- `pip install ipykernel`
- `python3 -m ipykernel install --user --name bioinfo --display-name "Python (bioinfo)"`

Then you can install new packages:
- `pip install pandas`

In your notebook, you'll need to `Kernel` -> `Change kernel` to this new kernel. It is probably safe to then `Kernel` -> `Restart & clear output`. Make sure this new kernel is the one active (upper right corner of your screen, just below the button `Control Panel`).

# Working with real data

In [1]:
import pandas
# We could import all the functions of pandas like this : from pandas import * (not recommanded)
# We should : import pandas as pd (better option) -> Use pd.read_table for exemple instead of pandas.read_table

## Read from / write to TSV and CSV files (in and out of Excel / R)

(Doc: https://pandas.pydata.org/docs/reference/api/pandas.read_table.html#pandas.read_table)

In [2]:
df = pandas.read_table("kmers.tsv")

In [3]:
df

Unnamed: 0,Seq,Id,Count
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7
...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7


(Doc: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html)

In [4]:
df.to_csv("test.csv")

## Dataframe manipulation

(.head(), .tail(), .shape, .colums, sum(), len(), .describe(), ["Col"], .drop())

In [5]:
df.head()

Unnamed: 0,Seq,Id,Count
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7


In [6]:
df.tail()

Unnamed: 0,Seq,Id,Count
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7
3999,AAACCCAGTCACTGGACACCTAAGTGTCCAC,4000,11


In [7]:
df.shape

(4000, 3)

In [8]:
df.columns

Index(['Seq', 'Id', 'Count'], dtype='object')

In [9]:
df.sum()

Seq      AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAGGACTCCAATATA...
Id                                                 8002000
Count                                               372648
dtype: object

In [10]:
df.Count

0       113422
1           93
2            5
3           88
4            7
         ...  
3995         5
3996         5
3997        10
3998         7
3999        11
Name: Count, Length: 4000, dtype: int64

In [11]:
sum(df.Count)

372648

In [12]:
len(df.Count)

4000

In [13]:
sum(df.Count)/len(df.Count)
#Average lenght of sequences

93.162

In [14]:
df.describe()

Unnamed: 0,Id,Count
count,4000.0,4000.0
mean,2000.5,93.162
std,1154.844867,1804.997654
min,1.0,5.0
25%,1000.75,7.0
50%,2000.5,14.0
75%,3000.25,42.0
max,4000.0,113422.0


In [15]:
df.Seq

0       AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
1       CAGGACTCCAATATAGAGATAAGTTAATGTC
2       TATGTAATTGGTTCCAGTGTGAGTCATTAAA
3       GATATTTTCGAAAAGTGGGATTTTTTAAACC
4       CTCCATCTCAGGTATTAGAATGAATGCTTAC
                     ...               
3995    AGCTGCAGGAACTCCCTCGTCACAGCTTAAA
3996    CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA
3997    GTCTGCCTTTATGGCCTTTGTACTCAAAGAA
3998    AGACTATAGTGAGCTCAGGTGATTGATACTC
3999    AAACCCAGTCACTGGACACCTAAGTGTCCAC
Name: Seq, Length: 4000, dtype: object

In [16]:
df["Seq"]
#Same thing as df.column name -> df.Seq

0       AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
1       CAGGACTCCAATATAGAGATAAGTTAATGTC
2       TATGTAATTGGTTCCAGTGTGAGTCATTAAA
3       GATATTTTCGAAAAGTGGGATTTTTTAAACC
4       CTCCATCTCAGGTATTAGAATGAATGCTTAC
                     ...               
3995    AGCTGCAGGAACTCCCTCGTCACAGCTTAAA
3996    CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA
3997    GTCTGCCTTTATGGCCTTTGTACTCAAAGAA
3998    AGACTATAGTGAGCTCAGGTGATTGATACTC
3999    AAACCCAGTCACTGGACACCTAAGTGTCCAC
Name: Seq, Length: 4000, dtype: object

(Arithmetics on columns)

In [17]:
df["Test"] = 23
df
#Added a new column with 23 as a value

Unnamed: 0,Seq,Id,Count,Test
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,23
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93,23
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5,23
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88,23
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7,23
...,...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5,23
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5,23
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10,23
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7,23


In [18]:
df["frac"] = df.Count / sum(df.Count)*100
df
#Added a column with fraction of count for every sequence 

Unnamed: 0,Seq,Id,Count,Test,frac
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,23,30.436766
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93,23,0.024957
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5,23,0.001342
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88,23,0.023615
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7,23,0.001878
...,...,...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5,23,0.001342
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5,23,0.001342
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10,23,0.002683
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7,23,0.001878


In [19]:
#Calculate how many sequences have a count higher than something (here a 100) -> We put it in the object o
#True = 1 and False = 0
o = df.Count >= 100
o

0        True
1       False
2       False
3       False
4       False
        ...  
3995    False
3996    False
3997    False
3998    False
3999    False
Name: Count, Length: 4000, dtype: bool

In [20]:
#Sum of all the True answers -> Higher count than a 100
sum(o)

543

In [21]:
#Extract the values of all the True entries
df.Seq[o]

0       AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
6       CTTCCATGGCTGTCCGGATCGCCGCACTGCA
7       GCACCAGGCCTTTCTCTAGAAGTCCTGAGAC
11      ATCAATCGACTCAGATGATCAGTTTTGGTAG
15      GGCCTGGGCTGGAAACAGCTCTGTGTGTGAA
                     ...               
3969    AGTTTTCTAAAAAGGGGGAGAGTTGTGAAAG
3973    ATTATCTGGGCGTGGTGGCATGTGCCTGTAG
3975    CCTATGCTTTCCTTGGCATCGGCTACACATC
3987    AAGGGTGTCCTGCTCCTTGACCACGATGGGG
3994    AACCCAAGGAAAGAGAAATGCTGGGGTGTAT
Name: Seq, Length: 543, dtype: object

## Guided exercise(s) here...

### 1) Add a column with nucleotide count (A)
([loc](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)[*row*,*col*], .[apply](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)())

In [22]:
#From yesterday's class
def count_nuc(my_seq, nuc):
    total = 0
    for c in my_seq:
        if c == nuc:
            total = total + 1
    return total

(First step: We'll try to first apply it to the second row, counting As only)

In [23]:
#Calcul only on the first sequence
count_nuc(df.Seq[1], "A")

12

In [24]:
#Use the .apply (check link for the arguments ex: axis)
df.apply(len, axis = 1)

0       5
1       5
2       5
3       5
4       5
       ..
3995    5
3996    5
3997    5
3998    5
3999    5
Length: 4000, dtype: int64

In [25]:
#Modify our function
def count_A (row):
    return count_nuc(row.Seq, "A")

In [26]:
#Calculate the number of As in each sequence
df.apply(count_A, axis=1)

0       31
1       12
2        9
3       10
4        9
        ..
3995     9
3996     6
3997     7
3998     9
3999    10
Length: 4000, dtype: int64

In [27]:
#Lets add the column
df["A"] = df.apply(count_A, axis=1)
df

Unnamed: 0,Seq,Id,Count,Test,frac,A
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,23,30.436766,31
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93,23,0.024957,12
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5,23,0.001342,9
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88,23,0.023615,10
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7,23,0.001878,9
...,...,...,...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5,23,0.001342,9
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5,23,0.001342,6
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10,23,0.002683,7
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7,23,0.001878,9


(Doc: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html)

In [28]:
#loc shows the whole rows of sequences with count higher than a 100
df.loc[o]

Unnamed: 0,Seq,Id,Count,Test,frac,A
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,23,30.436766,31
6,CTTCCATGGCTGTCCGGATCGCCGCACTGCA,7,299,23,0.080237,4
7,GCACCAGGCCTTTCTCTAGAAGTCCTGAGAC,8,128,23,0.034349,7
11,ATCAATCGACTCAGATGATCAGTTTTGGTAG,12,252,23,0.067624,9
15,GGCCTGGGCTGGAAACAGCTCTGTGTGTGAA,16,330,23,0.088555,6
...,...,...,...,...,...,...
3969,AGTTTTCTAAAAAGGGGGAGAGTTGTGAAAG,3970,112,23,0.030055,11
3973,ATTATCTGGGCGTGGTGGCATGTGCCTGTAG,3974,130,23,0.034885,4
3975,CCTATGCTTTCCTTGGCATCGGCTACACATC,3976,352,23,0.094459,5
3987,AAGGGTGTCCTGCTCCTTGACCACGATGGGG,3988,172,23,0.046156,5


In [29]:
#Can make a new dataframe with only that : df2 = df.loc[o]

(Doc: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html)

(Whiz-kid corner: lambda expressions, https://docs.python.org/3/tutorial/controlflow.html#lambda-expressions)

In [30]:
#lambda expression (no more need for intermediate functions)
df.apply(lambda row : count_nuc(row.Seq, "A"), axis=1)

0       31
1       12
2        9
3       10
4        9
        ..
3995     9
3996     6
3997     7
3998     9
3999    10
Length: 4000, dtype: int64

In [31]:
#Return a function directly
def row_count_nuc(nuc):
    def tmp(row):
        return count_nuc(row.Seq, nuc)
    return tmp

In [32]:
f = row_count_nuc("A")

In [33]:
df.apply(f, axis=1)

0       31
1       12
2        9
3       10
4        9
        ..
3995     9
3996     6
3997     7
3998     9
3999    10
Length: 4000, dtype: int64

### 2) Show the 10 sequences with the most number of A. How many reads do they represent? What % of the (truncated) transcriptome?
(.sort_values())

(Doc: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html)

In [35]:
df_A = df.sort_values(by="A", ascending=False)
df_A[0:10]

Unnamed: 0,Seq,Id,Count,Test,frac,A
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,23,30.436766,31
650,CAAAAAAAAAAAAACAAAAAACAAAAAAACA,651,13,23,0.003489,27
2507,AAATAACAAAAAATTAAAAAAAAAAAAAAAA,2508,5,23,0.001342,27
3678,AAAACAAAAACAAAACAAACAAACAAAAAAG,3679,26,23,0.006977,25
168,AAAAAAGATTAAAAAATTAAAAAAAAAAGAA,169,11,23,0.002952,25
2321,AATAACAGAAAGAAAACAAAAAGAAAAATAA,2322,57,23,0.015296,24
3880,AAAGAAAGAAAAAGAAAAAAAAAATAGCACA,3881,7,23,0.001878,24
2636,AAAATTAAAAAAAAAAAAAAAAAATTAGCCG,2637,6,23,0.00161,23
3491,AAAAGAAGACAAAAGAAAAGAGAAAGAAGAA,3492,7,23,0.001878,23
3186,ATAAATAAAAAGGAAAAGAAAAGAAAAGAAG,3187,24,23,0.00644,23


In [36]:
#How many reads they represent
sum(df_A[0:10].Count)

113578

In [37]:
#% of the transcriptome
df_A[0:10].Count / sum(df.Count)*100

0       30.436766
650      0.003489
2507     0.001342
3678     0.006977
168      0.002952
2321     0.015296
3880     0.001878
2636     0.001610
3491     0.001878
3186     0.006440
Name: Count, dtype: float64

### 3) How many sequences with 25 or more As? Then, check that the result is correct.
(Cond. row selection)

In [39]:
#Use brackets when there's a space in a column name
a = df_A["A"] >= 25
sum(a)

5

### 4) Clean up the dataframe (or re-load), add counts for all 4 nucl

In [89]:
df2 = pandas.read_table("kmers.tsv")

(Whiz kid corner: a function returning a function)

In [79]:
def count_C (row):
    return count_nuc(row.Seq, "C")

In [80]:
def count_T (row):
    return count_nuc(row.Seq, "T")

In [81]:
def count_G (row):
    return count_nuc(row.Seq, "G")

In [108]:
df2["A"] = df2.apply(count_A, axis=1)
df2["T"] = df2.apply(count_T, axis=1)
df2["C"] = df2.apply(count_C, axis=1)
df2["G"] = df2.apply(count_G, axis=1)
df2

Unnamed: 0,Seq,Id,Count,A,T,C,G,CG %
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,31,0,0,0,0.000000
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93,12,8,5,6,35.483871
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5,9,12,3,7,32.258065
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88,10,12,3,6,29.032258
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7,9,10,7,5,38.709677
...,...,...,...,...,...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5,9,6,10,6,51.612903
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5,6,8,7,10,54.838710
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10,7,11,7,6,41.935484
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7,9,9,5,8,41.935484


### 5) Add a %GC column

In [83]:
df2.C + df2.G

0        0
1       11
2       10
3        9
4       12
        ..
3995    16
3996    17
3997    13
3998    13
3999    16
Length: 4000, dtype: int64

In [84]:
df2["T"] + df2.A

0       31
1       20
2       21
3       22
4       19
        ..
3995    15
3996    14
3997    18
3998    18
3999    15
Length: 4000, dtype: int64

In [97]:
df2["CG %"] = ((df2.C+df2.G)/(df2.A+df2["T"]+df2.C+df2.G))*100
df2

Unnamed: 0,Seq,Id,Count,A,T,C,G,CG %
0,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,1,113422,31,0,0,0,0.000000
1,CAGGACTCCAATATAGAGATAAGTTAATGTC,2,93,12,8,5,6,35.483871
2,TATGTAATTGGTTCCAGTGTGAGTCATTAAA,3,5,9,12,3,7,32.258065
3,GATATTTTCGAAAAGTGGGATTTTTTAAACC,4,88,10,12,3,6,29.032258
4,CTCCATCTCAGGTATTAGAATGAATGCTTAC,5,7,9,10,7,5,38.709677
...,...,...,...,...,...,...,...,...
3995,AGCTGCAGGAACTCCCTCGTCACAGCTTAAA,3996,5,9,6,10,6,51.612903
3996,CTGAGCTCTCTGGGAAAGTCGTGTTCCGGAA,3997,5,6,8,7,10,54.838710
3997,GTCTGCCTTTATGGCCTTTGTACTCAAAGAA,3998,10,7,11,7,6,41.935484
3998,AGACTATAGTGAGCTCAGGTGATTGATACTC,3999,7,9,9,5,8,41.935484


### 6) And find the 10 sequences with highest GC content. How many reads do they represent?
(as a bonus, store this result in a new dataframe with only columns: Seq, Id, Count and %GC. You might need a few extra "tricks" with .loc[:,["Col1", "Col2"])

In [98]:
df_CG = df2.sort_values(by="CG %", ascending=False)
df_CG[0:10]

Unnamed: 0,Seq,Id,Count,A,T,C,G,CG %
1735,CTGCCCGCGCCCGCCGCCCAGGACCCCGCAC,1736,6,3,1,19,8,87.096774
1508,ACGCACCCCTCCCCGGCCTGGGCGGCGGCGA,1509,72,3,2,15,11,83.870968
963,CCGCGCCGCCCGGGCACCATGGCGGGGAAGG,964,7,4,1,12,14,83.870968
233,ACCCGGCGCCCGGCCAGTCCTGCGCGTCCCC,234,38,2,3,17,9,83.870968
3390,GCACGGGCGAAGGGGCCGCGGCCGCATGCCC,3391,64,4,1,12,14,83.870968
1222,CTGCGGGGGGCCTGCGGAGACGGCGCCCGCA,1223,5,3,2,11,15,83.870968
1751,CGGCGGTTGGCGGGGCACCACGGGAGGGGCC,1752,19,3,2,9,17,83.870968
3021,GGGTCCGGCGCCGCCGGCTGCGGCTTCGCGA,3022,21,1,4,12,14,83.870968
1899,AGGACTGGGGGGAGGCGGGCACCCCAGCGGG,1900,42,5,1,8,17,80.645161
2320,GCAGGTCGCCCTGGGGTGCCCGCGCGTGGGA,2321,9,2,4,10,15,80.645161


In [99]:
#How many reads they represent
sum(df_CG[0:10].Count)

283

### 7) How many sequences with ≥ 50%GC (1453)? What is the %GC of all the sequences joined together (44.8%)? How many sequence have %GC above this average value (2104)?

In [101]:
#How many sequences with ≥ 50%GC (1453)?
cg = df_CG["CG %"] >= 50
sum(cg)

1453

In [103]:
#What is the %GC of all the sequences joined together (44.8%)?
sum(df2["CG %"])/len(df2["CG %"])

44.83629032258051

In [105]:
#How many sequence have %GC above this average value (2104)?
cg44 = df_CG["CG %"] >= 44.83629032258051
sum(cg44)

2104

### (*Challenge!*): Which sequence would form the longest helix linking the 5' and 3' extremities (no overhang)?
(Answer: ATGAATTGAGTTGTGTCCCCCCAAAATTCAT, 7 base pairs, line number 2827)
(Working with GPT-4: https://chatgpt.com/share/4687f755-e276-4d69-94c3-68be6dc5b584  Search for "Can you write a function that...")

In [162]:
#ChatGPT answer
def longest_helix(sequences):
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    def find_longest_helix(seq):
        max_length = 0
        length = len(seq)
        for i in range(1, length+1):
            if all(seq[j] == complements[seq[-j-1]] for j in range(i)):
                max_length = 1
            else:
                break
        return max_length
    longest = {'sequence': '', 'helix_length': 0}
    for seq in sequences:
        helix_length = find_longest_helix(seq)
        if helix_length > longest['helix_length']:
            longest = {'sequence': seq, 'helix_length': helix_length}
    return longest
                     

In [163]:
result = longest_helix(df2.Seq)
print(f"Sequence: {result['sequence']}, Helix Length: {result['helix_length']}")
#Wrong answer

Sequence: TATGTAATTGGTTCCAGTGTGAGTCATTAAA, Helix Length: 1


In [165]:
def longest_helix(sequences):
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    def find_longest_helix(seq):
        max_length = 0
        n = len(seq)
        for i in range(n):
            match = True
            for j in range(i+1):
                if seq[j] != complements.get(seq[-(i + 1) + j], ''):
                    match = False
                    break
            if match:
                max_length = i + 1
            else:
                break
        return max_length
    longest = {'sequence': '', 'helix_length': 0}
    for seq in sequences:
        helix_length = find_longest_helix(seq)
        if helix_length > longest['helix_length']:
            longest = {'sequence': seq, 'helix_length': helix_length}
    return longest
        

In [166]:
result = longest_helix(df2.Seq)
print(f"Sequence: {result['sequence']}, Helix Length: {result['helix_length']}")
#Wrong answer

Sequence: GGGGACTGCGGAGGCCAGCAGTGAACACCCC, Helix Length: 4


In [179]:
def longest_helix(sequences):
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    def find_longest_helix(seq):
        max_length = 0
        n = len(seq)
        for i in range(n):
            match = True
            for j in range(i+1):
                if seq[j] != complements.get(seq[-(i + 1) + j], ''):
                    match = False
                    break
            if match:
                max_length = i + 1
                
        return max_length
    
    longest = {'sequence': '', 'helix_length': 0}
    for seq in sequences:
        helix_length = find_longest_helix(seq)
        if helix_length > longest['helix_length']:
            longest = {'sequence': seq, 'helix_length': helix_length}
    return longest
        

In [180]:
result = longest_helix(df2.Seq)
print(f"Sequence: {result['sequence']}, Helix Length: {result['helix_length']}")
#Wrong answer

Sequence: ATTTTATACCAACAACTGTTTCATCTTAAAA, Helix Length: 5


In [177]:
def longest_helix(sequences):
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    def find_longest_helix(seq):
        max_length = 0
        n = len(seq)
        for length in range (1, n // 2 + 1):
            match = True
            for i in range(length):
                if seq[i] != complements.get(seq[-length + i], ''):
                    match = False
                    break
            if match:
                max_length = length
                
        return max_length
    
    longest = {'sequence': '', 'helix_length': 0}
    for seq in sequences:
        helix_length = find_longest_helix(seq)
        if helix_length > longest['helix_length']:
            longest = {'sequence': seq, 'helix_length': helix_length}
    return longest
        

In [178]:
result = longest_helix(df2.Seq)
print(f"Sequence: {result['sequence']}, Helix Length: {result['helix_length']}")
#Wrong answer

Sequence: ATTTTATACCAACAACTGTTTCATCTTAAAA, Helix Length: 5
