<a href="https://colab.research.google.com/github/Aleezahshaikh/Bioinformatics_problems/blob/main/Finding_a_Shared_Motif/Finding_a_Shared_Motif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [7]:
from Bio import SeqIO
from google.colab import files

def is_valid_dna(sequence):
    """
    Reports whether every character of a sequence is an uppercase DNA base.

    Parameters:
        sequence (str): The DNA sequence to inspect.

    Returns:
        bool: False as soon as a character other than A, T, C or G is found,
              True otherwise (so an empty sequence is accepted).
    """
    for nucleotide in sequence:
        if nucleotide not in 'ATCG':
            return False
    return True

def validate_sequence(seq, seq_num=None):
    """
    Checks one DNA sequence against the length limit and the allowed alphabet.

    The length check runs first; the character check (via is_valid_dna) is
    only performed when the length is acceptable. On failure a single error
    message is printed.

    Parameters:
        seq (str): The DNA sequence to check.
        seq_num (int, optional): Sequence number interpolated into the
            error message.

    Returns:
        bool: True when the sequence passes both checks, False otherwise.
    """
    problem = None
    if len(seq) > 1000:
        problem = f"Error: Sequence {seq_num} length exceeds 1000 base pairs."
    elif not is_valid_dna(seq):
        problem = f"Error: Sequence {seq_num} contains invalid DNA characters. Please enter only A, T, C, G."

    if problem is None:
        return True

    print(problem)
    return False

def _leftmost_shared_substring(base_string, others, length):
    """Return the leftmost substring of base_string with the given length that
    occurs in every sequence of others, or None if there is none."""
    for start in range(len(base_string) - length + 1):
        candidate = base_string[start:start + length]
        if all(candidate in seq for seq in others):
            return candidate
    return None


def find_longest_common_substring(dna_strings):
    """
    Finds the longest common substring among a list of DNA strings.

    Candidates are taken from the first sequence. When several substrings
    share the maximal length, the one starting earliest in the first
    sequence is returned.

    Parameters:
        dna_strings (list): A list of DNA sequences (strings).

    Returns:
        str: The longest common substring; the string "none" if the
             sequences share no substring of length >= 1; or "" (after
             printing a message) if the list itself is empty.
    """
    if not dna_strings:
        print("No DNA sequences found.")
        return ""

    base_string = dna_strings[0]
    others = dna_strings[1:]

    # A common substring can never be longer than the shortest sequence.
    low, high = 1, min(len(seq) for seq in dna_strings)
    best = None

    # Every substring of a common substring is itself common, so "a common
    # substring of length L exists" is monotonic in L and can be binary
    # searched instead of trying every length from the top down.
    while low <= high:
        mid = (low + high) // 2
        found = _leftmost_shared_substring(base_string, others, mid)
        if found is None:
            high = mid - 1
        else:
            best = found
            low = mid + 1

    return "none" if best is None else best

def get_dna_sequences():
    """
    Prompts the user to input DNA sequences manually or upload a FASTA file.

    The prompt repeats until one input attempt succeeds. Limits enforced:
    at most 100 sequences, each at most 1000 characters, containing only
    A, T, C, G. Sequences from both input paths are upper-cased before
    validation, so lower-case input is accepted.

    Returns:
        list: The validated DNA sequences as upper-case strings. The list is
              empty when the user asks for 0 sequences or the FASTA file
              yields no records.
    """
    while True:
        choice = input("CHOOSE \n (1) Do you want to input a sequence manually or \n (2) Upload a FASTA file \nEnter 1 or 2: ").strip()

        if choice == '1':
            print("\nExample of valid DNA sequence: ATGCATGC")
            raw_count = input("Enter the number of DNA sequences (up to 100): ")
            # A non-numeric answer used to abort the whole cell with ValueError;
            # report it and show the menu again instead.
            try:
                num_sequences = int(raw_count)
            except ValueError:
                print("Error: Please enter a whole number of sequences.")
                continue
            if num_sequences < 0:
                print("Error: Number of sequences cannot be negative.")
                continue
            if num_sequences > 100:
                print("Error: Maximum number of sequences is 100.")
                continue

            sequences = []
            sequences_valid = True

            for i in range(num_sequences):
                seq = input(f"Enter DNA sequence {i + 1} (up to 1000 bp): ").strip().upper()
                # Stop at the first bad sequence; validate_sequence prints the reason.
                if not validate_sequence(seq, seq_num=i+1):
                    sequences_valid = False
                    break
                sequences.append(seq)

            if sequences_valid:
                return sequences
            else:
                print("\nPlease try entering the sequences again.")
                continue

        elif choice == '2':
            print("\nExample of valid DNA sequence: ATGCATGC")
            print("Please upload a valid FASTA file containing DNA sequences.")
            uploaded = files.upload()
            if not uploaded:
                print("Error: No file uploaded.")
                continue

            # Only the first uploaded entry is used. Its key is handed to
            # SeqIO.parse as a path -- assumes Colab saved the upload under
            # that name in the working directory (TODO confirm).
            uploaded_file = list(uploaded.keys())[0]
            try:
                # Upper-case here so FASTA input is treated like manual input.
                sequences = [str(record.seq).upper() for record in SeqIO.parse(uploaded_file, "fasta")]
            except Exception as e:
                print(f"Error: Failed to read the FASTA file. {e}")
                continue

            if len(sequences) > 100:
                print("Error: FASTA file contains more than 100 sequences.")
                continue
            if any(len(seq) > 1000 for seq in sequences):
                print("Error: One or more sequences in the file exceed 1000 base pairs.")
                continue
            if any(not is_valid_dna(seq) for seq in sequences):
                print("Error: One or more sequences contain invalid DNA characters.")
                continue

            return sequences

        else:
            print("Invalid choice. Please enter 1 for manual input or 2 for file upload.")
            continue

# Main execution: collect the sequences, then report their longest shared motif.
sequences = get_dna_sequences()

if sequences:
    result = find_longest_common_substring(sequences)
    print("Longest common substring:", result)
else:
    # An empty list means there is nothing to compare.
    print("No sequences found. Please check the input.")


CHOOSE 
 (1) Do you want to input a sequence manually or 
 (2) Upload a FASTA file 
Enter 1 or 2: 1

Example of valid DNA sequence: ATGCATGC
Enter the number of DNA sequences (up to 100): 3
Enter DNA sequence 1 (up to 1000 bp): GATTACA
Enter DNA sequence 2 (up to 1000 bp): TAGACCA
Enter DNA sequence 3 (up to 1000 bp): ATACA
Longest common substring: TA
