In [3]:
# %pip install google-generativeai
# %pip install pandas
# %pip install sklearn
# %pip install spacy
# %pip install numpy
# %pip install dotenv

In [4]:
import requests

# URL for Chromosome 1 in FASTA format
url = 'ftp://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.1.fa.gz'

# Download the file
!curl {url} --output chromosome1.fa.gz

# Unzip the file
!gunzip chromosome1.fa.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 66.0M  100 66.0M    0     0  4191k      0  0:00:16  0:00:16 --:--:-- 6274k0     0  3921k      0  0:00:17  0:00:03  0:00:14 3921k


In [5]:
# Read the genome sequence from the FASTA file
with open('chromosome1.fa', 'r') as file:
    # Skip the header line
    next(file)
    # Read the sequence data
    genome_sequence = file.read().replace('\n', '')

In [6]:
token_count = len(genome_sequence)
print(f"Total nucleotides (tokens): {token_count}")

Total nucleotides (tokens): 248956422


In [7]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from .env file
load_dotenv()

# Configure API key from environment variable
api_key = os.getenv('GENAI_API_KEY')
genai.configure(api_key=api_key)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Create the model
generation_config = {
    "temperature": 0.8,
    "top_p": 0.95,
    "max_output_tokens": 5000,
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)

In [9]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load spaCy model for word embeddings
nlp = spacy.load("en_core_web_sm")

# Define a function to calculate semantic similarity
def calculate_similarity(text1, text2):
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    
    # Convert documents to vectors
    vector1 = doc1.vector
    vector2 = doc2.vector
    
    # Calculate cosine similarity
    similarity = cosine_similarity([vector1], [vector2])[0][0]
    return similarity


In [10]:
# Prompt and response
# Since the sequence is too large, let's take a segment for demonstration
segment_length = 15000  # 15,000 nucleotides
genome_segment = genome_sequence[:segment_length]

# Update the prompt with the segment
prompt = f"""
You are a genomic analyst AI specializing in identifying patterns, mutations, and areas of interest in DNA sequences.

Analyze the following human genome segment from Chromosome 1 and provide insights into:

- Repetitive sequences and their significance.
- Potential gene locations and their functions.
- Common mutations or SNPs (Single Nucleotide Polymorphisms) and their associated diseases.
- Any notable patterns that could be of medical research interest.

Genome Segment:
{genome_segment}
"""

response = model.generate_content(prompt)
documentation = response.text
print("Response:\n", documentation)

# Calculate and print the coherence score
similarity_score = calculate_similarity(prompt, documentation)
print(f"Coherence Score: {similarity_score:.2f}")

# Set a threshold for coherence (you can adjust this based on experimentation)
threshold = 0.7
if similarity_score >= threshold:
    print("The response is coherent with the prompt.")
else:
    print("The response may lack coherence with the prompt. Consider reviewing the output.")


Response:
 ## Analysis of Human Genome Segment from Chromosome 1:

This segment appears to be a complex region containing a mix of repetitive elements, potential gene locations, and areas of interest for medical research.

**1. Repetitive Sequences and Significance:**

* **Alu elements:** Multiple instances of the Alu sequence, a short interspersed nuclear element (SINE), are present. Alu elements are common in the human genome, often associated with:
    * **Chromosomal rearrangements:** Their presence can contribute to deletions, insertions, and translocations.
    * **Gene regulation:** They can act as enhancers or silencers of nearby genes.
    * **Disease susceptibility:** Variations in Alu elements are linked to certain diseases like cancer.
* **Microsatellites:** Several stretches of repetitive sequences like "GG" and "CA" are present. These are called microsatellites, also known as Simple Sequence Repeats (SSRs).
    * **Genetic markers:** Microsatellites are highly polymorphic

In [11]:
# Prompt and response
prompt = "Explain the concept of reinforcement learning."
response = model.generate_content(prompt)
documentation = response.text
print("Response:\n", documentation)

# Calculate and print the coherence score
similarity_score = calculate_similarity(prompt, documentation)
print(f"Coherence Score: {similarity_score:.2f}")

# Set a threshold for coherence (you can adjust this based on experimentation)
threshold = 0.7
if similarity_score >= threshold:
    print("The response is coherent with the prompt.")
else:
    print("The response may lack coherence with the prompt. Consider reviewing the output.")


Response:
 ## Reinforcement Learning: A Guide for the Curious

Imagine you're teaching a dog a new trick. You don't give it instructions, but instead, you reward it with treats whenever it performs the desired action. Over time, the dog learns to associate the action with the reward, and eventually, it masters the trick. This is the basic principle behind **reinforcement learning (RL)**.

**In essence, RL is a type of machine learning where an agent learns to interact with an environment to maximize its rewards.**  Instead of being explicitly programmed, the agent learns through trial and error, guided by feedback from the environment. 

Here's a breakdown of key concepts:

**1. The Agent:** The learner, the entity that interacts with the environment. This could be a robot, a software program, or even a human.

**2. The Environment:** The world the agent interacts with. This can be anything from a simulated game world to a real-world scenario like a stock market.

**3. State:** The cur