In [3]:
import os
import gzip
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd

# Step 1: Read DNA Sequences from all .fa.gz chromosome files
def read_fasta_gz(folder_path):
    genome = {}
    fasta_files = sorted([f for f in os.listdir(folder_path) if f.endswith('.fa.gz')])
    
    for idx, filename in enumerate(fasta_files, start=1):
        chrom_id = str(idx)
        with gzip.open(os.path.join(folder_path, filename), "rt") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                genome[chrom_id] = str(record.seq).upper()
    return genome

genome_folder = "dataset/dataset1/dna_chromosomes/"
genome_data = read_fasta_gz(genome_folder)

# Step 2: Parse all GFF3 files in a folder
def parse_all_gff3(folder_path):
    all_genes = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.gff3'):
            chrom_id = filename.split('.')[-2].split('chromosome.')[-1]
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                for line in file:
                    if line.startswith("#"):
                        continue
                    parts = line.strip().split('\t')
                    if len(parts) != 9:
                        continue
                    chrom, _, feature_type, start, end, _, strand, _, attributes = parts
                    if feature_type == "gene":
                        gene_id = None
                        for attr in attributes.split(';'):
                            if attr.startswith("ID=") or attr.startswith("gene_id=") or "GeneID" in attr:
                                gene_id = attr.split('=')[-1]
                                break
                        if gene_id:
                            if gene_id.startswith("gene:"):
                                gene_id = gene_id.replace("gene:", "")
                            all_genes.append({
                                'gene_id': gene_id,
                                'chrom': chrom_id,
                                'start': int(start),
                                'end': int(end),
                                'strand': strand
                            })
    return pd.DataFrame(all_genes)

gff3_folder = "dataset/dataset1/gff3_files/"
gene_annotations = parse_all_gff3(gff3_folder)

# Step 3: Read GEO expression data
def read_geo_expression(geo_file):
    df = pd.read_csv(geo_file, sep='\t')
    df.rename(columns={df.columns[0]: 'gene_id'}, inplace=True)
    return df

geo_file = "dataset/dataset1/geo_file/genes_to_alias_ids.tsv"
geo_data = read_geo_expression(geo_file)

# Step 4: Merge annotations and expression data
merged = pd.merge(gene_annotations, geo_data, on='gene_id')

# Step 5: Add gene DNA sequences to merged data
def extract_sequence(row, genome):
    chrom = row['chrom']
    start = row['start']
    end = row['end']
    strand = row['strand']

    if chrom not in genome:
        return None

    try:
        seq = genome[chrom][start-1:end]
        if strand == '-':
            seq = str(Seq(seq).reverse_complement())
        return seq
    except:
        return None

merged['sequence'] = merged.apply(lambda row: extract_sequence(row, genome_data), axis=1)
merged.dropna(subset=['sequence'], inplace=True)

# Step 6: Save final dataset
merged.to_csv("model.csv", index=False)
print("model.csv saved successfully")

model.csv saved successfully


In [4]:
df=pd.read_csv("model.csv")

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32923 entries, 0 to 32922
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gene_id           32923 non-null  object
 1   chrom             32923 non-null  int64 
 2   start             32923 non-null  int64 
 3   end               32923 non-null  int64 
 4   strand            32923 non-null  object
 5   B73 Zm00001eb.1   32923 non-null  object
 6   Zm00001d027230    32923 non-null  object
 7   AGPv4_Zm00001d.2  32923 non-null  object
 8   sequence          30437 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.3+ MB
None


In [6]:
print(df.isnull().sum())

gene_id                0
chrom                  0
start                  0
end                    0
strand                 0
B73 Zm00001eb.1        0
Zm00001d027230         0
AGPv4_Zm00001d.2       0
sequence            2486
dtype: int64


In [7]:
df.shape

(32923, 9)

In [8]:
df.drop_duplicates(subset='gene_id', inplace=True)


In [9]:

df = df[df['sequence'].notnull() & (df['sequence'] != '')]

df = df[df['sequence'].str.len() > 50]


In [10]:
df['strand'] = df['strand'].map({'+': 1, '-': 0})

In [11]:
import numpy as np
def one_hot_encode(seq):
    mapping = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        'N': [0, 0, 0, 0]
    }
    return [mapping.get(base.upper(), [0, 0, 0, 0]) for base in seq]

# Apply to entire column and store as NumPy arrays
df['onehot_sequence'] = df['sequence'].apply(one_hot_encode)

# Example: Show one-hot of first sequence
print(np.array(df['onehot_sequence'].iloc[0]))

[[0 0 0 1]
 [0 1 0 0]
 [0 0 0 1]
 ...
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 0]]


In [12]:
df['chrom'] = df['chrom'].astype(str)
df['start'] = df['start'].astype(int)
df['end'] = df['end'].astype(int)
df['strand'] = df['strand'].astype(str)
df['sequence'] = df['sequence'].astype(str)

In [13]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,B73 Zm00001eb.1,Zm00001d027230,AGPv4_Zm00001d.2,sequence,onehot_sequence
0,Zm00001eb000020,1,41214,46762,0,B73 Zm00001eb.1,Zm00001d027231,AGPv4_Zm00001d.2,TCTCAGGTTTGAAACAAGCCACAGCTTAATTTCCATACAGTCACTG...,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,..."
1,Zm00001eb000050,1,108554,114382,0,B73 Zm00001eb.1,Zm00001d027234,AGPv4_Zm00001d.2,CTGCCGAGCAGTGGAGAAGGACCGGCGTCCGGAGGTGGCCGGCGGC...,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."
2,Zm00001eb000060,1,188559,189581,0,B73 Zm00001eb.1,Zm00001d027239,AGPv4_Zm00001d.2,CCATTGCCCTAGCACACGTACGCGCAGATCCGTCCGCTCCAATCCG...,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,..."
3,Zm00001eb000070,1,190192,198832,0,B73 Zm00001eb.1,Zm00001d027240,AGPv4_Zm00001d.2,ATGAAGAGAAAGGAAAACTCCGCGCACTCGGCGTCGCCTCTCAACA...,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."
4,Zm00001eb000080,1,200262,203393,0,B73 Zm00001eb.1,Zm00001d027242,AGPv4_Zm00001d.2,TTGGACCATCTTTATGCTTTATGGGGCTAAGAAGCATAAGACCTGT...,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."


In [14]:
df.rename(columns={
    'B73 Zm00001eb.1': 'gene_name_b73',
    'Zm00001d027230': 'gene_id_4a',
    'AGPv4_Zm00001d.2': 'gene_id_4b'
}, inplace=True)
df = df.drop(columns=['sequence'])

In [15]:
df.rename(columns={"onehot_sequence":"Sequence"},inplace=True)

In [16]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequence
0,Zm00001eb000020,1,41214,46762,0,B73 Zm00001eb.1,Zm00001d027231,AGPv4_Zm00001d.2,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,..."
1,Zm00001eb000050,1,108554,114382,0,B73 Zm00001eb.1,Zm00001d027234,AGPv4_Zm00001d.2,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."
2,Zm00001eb000060,1,188559,189581,0,B73 Zm00001eb.1,Zm00001d027239,AGPv4_Zm00001d.2,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,..."
3,Zm00001eb000070,1,190192,198832,0,B73 Zm00001eb.1,Zm00001d027240,AGPv4_Zm00001d.2,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."
4,Zm00001eb000080,1,200262,203393,0,B73 Zm00001eb.1,Zm00001d027242,AGPv4_Zm00001d.2,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."


In [17]:
max_len = max(len(seq) for seq in df["Sequence"])
print("Maximum sequence length:", max_len)


Maximum sequence length: 751401


In [18]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequence
0,Zm00001eb000020,1,41214,46762,0,B73 Zm00001eb.1,Zm00001d027231,AGPv4_Zm00001d.2,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,..."
1,Zm00001eb000050,1,108554,114382,0,B73 Zm00001eb.1,Zm00001d027234,AGPv4_Zm00001d.2,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."
2,Zm00001eb000060,1,188559,189581,0,B73 Zm00001eb.1,Zm00001d027239,AGPv4_Zm00001d.2,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,..."
3,Zm00001eb000070,1,190192,198832,0,B73 Zm00001eb.1,Zm00001d027240,AGPv4_Zm00001d.2,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."
4,Zm00001eb000080,1,200262,203393,0,B73 Zm00001eb.1,Zm00001d027242,AGPv4_Zm00001d.2,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."


In [19]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['gene_id','gene_name_b73', 'gene_id_4a', 'gene_id_4b']
for col in label_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Save or return the cleaned numeric DataFrame
df.to_csv("model.csv", index=False)


In [21]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequence
0,0,1,41214,46762,0,0,12958,0,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,..."
1,1,1,108554,114382,0,0,12959,0,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."
2,2,1,188559,189581,0,0,12960,0,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,..."
3,3,1,190192,198832,0,0,12961,0,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."
4,4,1,200262,203393,0,0,12962,0,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,..."


In [22]:
import pandas as pd
import ast

# If "Sequence" is stored as a string, parse it
df["Sequence"] = df["Sequence"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Mapping from one-hot vector to integer
onehot_map = {
    (0, 0, 0, 1): 0,
    (0, 0, 1, 0): 1,
    (0, 1, 0, 0): 2,
    (1, 0, 0, 0): 3
}

# Conversion function
def onehot_to_int(seq):
    return [onehot_map.get(tuple(vec), 4) for vec in seq]  # 4 for unknown patterns

# Apply conversion
df["Int_Sequence"] = df["Sequence"].apply(onehot_to_int)


In [23]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequence,Int_Sequence
0,0,1,41214,46762,0,0,12958,0,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,...","[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,...","[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,...","[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,...","[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,...","[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."


In [24]:
df.drop(columns=["Sequence"])

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Int_Sequence
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."
...,...,...,...,...,...,...,...,...,...
32916,27617,9,162278225,162280767,0,0,25923,0,"[2, 0, 3, 0, 1, 1, 3, 0, 0, 0, 0, 0, 1, 3, 2, ..."
32917,27618,9,162498899,162511418,1,0,25924,0,"[0, 3, 1, 0, 2, 3, 2, 0, 3, 2, 2, 3, 3, 1, 3, ..."
32919,27619,9,162552801,162575880,0,0,25925,0,"[1, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 0, 1, 2, ..."
32920,27620,9,162707413,162742092,0,0,25926,0,"[0, 1, 1, 1, 3, 0, 3, 2, 2, 3, 3, 3, 2, 3, 1, ..."


In [25]:
df.rename(columns={"Int_Sequence":"Sequences"},inplace=True)

In [26]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequence,Sequences
0,0,1,41214,46762,0,0,12958,0,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0,...","[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,...","[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,...","[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,...","[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0,...","[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."


In [27]:
df.drop(columns=["Sequence"],axis=1)

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequences
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."
...,...,...,...,...,...,...,...,...,...
32916,27617,9,162278225,162280767,0,0,25923,0,"[2, 0, 3, 0, 1, 1, 3, 0, 0, 0, 0, 0, 1, 3, 2, ..."
32917,27618,9,162498899,162511418,1,0,25924,0,"[0, 3, 1, 0, 2, 3, 2, 0, 3, 2, 2, 3, 3, 1, 3, ..."
32919,27619,9,162552801,162575880,0,0,25925,0,"[1, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 0, 1, 2, ..."
32920,27620,9,162707413,162742092,0,0,25926,0,"[0, 1, 1, 1, 3, 0, 3, 2, 2, 3, 3, 3, 2, 3, 1, ..."


In [33]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequences
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."


In [34]:
FIXED_LEN = 50000

def pad_or_truncate(seq, length=FIXED_LEN):
    if len(seq) < length:
        return seq + [0] * (length - len(seq))  # Pad with zeros
    else:
        return seq[:length]  # Truncate

# Apply to your Sequences column
df["Fixed_Sequences"] = df["Sequences"].apply(pad_or_truncate)


In [35]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequences,Fixed_Sequences
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ...","[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ...","[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ...","[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ...","[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ...","[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."


In [36]:
df.drop(columns=["Sequences"],axis=1)

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Fixed_Sequences
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."
...,...,...,...,...,...,...,...,...,...
32916,27617,9,162278225,162280767,0,0,25923,0,"[2, 0, 3, 0, 1, 1, 3, 0, 0, 0, 0, 0, 1, 3, 2, ..."
32917,27618,9,162498899,162511418,1,0,25924,0,"[0, 3, 1, 0, 2, 3, 2, 0, 3, 2, 2, 3, 3, 1, 3, ..."
32919,27619,9,162552801,162575880,0,0,25925,0,"[1, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 0, 1, 2, ..."
32920,27620,9,162707413,162742092,0,0,25926,0,"[0, 1, 1, 1, 3, 0, 3, 2, 2, 3, 3, 3, 2, 3, 1, ..."


In [41]:
df.head()

Unnamed: 0,gene_id,chrom,start,end,strand,gene_name_b73,gene_id_4a,gene_id_4b,Sequences
0,0,1,41214,46762,0,0,12958,0,"[0, 2, 0, 2, 3, 1, 1, 0, 0, 0, 1, 3, 3, 3, 2, ..."
1,1,1,108554,114382,0,0,12959,0,"[2, 0, 1, 2, 2, 1, 3, 1, 2, 3, 1, 0, 1, 1, 3, ..."
2,2,1,188559,189581,0,0,12960,0,"[2, 2, 3, 0, 0, 1, 2, 2, 2, 0, 3, 1, 2, 3, 2, ..."
3,3,1,190192,198832,0,0,12961,0,"[3, 0, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 1, 3, 3, ..."
4,4,1,200262,203393,0,0,12962,0,"[0, 0, 1, 1, 3, 2, 2, 3, 0, 2, 0, 0, 0, 3, 0, ..."


In [40]:
df.rename(columns={"Fixed_Sequences":"Sequences"},inplace=True)

In [43]:
length=[]
for i in range(30):
    seq=df["Sequences"][i]
    len1=len(seq)
    length.append(len1)

In [44]:
print(length)

[50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000]


In [46]:
seq_lengths = []
for i in range(len(df)):
    seq = df.iloc[i]["Sequences"]
    if isinstance(seq, list):
        seq_lengths.append(len(seq))
    else:
        seq_lengths.append(0)


In [49]:
count=0
for seq in seq_lengths:
    if seq>40000:
        count=count+1
print("the count of sequences ",count)

the count of sequences  29849


In [59]:
max_length=max(len(seq1) for seq1 in df["Sequence"] )
print(max_length)

751401


In [51]:
dim=[]
for i in range(130):
    seq=df["Sequences"][i]
    len1=len(seq)
    dim.append(len1)
print(dim)

[50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000]
