In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from collections import Counter
from datetime import datetime

In [3]:
train_sequences=pd.read_csv("kaggledata/train_sequences.csv")
train_labels=pd.read_csv("kaggledata/train_labels.csv")

In [4]:
print(f"Number of training sequences: {len(train_sequences)}")


Number of training sequences: 844


In [5]:
train_sequences.head()


Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC,1995-01-26,"THE SARCIN-RICIN LOOP, A MODULAR RNA",>1SCL_1|Chain A|RNA SARCIN-RICIN LOOP|Rattus n...
1,1RNK_A,GGCGCAGUGGGCUAGCGCCACUCAAAAGGCCCAU,1995-02-27,THE STRUCTURE OF AN RNA PSEUDOKNOT THAT CAUSES...,>1RNK_1|Chain A|RNA PSEUDOKNOT|null\nGGCGCAGUG...
2,1RHT_A,GGGACUGACGAUCACGCAGUCUAU,1995-06-03,24-MER RNA HAIRPIN COAT PROTEIN BINDING SITE F...,>1RHT_1|Chain A|RNA (5'-R(P*GP*GP*GP*AP*CP*UP*...
3,1HLX_A,GGGAUAACUUCGGUUGUCCC,1995-09-15,P1 HELIX NUCLEIC ACIDS (DNA/RNA) RIBONUCLEIC ACID,>1HLX_1|Chain A|RNA (5'-R(*GP*GP*GP*AP*UP*AP*A...
4,1HMH_E,GGCGACCCUGAUGAGGCCGAAAGGCCGAAACCGU,1995-12-07,THREE-DIMENSIONAL STRUCTURE OF A HAMMERHEAD RI...,">1HMH_1|Chains A, C, E|HAMMERHEAD RIBOZYME-RNA..."


In [6]:
train_labels.head()


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,1SCL_A_1,G,1,13.76,-25.974001,0.102
1,1SCL_A_2,G,2,9.31,-29.638,2.669
2,1SCL_A_3,G,3,5.529,-27.813,5.878
3,1SCL_A_4,U,4,2.678,-24.900999,9.793
4,1SCL_A_5,G,5,1.827,-20.136,11.793


In [23]:
train_labels["id_"] = train_labels["ID"][:-2]
len(train_labels['id_'].unique())

137094

In [24]:
train_labels

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,id_
0,1SCL_A_1,G,1,13.760,-25.974001,0.102,1SCL_A_1
1,1SCL_A_2,G,2,9.310,-29.638000,2.669,1SCL_A_2
2,1SCL_A_3,G,3,5.529,-27.813000,5.878,1SCL_A_3
3,1SCL_A_4,U,4,2.678,-24.900999,9.793,1SCL_A_4
4,1SCL_A_5,G,5,1.827,-20.136000,11.793,1SCL_A_5
...,...,...,...,...,...,...,...
137090,8Z1F_T_82,U,82,,,,8Z1F_T_82
137091,8Z1F_T_83,C,83,,,,8Z1F_T_83
137092,8Z1F_T_84,A,84,,,,8Z1F_T_84
137093,8Z1F_T_85,U,85,,,,


In [7]:
train_sequences['length'] = train_sequences['sequence'].apply(len)
min_length = train_sequences['length'].min()
max_length = train_sequences['length'].max()
avg_length = train_sequences['length'].mean()

In [8]:
print(f"Sequence length statistics:")
print(f"- Minimum: {min_length}")
print(f"- Maximum: {max_length}")
print(f"- Average: {avg_length:.2f}")

Sequence length statistics:
- Minimum: 3
- Maximum: 4298
- Average: 162.43


In [9]:
fig = px.histogram(
    train_sequences,
    x='length',
    nbins=30,
    title='Distribution of RNA Sequence Lengths',
    labels={'length': 'Sequence Length', 'count': 'Count'},
    template='plotly_white'
)

# Customize the layout
fig.update_layout(
    width=900,
    height=600,
    title_font_size=20,
    xaxis_title_font_size=16,
    yaxis_title_font_size=16,
    xaxis_title='Sequence Length',
    yaxis_title='Count'
)

# Add grid lines for better readability
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# Customize histogram appearance
fig.update_traces(
    marker_color='royalblue',
    marker_line_color='darkblue',
    marker_line_width=1.5,
    opacity=0.75
)

# Show the plot
fig.show()

In [10]:
all_nucleotides = ''.join(train_sequences['sequence'].tolist())
nucleotide_counts = Counter(all_nucleotides)
total_nucleotides = sum(nucleotide_counts.values())

In [11]:
print("\nNucleotide distribution:")
for nucleotide, count in nucleotide_counts.most_common():
    percentage = (count / total_nucleotides) * 100
    print(f"{nucleotide}: {count} ({percentage:.2f}%)")


Nucleotide distribution:
G: 41450 (30.23%)
C: 33937 (24.75%)
A: 32524 (23.72%)
U: 29178 (21.28%)
-: 4 (0.00%)
X: 2 (0.00%)


In [12]:
nucleotide_df = pd.DataFrame(nucleotide_counts.items(), columns=['Nucleotide', 'Count'])
nucleotide_df['Percentage'] = (nucleotide_df['Count'] / total_nucleotides) * 100

# Define colors for nucleotides (standard biological coloring)
nucleotide_colors = {
    'A': '#32CD32',  # Green
    'U': '#FFD700',  # Gold
    'G': '#4169E1',  # Blue
    'C': '#FF6347',  # Red
    '-': '#808080',  # Gray
    'X': '#9370DB'   # Purple
}

# Create the bar plot
fig_bar = px.bar(
    nucleotide_df, 
    x='Nucleotide', 
    y='Count', 
    text='Percentage',
 #   labels={'count': 'Count', 'nucleotide': 'Nucleotide'},
    title='Nucleotide Distribution',
    color='Nucleotide',
    color_discrete_map=nucleotide_colors,
    template='plotly_white'
)

# Format the text to show percentages
fig_bar.update_traces(
    texttemplate='%{text:.2f}%', 
    textposition='outside'
)

# Set the size of the bar chart
fig_bar.update_layout(
    title_font_size=24,
    width=800,
    height=600,
    font=dict(size=16)
)



# Create the pie chart (filter out very small values for better visualization)
nucleotide_df_filtered = nucleotide_df[nucleotide_df['Percentage'] > 0.1]

fig_pie = px.pie(
    nucleotide_df_filtered, 
    values='Percentage', 
    names='Nucleotide',
    title='Nucleotide Distribution (%)',
    color='Nucleotide',
    color_discrete_map=nucleotide_colors,
    template='plotly_white'
)

# Update pie chart text format
fig_pie.update_traces(
    textinfo='label+percent', 
    textposition='inside',
    insidetextorientation='radial'
)

# Increase the size of the pie chart
fig_pie.update_layout(
    width=800,    
    height=600,  
    title_font_size=24,  
    font=dict(size=16)   
)


# Display plots one below the other
fig_bar.show()
fig_pie.show()


In [13]:
train_sequences['date'] = pd.to_datetime(train_sequences['temporal_cutoff'])
min_date = train_sequences['date'].min()
max_date = train_sequences['date'].max()
print(f"\nTemporal range: {min_date.date()} to {max_date.date()}")


Temporal range: 1995-01-26 to 2024-12-18


In [14]:
# Calculate year counts
year_counts = train_sequences['date'].dt.year.value_counts().sort_index()

# Convert to DataFrame for Plotly
year_df = pd.DataFrame({'Year': year_counts.index, 'Count': year_counts.values})

# Create bar chart using Plotly Express
fig = px.bar(
    year_df,
    x='Year',
    y='Count',
    title='Number of RNA Structures by Year',
    labels={'Year': 'Year', 'Count': 'Count'},
    template='plotly_white'
)

# Improve layout
fig.update_layout(
    width=900,
    height=600,
    title_font_size=20,
    xaxis_title_font_size=16,
    yaxis_title_font_size=16
)

# Format x-axis to ensure years display as integers without decimal points
fig.update_xaxes(
    type='category',  # Treat years as categories to maintain order
    tickmode='linear'  # Show all years
)

# Add grid lines for y-axis only
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

# Customize bar appearance
fig.update_traces(
    marker_color='royalblue',
    marker_line_color='darkblue',
    marker_line_width=1.5
)

# Show the plot
fig.show()


In [15]:
pdb_ids = set([target_id.split('_')[0] for target_id in train_sequences['target_id']])
print(f"Number of unique PDB IDs: {len(pdb_ids)}")

Number of unique PDB IDs: 735
