
Ribosome to Drop WBGene00004567


In [None]:
import pandas as pd

In [None]:
!pwd

In [None]:
expr_graph = 'c_elegans.PRJNA13758.WS287.expr_graph.csv'


def clean_str(x):
    """Return ``x`` with every double quote removed and outer whitespace trimmed."""
    return x.replace('"', '').strip()


def clean_float(x):
    """Remove double quotes and outer whitespace from ``x``, then parse it as a float."""
    return float(x.replace('"', '').strip())


columns = ["Gene", "Gene_name", "Life_stage", "Library", "Protocol", "FPKM_value"]

# The first line of the file is skipped and the names above are applied instead.
# Every column except "Gene" is passed through a cleaning converter.
expr_graph_df = pd.read_csv(
    expr_graph,
    low_memory=False,
    header=None,
    names=columns,
    skiprows=1,
    converters={
        **dict.fromkeys(('Gene_name', 'Life_stage', 'Library', 'Protocol'), clean_str),
        'FPKM_value': clean_float,
    },
)


In [None]:
# Print the number of rows in the loaded table, formatted with thousands separators.
print(f"{len(expr_graph_df):,}")

In [None]:
# Display the loaded expression table.
expr_graph_df

In [None]:
# Count how many rows carry each Life_stage label.
expr_graph_df['Life_stage'].value_counts()

In [None]:
# Keep only the rows that belong to gene WBGene00195017.
sams_4 = expr_graph_df[expr_graph_df['Gene'] == 'WBGene00195017']
sams_4

In [None]:
classical_stages = ['EE', 'LE', 'L1', 'L2', 'L3', 'L4', 'YA']

# Restrict the single-gene rows to the classical life stages.
classical_stages_df = sams_4[sams_4['Life_stage'].isin(classical_stages)]

# Per-protocol subsets, taken before the rows are sorted below.
median_df = classical_stages_df[classical_stages_df['Protocol'] == 'Median']
ribozero_df = classical_stages_df[classical_stages_df['Protocol'] == 'ribozero']
polya_df = classical_stages_df[classical_stages_df['Protocol'] == 'polyA']

# Sort by the stage label for display.
classical_stages_df = classical_stages_df.sort_values('Life_stage')
classical_stages_df

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def get_data_dict(data_df):
    """Collect classical-stage expression values from ``data_df`` for plotting.

    Rows whose ``Life_stage`` is not one of the seven classical stages
    (EE, LE, L1, L2, L3, L4, YA) are ignored.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the columns ``Life_stage``, ``Protocol`` and
        ``FPKM_value``; a missing column raises ``KeyError``.

    Returns
    -------
    dict
        ``'median'``   -> ``(stages, fpkm)``: ``stages`` is the fixed list of
        classical stages and ``fpkm[i]`` is the ``FPKM_value`` of the row with
        ``Protocol == 'Median'`` for ``stages[i]`` (``0`` when there is no such
        row; the last row wins when there are several).
        ``'ribozero'`` -> ``(x, y)``: stage labels and FPKM values of every
        ``Protocol == 'ribozero'`` row, in row order.
        ``'polya'``    -> ``(x, y)``: the same for ``Protocol == 'polyA'`` rows.
    """
    classical_stages = ['EE', 'LE', 'L1', 'L2', 'L3', 'L4', 'YA']
    stage_position = {stage: pos for pos, stage in enumerate(classical_stages)}

    classical_stages_df = data_df[data_df['Life_stage'].isin(classical_stages)]

    median_fpkm = [0] * len(classical_stages)
    points_ribozero_x, points_ribozero_y = [], []
    points_polya_x, points_polya_y = [], []

    # Columns are read by label. Indexing a label-indexed row Series with an
    # integer position is deprecated in recent pandas releases.
    rows = zip(classical_stages_df['Life_stage'],
               classical_stages_df['Protocol'],
               classical_stages_df['FPKM_value'])
    for life_stage, protocol, fpkm in rows:
        if protocol == 'Median':
            median_fpkm[stage_position[life_stage]] = fpkm
        elif protocol == 'ribozero':
            points_ribozero_x.append(life_stage)
            points_ribozero_y.append(fpkm)
        elif protocol == 'polyA':
            points_polya_x.append(life_stage)
            points_polya_y.append(fpkm)

    return {
        'median': (classical_stages, median_fpkm),
        'ribozero': (points_ribozero_x, points_ribozero_y),
        'polya': (points_polya_x, points_polya_y),
    }
    
def create_plot_classical_stages(plot_data, gene_name):
    """Draw median expression bars with per-library points on top and show the figure.

    Parameters
    ----------
    plot_data : dict
        Mapping with the keys ``'median'``, ``'ribozero'`` and ``'polya'``;
        each value is an ``(x, y)`` pair of equal-length sequences, as
        returned by ``get_data_dict``.
    gene_name : str
        Name appended to the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(4, 4))

    # Bars are drawn first, so the category order on the x axis follows the
    # order of the stage labels in plot_data['median'][0].
    ax.bar(plot_data['median'][0], plot_data['median'][1],
           color='lightgrey', width=0.5, zorder=2)
    ax.scatter(plot_data['ribozero'][0], plot_data['ribozero'][1],
               marker='D', s=10, color='purple', zorder=3)
    ax.scatter(plot_data['polya'][0], plot_data['polya'][1],
               color='green', s=10, zorder=4)

    ax.set_title(f"Classical stages {gene_name}")
    ax.set_ylabel("Expression (FPKM)")
    ax.set_xlabel("Life stages")
    ax.grid(axis='y', linestyle='-', linewidth=0.5)

    plt.show()


In [None]:
data = get_data_dict(sams_4)

# Print one "<stage> = <median FPKM>" line per classical stage.
for stage, fpkm in zip(data['median'][0], data['median'][1]):
    print(f"{stage} = {fpkm:,.2f}")


In [None]:
# Draw the classical-stage figure from the values collected in `data`.
create_plot_classical_stages(data, "sams-4")

## Look at distributions

In [None]:
# List the column names of the expression table.
expr_graph_df.columns

In [None]:
classical_stages = ['EE', 'LE', 'L1', 'L2', 'L3', 'L4', 'YA']

# All genes, restricted to the classical life stages.
classical_stages_df = expr_graph_df[expr_graph_df['Life_stage'].isin(classical_stages)]

# Of those, only the rows whose Protocol is 'Median'.
classical_stages_median_df = classical_stages_df[classical_stages_df['Protocol'] == 'Median']
classical_stages_median_df

In [None]:
# Median rows for the 'EE' life stage only.
classical_stages_EE_df = classical_stages_median_df.query("Life_stage == 'EE'")
classical_stages_EE_df

In [None]:
# Build a separate frame with two derived columns:
#   FPKM_value_p     - FPKM_value plus a tiny offset (1e-9)
#   FPKM_value_Whole - FPKM_value rounded to zero decimals
# (A log2 column over FPKM_value_p was tried here and is currently disabled.)
classical_stages_EE_copy_df = classical_stages_EE_df.assign(
    FPKM_value_p=classical_stages_EE_df['FPKM_value'] + 1e-9,
    FPKM_value_Whole=classical_stages_EE_df['FPKM_value'].round(0),
)

In [None]:
# Smallest and largest rounded FPKM value across the EE rows.
print(classical_stages_EE_copy_df['FPKM_value_Whole'].min())
print(classical_stages_EE_copy_df['FPKM_value_Whole'].max())
# Number of rows at each rounded FPKM value.
classical_stages_EE_copy_df['FPKM_value_Whole'].value_counts()

In [None]:

# Keep only the EE rows whose FPKM_value is greater than 1.0.
classical_stages_EE_1_plus = classical_stages_EE_copy_df[classical_stages_EE_copy_df['FPKM_value'] > 1.0]
print(f'{len(classical_stages_EE_1_plus)}')

# Number of rows at each rounded FPKM value (index = value, data = count).
dist_fpkm = classical_stages_EE_1_plus['FPKM_value_Whole'].value_counts()
dist_fpkm_df = dist_fpkm.to_frame()

# Read values and counts from the Series itself. The column name that
# to_frame() produces differs between pandas versions ('FPKM_value_Whole'
# before 2.0, 'count' from 2.0 on), so attribute access on the frame is fragile.
x = dist_fpkm.index.tolist()
y = dist_fpkm.tolist()
print(f'{len(x)=}')
print(f'{len(y)=}')

fig, ax = plt.subplots()
ax.bar(x, y)

# set plot title and axis labels
plt.title('Histogram of Values')
plt.xlabel('Value')
plt.ylabel('Frequency')

# display the plot
plt.show()

In [None]:
# Smallest and largest rounded FPKM value among the rows with FPKM_value > 1.0.
print(classical_stages_EE_1_plus['FPKM_value_Whole'].min())
print(classical_stages_EE_1_plus['FPKM_value_Whole'].max())
# Number of rows at each rounded FPKM value.
classical_stages_EE_1_plus['FPKM_value_Whole'].value_counts()

In [None]:
# Summary statistics of the EE median rows.
classical_stages_EE_df.describe()

In [None]:
# NOTE(review): `disease_association_df` is not defined in any cell visible in
# this notebook, so this line would raise NameError on a fresh kernel unless the
# frame is created elsewhere. TODO: confirm its origin or remove this cell.
disease_association_df.index

In [None]:
# Calculate the Z-score of every FPKM value. np.std uses ddof=0 by default,
# i.e. the population standard deviation.
mean = np.mean(classical_stages_EE_df['FPKM_value'])
std = np.std(classical_stages_EE_df['FPKM_value'])

# `assign` builds a new frame instead of writing a column into
# classical_stages_EE_df, which is itself a filtered selection of another
# frame. This avoids SettingWithCopyWarning and makes the cell safe to re-run.
classical_stages_EE_df = classical_stages_EE_df.assign(
    zscore=(classical_stages_EE_df['FPKM_value'] - mean) / std
)

# Identify outliers: rows more than one standard deviation from the mean.
outliers = classical_stages_EE_df[classical_stages_EE_df['zscore'].abs() > 1]
print(len(outliers['Gene']))


In [None]:
#list(outliers['Gene'])

In [None]:
#outliers = ['WBGene00004424', 'WBGene00004443', 'WBGene00004453', 'WBGene00004497', 'WBGene00004567', 'WBGene00004677', 'WBGene00011558', 'WBGene00023020', 'WBGene00023068', 'WBGene00044704', 'WBGene00045370', 'WBGene00268212', 'WBGene00305379']

# Drop every row whose Gene appears among the outliers, then report how many remain.
classical_stages_EE_df = classical_stages_EE_df.loc[
    ~classical_stages_EE_df['Gene'].isin(outliers['Gene'])
]
len(classical_stages_EE_df)

In [None]:
import matplotlib.pyplot as plt

# Toy data for a stand-alone bar-chart example (rebinds the notebook-level x and y).
x = list(range(1, 6))        # 1, 2, 3, 4, 5
y = list(range(10, 0, -2))   # 10, 8, 6, 4, 2

# Draw the bars on an explicit Axes and render the figure.
fig, ax = plt.subplots()
ax.bar(x, y)
plt.show()


In [None]:
# Display the frame of counts per rounded FPKM value.
dist_fpkm_df

In [None]:
import requests
from bs4 import BeautifulSoup

base_url="https://wormbase.org/species/c_elegans/gene/WBGene00195017"
# NOTE(review): everything after '#' is a URL fragment, which HTTP clients do not
# send to the server, so this request presumably returns the plain gene page
# for WBGene00008205. TODO: confirm this is the intended resource.
expression_url = "https://wormbase.org/species/c_elegans/gene/WBGene00008205#0-9f16c324-10-9f6c324/tools/rnaseq/expression_dataset_locator.cgi"

# Send a GET request. requests has no default timeout, so set one to avoid
# hanging the kernel, and fail loudly on an HTTP error instead of parsing an
# error page.
response = requests.get(expression_url, timeout=30)
response.raise_for_status()

# parse the HTML content of the web page using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

# get the formatted HTML as a string
html_content = soup.prettify()

# Write the HTML to a file with an explicit encoding, so that non-ASCII
# characters do not depend on the platform default.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(html_content)

# Look up the chart container by id. `find` returns None when the element is
# not present in the downloaded markup.
my_div = soup.find('div', {'id': 'highcharts-0'})

print(my_div)


In [None]:
from bs4 import BeautifulSoup

# create a BeautifulSoup object from an HTML file
with open('example.html') as f:
    soup = BeautifulSoup(f, 'html.parser')

# get the formatted HTML as a string
html_content = soup.prettify()

# Write the HTML to a file with an explicit encoding, so that non-ASCII
# characters in the markup do not depend on the platform default.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(html_content)


In [None]:
import os
import sys
# Show the directory that contains the Python interpreter running this kernel.
os.path.dirname(sys.executable)

In [None]:
!pwd
