# Exercise 4.2: Computing things!

In [27]:
import pandas as pd
import numpy as np

In [28]:
df = pd.read_csv('data/c_elegans_egg_xa.csv', comment='#')

xa_high = df.loc[df['food']=='high', 'area (sq. um)'].values
xa_low = df.loc[df['food']=='low', 'area (sq. um)'].values

In [29]:
df

Unnamed: 0,food,area (sq. um)
0,high,1683
1,high,2061
2,high,1792
3,high,1852
4,high,2091
...,...,...
96,low,2143
97,low,2252
98,low,2222
99,low,2121


In [31]:
def xa_to_diameter(xa):
    # Calculate the diameters using the formula A = πd^2/4 => d^2 = 4A/π => d = sqrt(4A/π)
    diameters = np.sqrt(4 * np.array(xa) / np.pi)
    return diameters

cross_sectional_areas = df['area (sq. um)'].values
diameters = xa_to_diameter(cross_sectional_areas)
print(diameters)

[46.29105911 51.22642581 47.76657057 48.5596503  51.59790585 47.61973991
 49.33998388 47.89966242 47.21697198 46.94654036 49.08125119 49.84064959
 47.9926071  46.29105911 47.69988539 48.40207395 48.15152345 49.3141717
 49.57168871 47.87307365 48.30991705 46.29105911 46.12573337 46.24978308
 46.41466697 47.87307365 48.15152345 48.95137203 45.72372833 47.18999856
 46.68817945 45.98750791 46.53794651 52.2111661  48.70364742 47.23045291
 47.06842687 46.81073869 45.97366251 49.57168871 50.8397116  48.54653847
 52.08909166 48.24398292 48.40207395 51.58556628 52.55146594 50.31103472
 53.06982074 54.57203767 50.32368681 52.24773281 53.99739399 49.44309786
 53.87936676 47.9926071  52.41804019 47.87307365 52.11352942 51.21399674
 52.44232467 50.47526453 50.8397116  51.56087828 49.84064959 55.96578669
 50.72688754 50.58864976 52.18677405 52.44232467 51.78264653 52.57568879
 51.86863366 52.67246879 49.05530287 52.67246879 50.72688754 50.07003758
 52.32078957 49.18490759 53.72554372 46.67454189 49.

# Exercise 4.3: Working with two-dimensional arrays

In [6]:
import pandas as pd
import numpy as np

In [7]:
A = np.array(
    [
        [6.7, 1.3, 0.6, 0.7],
        [0.1, 5.5, 0.4, 2.4],
        [1.1, 0.8, 4.5, 1.7],
        [0.0, 1.5, 3.4, 7.5],
    ]
)

b = np.array([1.1, 2.3, 3.3, 3.9])

In [8]:
A[:1:]

array([[6.7, 1.3, 0.6, 0.7]])

In [28]:
A[0:,:3:2]

array([[6.7, 0.6],
       [0.1, 0.4],
       [1.1, 4.5],
       [0. , 3.4]])

In [10]:
A[A > 2]

array([6.7, 5.5, 2.4, 4.5, 3.4, 7.5])

In [13]:
np.diag(A)

array([6.7, 5.5, 4.5, 7.5])

In [15]:
x = np.linalg.solve(A, b)

In [16]:
np.dot(A, x)

array([1.1, 2.3, 3.3, 3.9])

In [17]:
np.transpose(A)

array([[6.7, 0.1, 1.1, 0. ],
       [1.3, 5.5, 0.8, 1.5],
       [0.6, 0.4, 4.5, 3.4],
       [0.7, 2.4, 1.7, 7.5]])

In [18]:
np.linalg.inv(A)

array([[ 0.15267508, -0.03365026, -0.01778   ,  0.00054854],
       [-0.00906001,  0.19788853,  0.03719385, -0.07090934],
       [-0.04391535, -0.0144834 ,  0.26880108, -0.05219479],
       [ 0.02172029, -0.0330119 , -0.12929526,  0.17117684]])

In [None]:
A_inv at A

In [19]:
A

array([[6.7, 1.3, 0.6, 0.7],
       [0.1, 5.5, 0.4, 2.4],
       [1.1, 0.8, 4.5, 1.7],
       [0. , 1.5, 3.4, 7.5]])

In [23]:
B = np.ravel(A)

In [24]:
B

array([6.7, 1.3, 0.6, 0.7, 0.1, 5.5, 0.4, 2.4, 1.1, 0.8, 4.5, 1.7, 0. ,
       1.5, 3.4, 7.5])

In [27]:
np.reshape(B, (4,4))

array([[6.7, 1.3, 0.6, 0.7],
       [0.1, 5.5, 0.4, 2.4],
       [1.1, 0.8, 4.5, 1.7],
       [0. , 1.5, 3.4, 7.5]])

# Exercise 4.4: Understanding and building ECDFs

In [29]:
import numpy as np
import pandas as pd

import bokeh.io
import bokeh.plotting

bokeh.io.output_notebook()

In [147]:
%load_ext blackcellmagic

In [149]:
# Dummy data set as a Pandas Series
rg = np.random.default_rng()
data = pd.Series(rg.normal(0, 1, size=100))

# Compute y-values for ECDF
ecdf_y = data.rank(method="first") / len(data)

# Make the plot
p = bokeh.plotting.figure(
    frame_height=200,
    frame_width=300,
    x_axis_label="x",
    y_axis_label="ECDF",
)

p.circle(data, ecdf_y)

bokeh.io.show(p)

In [34]:
def ecdf_vals(data):
    """
    Compute x and y values for a dot-style ECDF.

    Parameters:
        data (numpy.ndarray): One-dimensional NumPy array containing the data.

    Returns:
        tuple: A tuple containing two NumPy arrays, (x, y), where x represents
               the sorted data values and y represents the corresponding ECDF values.
    """
    # Sort the data in ascending order
    sorted_data = np.sort(data)

    # Calculate the ECDF values
    n = len(data)
    y = np.arange(1, n + 1) / n

    return sorted_data, y


In [35]:
ecdf_vals(data)

(array([-3.32799738, -2.2440383 , -1.86038271, -1.69423094, -1.66424718,
        -1.63055212, -1.5844356 , -1.5028484 , -1.43788873, -1.34637932,
        -1.32877118, -1.32394944, -1.30628394, -1.23537917, -1.23409289,
        -1.13787762, -1.05574927, -0.96553239, -0.96306447, -0.88483457,
        -0.79420422, -0.77593016, -0.72393614, -0.70599297, -0.68375221,
        -0.66244994, -0.6258163 , -0.51435136, -0.51022178, -0.50090345,
        -0.44559907, -0.43026903, -0.3879499 , -0.38580083, -0.29931923,
        -0.2850796 , -0.27413633, -0.20921551, -0.1840057 , -0.11921152,
        -0.11334938, -0.08155799, -0.071453  , -0.06392819,  0.025769  ,
         0.02607058,  0.06529492,  0.06886815,  0.09160145,  0.11691714,
         0.12753986,  0.1925493 ,  0.22654112,  0.24288829,  0.25285468,
         0.26474261,  0.26818546,  0.27688693,  0.30666117,  0.31673172,
         0.3414398 ,  0.39562517,  0.40387195,  0.41144445,  0.43127532,
         0.44836701,  0.45614233,  0.48027682,  0.4

# Exercise 4.1: Long-term trends in hybridization of Darwin finches

In [37]:
import pandas as pd
import numpy as np

In [43]:
ls

1FAG.pdb
1J6Z.pdb
1OLG.pdb
20160804_wt_O2_HG104_0uMIPTG.csv
2ERK.pdb
96_well.csv
[36mHG105_images[m[m/
aligned.fasta
[36mbacterial_growth[m[m/
bcd_gradient.csv
beak_depth_scandens_1975.csv
beak_depth_scandens_2012.csv
bee_sperm.csv
bee_weight.csv
bsub_100x_cfp.tif
bsub_100x_phase.tif
c_elegans_egg_xa.csv
collins_switch.csv
fisher_iris.csv
frog_tongue_adhesion.csv
genbank_seq.txt
gfmt_sleep.csv
grant_1973.csv
grant_1975.csv
grant_1987.csv
grant_1991.csv
grant_2012.csv
grant_complete.csv
grant_heredity.csv
lambda_phage.txt
[36mleica_tiffs[m[m/
q18a_lac.csv
q18m_lac.csv
reeves_gradient_width_various_methods.csv
retina_spikes.csv
salmonella_spi1_region.fna
wt_lac.csv
xa_high_food.csv
xa_low_food.csv


In [104]:
# read the CSV file
grant_1973 = pd.read_csv("grant_1973.csv", comment = '#')

# have a look at the data
grant_1973

Unnamed: 0,band,species,yearband,beak length,beak depth
0,20123,fortis,73,9.25,8.05
1,20126,fortis,73,11.35,10.45
2,20128,fortis,73,10.15,9.55
3,20129,fortis,73,9.95,8.75
4,20133,fortis,73,11.55,10.15
...,...,...,...,...,...
84,20224,scandens,73,15.65,9.95
85,20245,scandens,73,14.05,9.55
86,20254,scandens,73,13.85,9.15
87,20259,scandens,73,14.95,10.45


In [105]:
year=1973

grant_1973['year']=year
grant_1973.rename(columns={"beak length": "beak length(mm)"}, inplace=True)
grant_1973.rename(columns={"beak depth": "beak depth(mm)"}, inplace=True)
del grant_1973['yearband']

grant_1973

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,20123,fortis,9.25,8.05,1973
1,20126,fortis,11.35,10.45,1973
2,20128,fortis,10.15,9.55,1973
3,20129,fortis,9.95,8.75,1973
4,20133,fortis,11.55,10.15,1973
...,...,...,...,...,...
84,20224,scandens,15.65,9.95,1973
85,20245,scandens,14.05,9.55,1973
86,20254,scandens,13.85,9.15,1973
87,20259,scandens,14.95,10.45,1973


In [95]:
grant_1975 = pd.read_csv("grant_1975.csv", comment = '#')
grant_1975

Unnamed: 0,band,species,"Beak length, mm","Beak depth, mm"
0,2,fortis,9.40,8.00
1,9,fortis,9.20,8.30
2,12,fortis,9.50,7.50
3,15,fortis,9.50,8.00
4,305,fortis,11.50,9.90
...,...,...,...,...
398,20225,scandens,14.55,9.45
399,20252,scandens,14.05,9.05
400,20255,scandens,14.45,8.75
401,20266,scandens,15.05,9.45


In [106]:
year=1975

grant_1975['year']=year
grant_1975.rename(columns={"Beak length, mm": "beak length(mm)"}, inplace=True)
grant_1975.rename(columns={"Beak depth, mm": "beak depth(mm)"}, inplace=True)
grant_1975

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,2,fortis,9.40,8.00,1975
1,9,fortis,9.20,8.30,1975
2,12,fortis,9.50,7.50,1975
3,15,fortis,9.50,8.00,1975
4,305,fortis,11.50,9.90,1975
...,...,...,...,...,...
398,20225,scandens,14.55,9.45,1975
399,20252,scandens,14.05,9.05,1975
400,20255,scandens,14.45,8.75,1975
401,20266,scandens,15.05,9.45,1975


In [96]:
grant_1987 = pd.read_csv("grant_1987.csv", comment = '#')
grant_1987

Unnamed: 0,band,species,"Beak length, mm","Beak depth, mm"
0,14613,fortis,9.10,7.00
1,15487,fortis,9.14,7.12
2,15187,fortis,9.24,7.21
3,15284,fortis,9.20,7.30
4,14983,fortis,8.83,7.32
...,...,...,...,...
938,14847,scandens,15.30,10.10
939,15886,scandens,11.24,10.11
940,14877,scandens,15.20,10.27
941,8036,scandens,15.44,10.34


In [107]:
year=1987

grant_1987['year']=year
grant_1987.rename(columns={"Beak length, mm": "beak length(mm)"}, inplace=True)
grant_1987.rename(columns={"Beak depth, mm": "beak depth(mm)"}, inplace=True)
grant_1987

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,14613,fortis,9.10,7.00,1987
1,15487,fortis,9.14,7.12,1987
2,15187,fortis,9.24,7.21,1987
3,15284,fortis,9.20,7.30,1987
4,14983,fortis,8.83,7.32,1987
...,...,...,...,...,...
938,14847,scandens,15.30,10.10,1987
939,15886,scandens,11.24,10.11,1987
940,14877,scandens,15.20,10.27,1987
941,8036,scandens,15.44,10.34,1987


In [97]:
grant_1991 = pd.read_csv("grant_1991.csv", comment = '#')
grant_1991

Unnamed: 0,band,species,blength,bdepth
0,2639,fortis,10.30,8.95
1,2666,fortis,12.81,9.30
2,2753,fortis,10.89,10.35
3,2776,fortis,11.30,10.00
4,4229,fortis,10.05,8.62
...,...,...,...,...
616,17947,scandens,11.80,7.90
617,17950,scandens,10.30,8.70
618,17953,scandens,11.10,9.50
619,17956,scandens,13.70,9.00


In [108]:
year=1991

grant_1991['year']=year
grant_1991.rename(columns={"blength": "beak length(mm)"}, inplace=True)
grant_1991.rename(columns={"bdepth": "beak depth(mm)"}, inplace=True)
grant_1991

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,2639,fortis,10.30,8.95,1991
1,2666,fortis,12.81,9.30,1991
2,2753,fortis,10.89,10.35,1991
3,2776,fortis,11.30,10.00,1991
4,4229,fortis,10.05,8.62,1991
...,...,...,...,...,...
616,17947,scandens,11.80,7.90,1991
617,17950,scandens,10.30,8.70,1991
618,17953,scandens,11.10,9.50,1991
619,17956,scandens,13.70,9.00,1991


In [98]:
grant_2012 = pd.read_csv("grant_2012.csv", comment = '#')
grant_2012

Unnamed: 0,band,species,blength,bdepth
0,19022,fortis,10.0,8.5
1,19028,fortis,12.5,8.9
2,19032,fortis,9.3,7.5
3,19041,fortis,10.3,9.6
4,19044,fortis,11.0,9.2
...,...,...,...,...
243,21295,scandens,14.2,9.3
244,21297,scandens,13.0,9.8
245,21340,scandens,14.6,8.9
246,21342,scandens,13.1,9.8


In [109]:
year=2012

grant_2012['year']=year
grant_2012.rename(columns={"blength": "beak length(mm)"}, inplace=True)
grant_2012.rename(columns={"bdepth": "beak depth(mm)"}, inplace=True)
grant_2012

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,19022,fortis,10.0,8.5,2012
1,19028,fortis,12.5,8.9,2012
2,19032,fortis,9.3,7.5,2012
3,19041,fortis,10.3,9.6,2012
4,19044,fortis,11.0,9.2,2012
...,...,...,...,...,...
243,21295,scandens,14.2,9.3,2012
244,21297,scandens,13.0,9.8,2012
245,21340,scandens,14.6,8.9,2012
246,21342,scandens,13.1,9.8,2012


In [100]:
grant_1973 = grant_1973.rename(columns={"yearband": "year"})

In [85]:
grant_1973 = grant_1973.replace('73', '1973') #does not work

In [92]:
grant_1973['year'] = grant_1973['year'].replace(['73'], '1973') #does not work

In [110]:
df_all=pd.concat([grant_1973, grant_1975, grant_1987, grant_1991, grant_2012])
df_all

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,20123,fortis,9.25,8.05,1973
1,20126,fortis,11.35,10.45,1973
2,20128,fortis,10.15,9.55,1973
3,20129,fortis,9.95,8.75,1973
4,20133,fortis,11.55,10.15,1973
...,...,...,...,...,...
243,21295,scandens,14.20,9.30,2012
244,21297,scandens,13.00,9.80,2012
245,21340,scandens,14.60,8.90,2012
246,21342,scandens,13.10,9.80,2012


In [116]:
grant_clean = df_all.drop_duplicates(subset=['band', 'year'])

In [117]:
grant_clean

Unnamed: 0,band,species,beak length(mm),beak depth(mm),year
0,20123,fortis,9.25,8.05,1973
1,20126,fortis,11.35,10.45,1973
2,20128,fortis,10.15,9.55,1973
3,20129,fortis,9.95,8.75,1973
4,20133,fortis,11.55,10.15,1973
...,...,...,...,...,...
243,21295,scandens,14.20,9.30,2012
244,21297,scandens,13.00,9.80,2012
245,21340,scandens,14.60,8.90,2012
246,21342,scandens,13.10,9.80,2012


In [139]:
import iqplot

df = pd.read_csv("grant_complete.csv")

p = iqplot.stripbox(
    data=df,
    q="beak depth (mm)",
    cats=['species'],
    spread = 'jitter',
    color_column='species',
)

bokeh.io.show(p)

In [140]:
import iqplot

df = pd.read_csv("grant_complete.csv")

p = iqplot.strip(
    data=df,
    q="beak depth (mm)",
    cats=['species', 'year'],
    spread = 'jitter',
    color_column='year',
)

bokeh.io.show(p)

In [146]:
import iqplot

df = pd.read_csv("grant_complete.csv")

p = iqplot.stripbox(
    data=df,
    q="beak depth (mm)",
    cats=['year','species'],
    spread = 'jitter',
    color_column='year',
)

bokeh.io.show(p)