# Evaluate modENCODE FPKM of Adult Worms

Map the average expression values from the modENCODE FPKM data to WormCat categories and calculate some basic statistics per category.

The question we are asking: 

* __Are the expression levels of Unassigned Genes similar to those of other WormCat categories?__

__Answer: Yes__
* Unassigned Genes have a _Mean_ FPKM Expression of ___28___ and fall in the ___Third quartile___ when compared to all other WormCat categories
* Unassigned Genes have a _Std_ FPKM Expression of ___180___ and fall in the ___Second quartile___ when compared to all other WormCat categories

---
Notes:
* There are 492 Genes in WormCat that are not in ModENCODE
* Unique Categories
  * Category 1 has 34 unique categories.
  * Category 2 has 249 unique categories.
  * Category 3 has 471 unique categories.



In [None]:
import pandas as pd


In [None]:
# Load the WormCat whole-genome catalog (CSV) into a DataFrame
wormcat_df = pd.read_csv(
    filepath_or_buffer='./input_data/whole_genome_v2_nov-11-2021.csv',
)

In [None]:
# Display the WormCat catalog (the notebook renders the last expression of a cell)
wormcat_df

In [None]:
# Open the adult FPKM Excel workbook so that its tab (sheet) names can be listed.
# The path is kept in its own variable because a later cell passes it to pd.read_excel.
xlsx_file_nm = './input_data/fpkm_adult.xlsx'
fpkm_adult_xlsx = pd.ExcelFile(xlsx_file_nm)

In [None]:
# List the tab names found in the workbook and display them
sheet_names = fpkm_adult_xlsx.sheet_names
sheet_names

In [None]:
# Load only the 'comp FPKM' tab of the workbook into a DataFrame
fpkm_df = pd.read_excel(
    io=xlsx_file_nm,
    sheet_name='comp FPKM',
)

In [None]:
# Display the FPKM table read from the 'comp FPKM' tab
fpkm_df

In [None]:
# Attach the FPKM columns to the WormCat catalog by matching 'Wormbase ID' to 'Gene.ID'.
# A left join keeps every WormCat row, including those with no FPKM match.
wormcat_w_fpkm_df = wormcat_df.merge(
    fpkm_df,
    how='left',
    left_on='Wormbase ID',
    right_on='Gene.ID',
)


In [None]:
# Count the WormCat rows without a ModENCODE FPKM entry: after the left join
# their 'Gene.ID' is NaN (True = missing).
# there are 492 Genes in WormCat that are not in ModENCODE
missing = pd.isna(wormcat_w_fpkm_df['Gene.ID'])
missing.value_counts()

In [None]:
# Now that we know how many rows have no FPKM data, drop them so they do not
# skew the stats, remove the helper columns, and give the FPKM columns short names.
wormcat_w_fpkm_df = (
    wormcat_w_fpkm_df
    .dropna(subset=['Gene.ID'])
    .drop(columns=['Unnamed: 1', 'Gene.ID'])
    .rename(columns={'average adult FPKMs': 'Avg_FPKM', 'st dev': 'Std_FPKM'})
)
wormcat_w_fpkm_df

In [None]:
# How many distinct (non-null) values does each category column contain?
for col in ('Category 1', 'Category 2', 'Category 3'):
    n_unique = wormcat_w_fpkm_df[col].nunique()
    print(f'{col} has {n_unique} unique categories.')

In [None]:
# Calculate basic Avg_FPKM stats per category and write them to an Excel workbook.
# `data` maps each category column name to its stats DataFrame (indexed by the
# category value, with columns mean/std/count/min/max); later cells read
# data['Category 1'] for plotting.
category_columns = ['Category 1', 'Category 2', 'Category 3']
stat_names = ['mean', 'std', 'count', 'min', 'max']  # also the output column order

data = {
    category: wormcat_w_fpkm_df.groupby(category)['Avg_FPKM'].agg(stat_names)
    for category in category_columns
}

# Open the workbook once in write mode and add one sheet per category.
# Re-opening it in append mode for every category re-loads and re-saves the
# file each time, and append mode is rejected when xlsxwriter is the default
# engine; a single writer avoids both problems.
with pd.ExcelWriter('./output_data/wormcat_modencode.xlsx', mode='w') as writer:
    for category, stats in data.items():
        stats.to_excel(writer, sheet_name=category)


In [None]:
# Append the FULL per-gene details as an extra sheet of the stats workbook.
# Append mode is only implemented by the openpyxl engine; selecting it explicitly
# keeps this cell working when xlsxwriter is installed and would otherwise be
# picked as the default .xlsx writer.
with pd.ExcelWriter(
    './output_data/wormcat_modencode.xlsx',
    mode='a',
    engine='openpyxl',
) as writer:
    wormcat_w_fpkm_df.to_excel(writer, sheet_name='full detail', index=False)

In [None]:
# Standard deviation of Avg_FPKM within each Category 1 group, smallest first
grouped = wormcat_w_fpkm_df.groupby(by='Category 1')
stat = grouped['Avg_FPKM'].agg('std')
stat.sort_values(ascending=True)

In [None]:
# Mean of Avg_FPKM within each Category 1 group, smallest first
grouped = wormcat_w_fpkm_df.groupby(by='Category 1')
stat = grouped['Avg_FPKM'].agg('mean')
stat.sort_values(ascending=True)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of log2(mean Avg_FPKM) per WormCat Category 1, highest mean first.
# The UNASSIGNED bar is marked with an arrow and the ranking is split into four
# quartile bands (Q1 = highest means).
Category_1_mean = data['Category 1']['mean'].sort_values(ascending=False)

x = Category_1_mean.index
y = np.log2(Category_1_mean.values)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x, y)

# Set title and labels
ax.set_title("Modencode FPKM of Adult Worms", fontsize=15)
ax.set_ylabel("log2(mean FPKM) ")
ax.set_xlabel("WormCat Category 1")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Positions are derived from the data instead of being hard-coded, so the arrow
# and the quartile bands stay correct if the number or order of categories changes.
# NOTE: assumes the category labels are strings, so matplotlib draws bar i at x = i.
n_bars = len(x)
quartile_width = n_bars / 4

# Point an arrow at the UNASSIGNED bar (matched case-insensitively); skip the
# annotation when the category is absent or its bar height is not finite.
labels = [str(label).strip().upper() for label in x]
if 'UNASSIGNED' in labels:
    unassigned_pos = labels.index('UNASSIGNED')
    unassigned_height = y[unassigned_pos]
    if np.isfinite(unassigned_height):
        ax.annotate('UNASSIGNED',
                    xy=(unassigned_pos, unassigned_height),
                    xytext=(max(unassigned_pos - 5, 0), unassigned_height + 4),
                    fontsize=12,
                    arrowprops=dict(facecolor='green', shrink=0.05))

# Full-height separators between quartiles: with bars centred on 0..n-1 the
# boundary after k bars lies at k - 0.5 (8, 16.5 and 25 for 34 categories).
for q in (1, 2, 3):
    ax.axvline(x=q * quartile_width - 0.5, color='black')

# Quartile labels centred horizontally within each band
for q in range(4):
    ax.text((q + 0.5) * quartile_width - 0.5, 12, f'Q{q + 1}',
            style='italic', ha='center',
            bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart of log2(std of Avg_FPKM) per WormCat Category 1, highest std first.
# The UNASSIGNED bar is marked with an arrow and the ranking is split into four
# quartile bands (Q1 = highest standard deviations).
Category_1_std = data['Category 1']['std'].sort_values(ascending=False)

x = Category_1_std.index
y = np.log2(Category_1_std.values)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x, y)

# Set title and labels
ax.set_title("Modencode FPKM of Adult Worms", fontsize=15)
ax.set_ylabel("log2(std FPKM) ")
ax.set_xlabel("WormCat Category 1")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Positions are derived from the data instead of being hard-coded, so the arrow
# and the quartile bands stay correct if the number or order of categories changes.
# NOTE: assumes the category labels are strings, so matplotlib draws bar i at x = i.
n_bars = len(x)
quartile_width = n_bars / 4

# Point an arrow at the UNASSIGNED bar (matched case-insensitively); skip the
# annotation when the category is absent or its bar height is not finite
# (e.g. NaN std for a single-gene category).
labels = [str(label).strip().upper() for label in x]
if 'UNASSIGNED' in labels:
    unassigned_pos = labels.index('UNASSIGNED')
    unassigned_height = y[unassigned_pos]
    if np.isfinite(unassigned_height):
        ax.annotate('UNASSIGNED',
                    xy=(unassigned_pos, unassigned_height),
                    xytext=(max(unassigned_pos - 5, 0), unassigned_height + 3),
                    fontsize=12,
                    arrowprops=dict(facecolor='green', shrink=0.05))

# Full-height separators between quartiles: with bars centred on 0..n-1 the
# boundary after k bars lies at k - 0.5 (8, 16.5 and 25 for 34 categories).
for q in (1, 2, 3):
    ax.axvline(x=q * quartile_width - 0.5, color='black')

# Quartile labels centred horizontally within each band
for q in range(4):
    ax.text((q + 0.5) * quartile_width - 0.5, 15, f'Q{q + 1}',
            style='italic', ha='center',
            bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})


plt.show()