# DISpANALYSIS

In [None]:
# from sys import path
# path.append('/Users/esti/Documents/PROYECTOS/3DPROTUCELL/pmoss/pMoSS/pmoss')

In [1]:
# Load the packages needed to run the scripts in this notebook
import numpy as np
import pandas as pd
# pmoss helpers used in the cells below: the diagnosis routine, the builder of
# group combinations, the plotting/table functions, the exponential-fit
# decision data and the data loader.
from pmoss.analysis import compute_diagnosis
from pmoss import create_combination
from pmoss.display import scatterplot_decrease_parameters, plot_pcurve_by_measure, composed_plot, table_of_results
from pmoss.models.exponential_fit import decission_data_exponential
from pmoss.loaders import morphoparam
# Silence all warnings for the rest of the session: the 'ignore' filter is
# installed without a category, so it applies to every warning type.
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'models'

### Information about the data.
Provide the path to the folder containing the data (csv or excel) and the name of the file.

Note: The column identifying the group to which each value belongs must be named "Condition" and should be the first column.

In [None]:
# Folder containing the data, relative to the kernel's working directory.
# The previous machine-specific absolute override ('/Users/...') was removed so
# the notebook runs on any checkout. Keep the trailing slash: a later cell
# builds a file name as `path + name`.
path = '../data/morphology_taxol/'

# Name of the file containing the information. It can be a csv or excel file.
# Note that the column containing the labels of the group must have the name
# "Condition" and should be the first column of the file.
file_name = 'cell_data.xlsx'

### Estimation of the p-value function 

Initialization parameters

In [2]:
# Size of the N-grid, i.e. how many "n-values" are evaluated
# (alternative value noted by the author: 250).
grid_size = 10

# Smallest and largest "n-value" at which the Monte Carlo cross-validation
# is computed.
n0 = 2
Nmax = 2500

# Guards against ending up with a single iteration at the highest "n-value":
# final iterations = k*(m/min(m,Nmax)), m being the size of the group with
# the fewest observations.
k = 20

# Guards against millions of iterations at n0 (the lowest "n-value"):
# initial iterations = np.log((m/n0)*initial_portion), with the same m.
# (alternative value noted by the author: 1/15.)
initial_portion = 0.001



Parameters for the calculation of the decision index

In [3]:
# Statistical test to evaluate.
test = 'MannWhitneyU'
# Method used to estimate the p-value function.
method = 'exponential'

# Significance level: alpha for a 100(1-alpha) statistical significance.
alpha = 0.05
# The gamma of the paper corresponds to gamma*alpha.
gamma = 5e-06

Estimation of the p-value function and assessment of the decision index.

In [4]:
# Run the diagnosis on the input file with the settings chosen in the
# previous cells; the call returns three objects, kept as pvalues, param
# and Theta.
pvalues, param, Theta = compute_diagnosis(
    file_name,
    path=path,
    gamma=gamma,
    alpha=alpha,
    grid_size=grid_size,
    n0=n0,
    Nmax=Nmax,
    k=k,
    initial_portion=initial_portion,
    method=method,
    test=test,
)

NameError: name 'compute_diagnosis' is not defined

Save the results

In [None]:
# Write the computed p-values to a csv file (without the index column) so
# that they can be reloaded later without repeating the estimation.
pvalues.to_csv(
    '../data/morphology_taxol/cell_morphology_pvalues.csv',
    index=False,
)


### Plot of results

In [None]:
# Location of the inputs: data folder (with trailing slash) and raw data file.
path = r'../data/morphology_taxol/'
file_name = r'cell_data.xlsx'

# Reload the p-values previously stored as csv in the data folder.
df = pd.read_csv(
    path + 'cell_morphology_pvalues.csv',
    sep=',',
)

# Data, variables and names of the groups that will be plotted.
data, variables, group_labels = morphoparam(
    file_name,
    path=path,
)

# Group comparisons to analyse. Here every combination is generated from the
# group labels; they can also be declared by hand, in which case each one must
# be written exactly as it appears in the csv of the p-values.
combination = create_combination(group_labels)

# Data related to the exponential parameters for each comparison and variable.
param = decission_data_exponential(
    df,
    combination,
    variables,
    sign_level=0.05,
    gamma=5e-06,
)

In [None]:
# Build the table of results; leaving it as the last expression of the cell
# makes the notebook display it.
table = table_of_results(
    param,
    variables,
    combination,
)
table

In [None]:
# Colour palette (hex codes) shared by the plots that follow.
colors = [
    '#FF0000',
    '#F89800',
    '#0200DE',
]

# Scatter plot of the exponential parameters a and c from p(n) = aexp(-cn).
scatterplot_decrease_parameters(
    df,
    combination,
    variables,
    path=path,
    fs=10,
    width=5,
    height=5,
    plot_type="exp-param",
    colors=colors,
)

In [None]:
# Scatter plot of the estimator of the minimum sample size needed to observe
# statistically significant differences.
scatterplot_decrease_parameters(
    df,
    combination,
    variables,
    path=path,
    fs=10,
    width=5,
    height=5,
    plot_type="sampled-nalpha",
    colors=colors,
)

In [None]:
# Scatter plot of the sample size n that satisfies alpha = aexp(-cn), i.e. the
# theoretical minimum sample size needed to observe statistically significant
# differences.
scatterplot_decrease_parameters(
    df,
    combination,
    variables,
    path=path,
    fs=10,
    width=5,
    height=5,
    plot_type="theory-nalpha",
    colors=colors,
)

In [None]:
# p-function curves for the continuous measures.
colors = [
    '#FF0000',
    '#F89800',
    '#0200DE',
]

# Keep every variable except the one named 'protrusion_binary'.
continuous_variables = {
    key: variables[key]
    for key in variables
    if variables[key] != 'protrusion_binary'
}

plot_pcurve_by_measure(
    df,
    combination,
    continuous_variables,
    path=path,
    colors=colors,
)

In [None]:
# Composed plot for the continuous measures, built from both the raw data
# and the p-values.
colors = [
    '#FF0000',
    '#F89800',
    '#0200DE',
]

# Keep every variable except the one named 'protrusion_binary'.
continuous_variables = {
    key: variables[key]
    for key in variables
    if variables[key] != 'protrusion_binary'
}

composed_plot(
    data,
    df,
    group_labels,
    combination,
    continuous_variables,
    colors=colors,
    fs=20,
    width=32,
    height=10,
    bins=1500,
)

In [None]:
# p-function curves for the discrete measure.
discrete_variables = {'0': 'protrusion_binary'}

# Test to use for that variable, under the same key as in discrete_variables.
# Note: this rebinds `test`, which the parameter cell earlier in the notebook
# set to a plain string; re-run that cell before calling compute_diagnosis again.
test = {'0': 'ChiSquared'}

plot_pcurve_by_measure(
    df,
    combination,
    discrete_variables,
    path=path,
    test=test,
    colors=colors,
)

In [None]:
# Composed plot for the discrete measure, built from both the raw data and
# the p-values.
discrete_variables = {'0': 'protrusion_binary'}

# Test to use for that variable, under the same key as in discrete_variables.
# Note: this rebinds `test`, which the parameter cell earlier in the notebook
# set to a plain string; re-run that cell before calling compute_diagnosis again.
test = {'0': 'ChiSquared'}

composed_plot(
    data,
    df,
    group_labels,
    combination,
    discrete_variables,
    test=test,
    colors=colors,
    fs=20,
    width=30,
    height=10,
    bins=5,
)