In [None]:
from libraries import *
from parameters import *
from sklearn import linear_model
from scipy import stats
from scipy.spatial import distance
from scipy.cluster import hierarchy

In [None]:
# Work from the project directory: the data file and the per-cluster output
# files in the cells below are addressed with relative paths.
os.chdir(projectDir)
# Last expression of the cell, so the notebook displays the directory now in
# effect (the original call came first and its result was discarded).
os.getcwd()

In [None]:
def bayes_cov_col(Y,X,cols,lm):
    """
    Re-weight covariate entries by how much they improve each cell's fit.

    For every covariate in ``cols`` that is positive in more than two cells,
    the model predicts expression for those cells twice: once with the
    covariate matrix as given and once with that covariate set to 0. The
    per-gene difference in squared error (with minus without) is divided by
    ``2 * var + 0.5`` (``var`` = variance of the gene across those cells),
    summed over the genes whose variance is non-zero, and mapped through
    ``1 / (1 + exp(.))``. The resulting weight in [0, 1] replaces the
    covariate's entry for each of those cells.

    @Y    = Expression matrix, cells x genes, expecting pandas dataframe
    @X    = Covariate matrix, cells x covariates, expecting pandas dataframe
            whose rows are labelled and ordered like the rows of Y
    @cols = The subset of columns that the EM should be performed over, expecting list
    @lm   = linear model object; only ``lm.predict`` is called on it

    Returns a copy of X holding the adjusted weights. X itself is not
    modified, and covariates positive in two or fewer cells are left as-is.
    """

    # Squared error of the model as given, for every cell and gene.
    Yhat=pd.DataFrame(np.asarray(lm.predict(X)), index=Y.index, columns=Y.columns)
    SSE_all=np.square(Y.subtract(Yhat))
    X_adjust=X.copy()

    for curcov in cols:

        curcells=X[X[curcov]>0].index

        if len(curcells)<=2:
            continue

        # Counterfactual design matrix for the affected cells only: same
        # covariates, but with the current one switched off. Copying just the
        # selected rows avoids duplicating all of X for every covariate.
        X_sub=X.loc[curcells].copy()
        X_sub[curcov]=0

        Y_sub=Y.loc[curcells]

        GENE_var=2.0*Y_sub.var(axis=0)
        # Genes that are constant across these cells are left out of the sum.
        vargenes=GENE_var[GENE_var>0].index

        Yhat_notcur=pd.DataFrame(np.asarray(lm.predict(X_sub)),
                                 index=Y_sub.index, columns=Y_sub.columns)

        SSE_notcur=np.square(Y_sub.subtract(Yhat_notcur))
        # Negative values mean the covariate lowers the squared error.
        SSE=SSE_all.loc[curcells].subtract(SSE_notcur)

        SSE_transform=SSE.div(GENE_var+0.5)[vargenes].sum(axis=1)
        # exp() may overflow to inf for very large sums; the weight then
        # saturates at 0.0, which is the intended limit, so mute the warning.
        with np.errstate(over="ignore"):
            logitify=np.divide(1.0,1.0+np.exp(SSE_transform))

        # The weights are floats: make the target column float explicitly
        # instead of relying on pandas to upcast an integer column on write.
        if not pd.api.types.is_float_dtype(X_adjust[curcov]):
            X_adjust[curcov]=X_adjust[curcov].astype(float)

        # One .loc call on the frame itself. The chained form
        # X_adjust[col].loc[rows] = ... can end up writing to a temporary.
        X_adjust.loc[curcells, curcov]=logitify

    return X_adjust
    

In [None]:
# Load the annotated data object used by every cell below. The path is relative,
# so it resolves against the working directory set by os.chdir(projectDir).
# NOTE(review): `sc` is not imported explicitly in this notebook; presumably it is
# scanpy provided by `from libraries import *` -- confirm.
adata = sc.read('outputs/anndata/adata-hash-features_singlets_SingleKO_06292020_PerGENE.h5ad')


In [None]:
# Regress the per-cell QC covariates (n_genes, mt_frac) out of the expression
# matrix, without an intercept, and keep what is left as the "QCResiduals" layer.
df = adata.obs[["n_genes", "mt_frac" ]]

# pandas needs a dense 2-D array here; densify when X exposes toarray()
# (e.g. a scipy sparse matrix). A dense X is used unchanged.
rnaMat = adata.X
if hasattr(rnaMat, "toarray"):
    rnaMat = rnaMat.toarray()
rnaMat = pd.DataFrame(rnaMat, index=adata.obs_names, columns=adata.var_names)

regr = linear_model.LinearRegression(fit_intercept = False)
regr.fit(df, rnaMat)

predicted = regr.predict(df)

residuals = rnaMat-predicted

# Store a plain ndarray copy rather than a DataFrame: the per-cluster cell
# re-labels the layer from obs/var itself, and an array does not depend on how
# the installed anndata version treats DataFrame-valued layers.
adata.layers["QCResiduals"] = residuals.to_numpy(copy=True)

In [None]:
# For each Leiden cluster label "0".."9": fit a linear model of the QC residuals
# on the feature-barcode covariates, write the coefficients and the covariate
# matrix, then write the covariate matrix re-weighted by bayes_cov_col.

# The covariate list lives in .uns and does not depend on the cluster, so it is
# looked up once instead of once per subset. The control entry is excluded.
covariates = adata.uns['feature_barcode_names_filtered_GENES']
covariates = covariates[covariates != "GENE_CONTROL_"]

for elem in range(0,10):
    print(elem)
    ad_sub=adata[adata.obs['leiden']==str(elem),:]

    # The label range is fixed; skip labels with no cells rather than letting
    # the regression fail on an empty matrix.
    if ad_sub.n_obs == 0:
        continue

    X = ad_sub.obs[covariates]

    # Residual expression for this cluster, labelled cells x genes.
    Y = pd.DataFrame(np.asarray(ad_sub.layers["QCResiduals"]),
                     index=ad_sub.obs.index, columns=ad_sub.var.index)

    lm = linear_model.LinearRegression()
    lm.fit(np.array(X),np.array(Y))

    # coef_ is genes x covariates. The row labels are passed as a plain array
    # so the index stays unnamed and the header of the written file is unchanged.
    B = pd.DataFrame(lm.coef_, index=Y.columns.values, columns=X.columns)
    B.to_csv(str(elem)+'_B_coefficients.txt',sep='\t',index=True, header=True)

    EMlist=list(X.columns.values)
    X.to_csv(str(elem)+'_X_EM.txt',sep='\t',index=True, header=True)
    X_adjust = bayes_cov_col(Y,X,EMlist,lm)
    X_adjust.to_csv(str(elem)+'_X_EM_adjust.txt',sep='\t',index=True, header=True)
    