In [7]:
pip install pandas

Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-macosx_10_9_x86_64.whl (10.2 MB)
     |████████████████████████████████| 10.2 MB 4.6 MB/s            
Collecting pytz>=2017.2
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.5 pytz-2023.3.post1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as scipy
#import glmnet
import glmnet_python
from glmnet_python import glmnet
from glmnetPrint import glmnetPrint; from glmnetCoef import glmnetCoef
from scipy.sparse import csr_matrix
#from glmnet import ElasticNet

In [3]:
#a function that takes two matrices as input and run the glmnet lasso regression

#target_gene: Nx1 matrix of target gene expression information
#expression_data: NxM matrix of cell x gene expression data, corresponding to the index of gene_list
#lambda_value: lambda value (penalty), can be adjusted based on data and initial fit results
#gene_list: Nx1 matrix of a list of gene names
#A:Mx1 matrix of model coefficients (what we're looking for), fitted values by the regression
def glmnet_lasso(target_gene, expression_data, lambda_value, gene_list=None):
    
    #check if parameters are of suitable size matrices
    if len(gene_list) != len(expression_data):
        raise ValueError("Matrix must have the same number of rows.")
    #elif len(gene_list[0]!=1):
        #raise ValueError("Gene matrix contains more than one column.")
    
    #create variables for regression
    # X: NxM gene expressions
    X = expression_data
    # y: Nx1 sparse matrix of the target cells
    y = target_gene
    lambdas = np.logspace(np.log10(1e-4), np.log10(35), 100)

    #regression
    #m = ElasticNet()
    #m = m.fit(X, y)
    fit = glmnet(x = X, y = y, family='gaussian', nlambda=100, alpha=0.2)
    ##glmnetPrint(fit)
    coef_pred = glmnetCoef(fit, s=scipy.float64([lambda_value]), exact=False)

    #decide which model did a better job of fitting and predicting matrix A
    #y_pred = y_pred1 #or y_pred2, depending on which is more optimal


    return coef_pred

In [4]:
import random

In [5]:
#testing data
file_path1 = "/Users/emilyxie/Downloads/data-and-scripts/data/gene_list.txt"
gene_list = pd.read_csv(file_path1, header=None, delim_whitespace=True)
print(gene_list)

file_path2 = "/Users/emilyxie/Downloads/data-and-scripts/data/X_SCODE_data.csv"
expression_data = pd.read_csv(file_path2, header=None)
print(expression_data)
expression_data = expression_data.to_numpy()
X = expression_data[:,0:355]
print(np.shape(X))

target_cells = expression_data[:,355:356]
print(target_cells)
print(np.shape(target_cells))


         0
0     SOX2
1    ESRRB
2     UTF1
3    EPAS1
4    FOXQ1
..     ...
95  POU4F2
96    TFEB
97   DDIT3
98   SALL4
99  ZFP462

[100 rows x 1 columns]
         0         1         2         3         4         5         6    \
0   1.207739  1.131859  1.273929  1.228341  1.207457  1.347742  1.294244   
1   1.309502  1.382944  1.167757  1.278879  1.362136  1.461041  1.200626   
2   0.887411  1.067413  1.554748  1.571531  1.322056  1.638627  1.428033   
3   0.078518  0.374048  0.490467  0.673594  0.558087  0.528945  0.477813   
4   0.000000  0.000000  0.000000  0.000000  0.047418  0.000000  0.000000   
..       ...       ...       ...       ...       ...       ...       ...   
95  0.017869  0.333707  0.425832  0.163460  0.487849  0.230638  0.362528   
96  0.725058  0.476488  0.338249  0.595794  0.781623  0.774960  0.849005   
97  0.077115  0.236291  0.000000  0.110378  0.291968  0.360105  0.394599   
98  1.013059  0.889990  1.133261  0.936444  1.035104  1.158866  1.017105   
99  0.38

In [6]:
coef = glmnet_lasso(target_cells, expression_data, 0.05, gene_list=gene_list, )
print(coef.shape)

(357, 1)
