In [9]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
import matplotlib.pyplot as plt
import scipy as scipy
from sklearn.linear_model import LassoCV

In [12]:
#gene_list: Nx1 matrix of a list of gene names
#target gene: Nx1 matrix of target gene expression information
#expression_data: NxM matrix of cell x gene expression data, corresponding to index of gene_list
#A: Mx1 matrix of model coefficients (what we're looking for), fitted values by the regression
def lasso_reg(gene_list, target_gene, expression_data):
    
    #create variables
    
    X = expression_data
    y = target_gene #Nx1 of target gene's expression
    
    #regression
    #model = linear_model.Lasso()

    #search for best lambda value for data
    cv = model_selection.RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #grid=dict()
    #grid['alpha'] = np.arange(0, 1, 0.01)  #parameter lambda is named alpha in this model
    #search = model_selection.GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    #search_results = search.fit(X, y)
    model = LassoCV(alphas=np.arange(0,1,0.01), cv=cv, n_jobs=-1)
    model.fit(X,y)

    #print the best lambda (alpha) value
    print('alpha: %f' % model.alpha_)
    
    #print the grid of alphas used for fitting
    print('alphas used for fitting: ', model.alphas_)

    #get the coefficients for the fitted estimator
    coef_pred = model.coef_

    return coef_pred

In [4]:
import random

In [5]:
#create testing data
file_path1 = "/Users/emilyxie/Downloads/data-and-scripts/data/gene_list.txt"
gene_list = pd.read_csv(file_path1, header=None, delim_whitespace=True)
print(gene_list)

file_path2 = "/Users/emilyxie/Downloads/data-and-scripts/data/X_SCODE_data.csv"
expression_data = pd.read_csv(file_path2, header=None)
print(expression_data)
expression_data = expression_data.to_numpy()
X = expression_data[:,0:355]
print(np.shape(X))

target_cells = expression_data[:,355:356]
print(target_cells)
print(np.shape(target_cells))

         0
0     SOX2
1    ESRRB
2     UTF1
3    EPAS1
4    FOXQ1
..     ...
95  POU4F2
96    TFEB
97   DDIT3
98   SALL4
99  ZFP462

[100 rows x 1 columns]
         0         1         2         3         4         5         6    \
0   1.207739  1.131859  1.273929  1.228341  1.207457  1.347742  1.294244   
1   1.309502  1.382944  1.167757  1.278879  1.362136  1.461041  1.200626   
2   0.887411  1.067413  1.554748  1.571531  1.322056  1.638627  1.428033   
3   0.078518  0.374048  0.490467  0.673594  0.558087  0.528945  0.477813   
4   0.000000  0.000000  0.000000  0.000000  0.047418  0.000000  0.000000   
..       ...       ...       ...       ...       ...       ...       ...   
95  0.017869  0.333707  0.425832  0.163460  0.487849  0.230638  0.362528   
96  0.725058  0.476488  0.338249  0.595794  0.781623  0.774960  0.849005   
97  0.077115  0.236291  0.000000  0.110378  0.291968  0.360105  0.394599   
98  1.013059  0.889990  1.133261  0.936444  1.035104  1.158866  1.017105   
99  0.38

In [13]:
lasso_reg(gene_list, target_cells, X)

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha: 0.010000
alphas used for fitting:  [0.99 0.98 0.97 0.96 0.95 0.94 0.93 0.92 0.91 0.9  0.89 0.88 0.87 0.86
 0.85 0.84 0.83 0.82 0.81 0.8  0.79 0.78 0.77 0.76 0.75 0.74 0.73 0.72
 0.71 0.7  0.69 0.68 0.67 0.66 0.65 0.64 0.63 0.62 0.61 0.6  0.59 0.58
 0.57 0.56 0.55 0.54 0.53 0.52 0.51 0.5  0.49 0.48 0.47 0.46 0.45 0.44
 0.43 0.42 0.41 0.4  0.39 0.38 0.37 0.36 0.35 0.34 0.33 0.32 0.31 0.3
 0.29 0.28 0.27 0.26 0.25 0.24 0.23 0.22 0.21 0.2  0.19 0.18 0.17 0.16
 0.15 0.14 0.13 0.12 0.11 0.1  0.09 0.08 0.07 0.06 0.05 0.04 0.03 0.02
 0.01 0.  ]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


array([-0.        , -0.        , -0.        , -0.        , -0.        ,
        0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.  