In [1]:
import numpy as np
import pandas as pd
import itertools

In [2]:
# Load the training table. Later cells read its "cell_type" and "sm_name"
# columns and treat the columns from the sixth onward as the measurements.
da = pd.read_parquet('data/de_train.parquet') #  data_de_train

#### Create a 2-D matrix whose columns hold the three modes n, m, l (n: drugs; m: genes; l: cell types)

In [3]:
# Keep the first two columns and everything from the sixth column onward;
# the three columns in positions 2-4 are left out.
columns = da.columns
selected_columns = columns[[i for i in range(len(columns)) if not 2 <= i < 5]]
# selected_columns
# 2-D working matrix restricted to the selected columns
Z_2D = da[selected_columns]

#### Reshape the 2-D matrix Z_2D into a 3-D tensor

In [4]:
# Distinct cell types and compounds, in order of first appearance
cell_types = Z_2D["cell_type"].unique()
sm_names = Z_2D["sm_name"].unique()

# Full (cell_type, sm_name) grid: every cell type paired with every compound,
# with the cell type varying slowest
combinations = pd.DataFrame(
    list(itertools.product(cell_types, sm_names)),
    columns=["cell_type", "sm_name"],
)

# Outer-join the grid with Z_2D so pairs absent from Z_2D still get a row
# (their measurement columns come out as NaN)
result = combinations.merge(Z_2D, how='outer', on=['cell_type', 'sm_name'])

In [5]:
# Distinct cell-type labels in ascending order; shown as the cell output.
cell_type_order = sorted(set(result['cell_type']))
cell_type_order

['B cells',
 'Myeloid cells',
 'NK cells',
 'T cells CD4+',
 'T cells CD8+',
 'T regulatory cells']

In [6]:
# Distinct compound names in ascending order.
sm_name_order = sorted(set(result['sm_name']))
# sm_name_order 

In [7]:
# Order rows by cell type first and compound second, so the rows run over
# every compound of the first cell type, then the second, and so on.
result_sorted = result.sort_values(by=['cell_type', 'sm_name'])

# Move the two key columns into the index so `.values` holds only the
# measurement columns.
result_sorted = result_sorted.set_index(['cell_type', 'sm_name'])

# Each (cell_type, sm_name) pair must appear exactly once, otherwise the
# reshape below cannot line up with the sorted keys.
if len(result_sorted) != len(cell_types) * len(sm_names):
    raise ValueError(
        "Expected exactly one row per (cell_type, sm_name) pair, got "
        f"{len(result_sorted)} rows for {len(cell_types)} cell types "
        f"and {len(sm_names)} compounds"
    )

# Converting to a 3D numpy array of shape (compounds, measurements, cell types).
# The rows are laid out as (cell_type, sm_name), so the buffer must first be
# viewed as (cell types, compounds, measurements) -- which matches its memory
# order -- and only then have its axes permuted. Reshaping straight to
# (compounds, -1, cell types) yields the same shape but assigns values to the
# wrong (compound, measurement, cell type) cells.
# The '-1' lets numpy infer the number of measurement columns.
# ascontiguousarray makes an independent array, so later in-place edits of the
# tensor do not write through to `result_sorted`.
tensor_3D = np.ascontiguousarray(
    result_sorted.values
    .reshape(len(cell_types), len(sm_names), -1)
    .transpose(1, 2, 0)
)


In [8]:
def matricize(R, mode):
    """Unfold the array `R` into a matrix along mode 1 or mode 2.

    Mode 1 keeps axis 0 as the rows; mode 2 moves axis 1 to the front and
    uses it as the rows. All remaining axes are flattened into the columns.

    Raises:
        ValueError: if `mode` is neither 1 nor 2.
    """
    if not (mode == 1 or mode == 2):
        raise ValueError("Mode must be either 1 or 2")

    row_axis = 0 if mode == 1 else 1
    # For mode 1 the array is already in the right axis order.
    fronted = R if row_axis == 0 else np.moveaxis(R, row_axis, 0)
    return fronted.reshape(R.shape[row_axis], -1)

In [9]:
def nan_positions(matrix):
    """Return the indices of every NaN entry in `matrix`.

    The result comes from ``np.argwhere``: one row per NaN entry, one column
    per dimension of `matrix`.
    """
    is_missing = np.isnan(matrix)
    return np.argwhere(is_missing)

# Record where the tensor is missing data. This has to run before the NaNs are
# overwritten further down (the zero-fill edits `tensor_3D` in place), because
# the completion step later reads these positions from this global.
nan_positions_tensor = nan_positions(tensor_3D)

In [10]:
# def fill_nan_randomly(matrix):
#     # Collect all non-NaN values
#     non_nan_values = matrix[~np.isnan(matrix)]
    
# #     # Find the positions of all NaNs
# #     nan_positions = np.argwhere(np.isnan(matrix))
    
#     # For each NaN position, fill in a randomly chosen non-NaN value
#     for pos in nan_positions_tensor:
#         matrix[tuple(pos)] = np.random.choice(non_nan_values)
    
#     return matrix

# warm_start = fill_nan_randomly(tensor_3D)

In [11]:
def fill_zero_randomly(matrix):
    """Replace every NaN in `matrix` with 0, in place, and return `matrix`.

    Despite the name, nothing random happens: each missing entry is set to
    the constant 0. The array passed in is modified and the same object is
    returned.

    Args:
        matrix: NumPy float array of any shape.

    Returns:
        The same array object, with its NaN entries set to 0.
    """
    # One boolean-mask assignment instead of a Python loop over every NaN.
    matrix[np.isnan(matrix)] = 0
    return matrix
# Zero-fill the missing entries to get a starting point for the completion.
# The fill happens in place and returns its argument, so `warm_start` and
# `tensor_3D` refer to the same array from here on.
warm_start = fill_zero_randomly(tensor_3D)
warm_start.shape

(146, 18211, 6)

#### renew slice alg

In [12]:
def truncate_svd(matrix, r):
    """Return the leading `r` left singular vectors of `matrix`.

    Args:
        matrix: 2-D array to decompose.
        r: number of singular vectors to keep. If `r` exceeds the number of
            singular vectors available, all of them are returned.

    Returns:
        Array whose columns are the first `r` left singular vectors
        (NumPy orders them by decreasing singular value).
    """
    # Thin SVD; only the left factor is needed, so the singular values and
    # right singular vectors are discarded.
    U, _, _ = np.linalg.svd(matrix, full_matrices=False)
    return U[:, :r]

In [13]:
# def tensor_completion_als(Z, r, T):
#     n, m, l = Z.shape
#     U = np.random.rand(n, r)
#     V = np.random.rand(m, r)
#     S = np.array([np.eye(r) for _ in range(l)])
    
#     Z_hat = Z

#     for t in range(T):
#         # Fix V, update U
#         Z_1 = matricize(Z_hat, 1)
#         Z_2 = matricize(Z_hat, 2)
        
        
#         U_full, Sigma, Vt_full = np.linalg.svd(Z_1 , full_matrices=False)
#         U = U_full[:, :r]   
#         print("U")
#         print(U.shape)
#         # Fix U, update V
#         U_full, Sigma, Vt_full = np.linalg.svd(Z_2 , full_matrices=False)
#         V = U_full[:, :r]  
#         print("V")
#         print(V.shape)
        
#         print(Z.shape)
#         # Fix U and V, update S_k for each k
#         for k in range(l):
#             S[k] = U.T @ Z_hat[:, :, k] @ V
        
#         # Calculate Z_hat
        
#         for k in range(l):
#             Z_hat[:, :, k] = U @ S[k] @ V.T
    
#     return Z_hat

In [14]:
# def tensor_completion_als(Z, r, T):
#     n, m, l = Z.shape
#     U = np.random.rand(n, r)
#     V = np.random.rand(m, r)
#     S = np.array([np.eye(r) for _ in range(l)])
    
#     for t in range(T):
#         # Fix V, update U
#         for i in range(n):
#             VVT = V.T @ V
#             for k in range(l):
#                 U[i, :] = np.linalg.solve(VVT + np.eye(r), Z[i, :, k] @ V)
                
#         # Fix U, update V
#         for j in range(m):
#             UUT = U.T @ U
#             for k in range(l):
#                 V[j, :] = np.linalg.solve(UUT + np.eye(r), Z[:, j, k] @ U)
                
#         # Fix U and V, update S_k for each k
#         for k in range(l):
#             S[k] = U.T @ Z[:, :, k] @ V
        
#         # Calculate Z_hat
#         Z_hat = np.zeros((n, m, l))
#         for k in range(l):
#             Z_hat[:, :, k] = U @ S[k] @ V.
    
#     return Z_hat

In [15]:
def tensor_completion_als(Z, r, T, missing_positions=None):
    """Impute the missing entries of a 3-way tensor with shared row/column bases.

    Every slice ``Z[:, :, k]`` is approximated as ``U @ S_k @ V.T``. Each of
    the `T` iterations:

    1. takes ``U`` as the top-`r` left singular vectors of the mode-1
       unfolding of the current estimate, and ``V`` likewise from the mode-2
       unfolding;
    2. projects each slice onto those bases, ``S_k = U.T @ Z_k @ V``;
    3. overwrites only the missing entries of that slice with the values of
       ``U @ S_k @ V.T``, leaving all other entries untouched.

    Step 3 is carried out for every slice ``k``, not just the last one.

    Args:
        Z: array of shape (n, m, l) whose missing entries already hold a
            starting value (e.g. 0).
        r: number of singular vectors kept for ``U`` and ``V``.
        T: number of iterations.
        missing_positions: optional array of ``(i, j, k)`` index rows marking
            the entries to impute, as returned by ``np.argwhere``. When
            omitted, the notebook-level ``nan_positions_tensor`` is used.

    Returns:
        A new array of shape (n, m, l) with the missing entries imputed.
        `Z` itself is not modified.
    """
    n, m, l = Z.shape

    if missing_positions is None:
        # Fall back to the positions recorded before the NaNs were filled.
        missing_positions = nan_positions_tensor
    missing_positions = np.asarray(missing_positions, dtype=int).reshape(-1, 3)

    # Group the missing entries by slice once, so each iteration can update a
    # whole slice with a single fancy-indexed assignment.
    slice_entries = []
    for k in range(l):
        in_slice = missing_positions[:, 2] == k
        slice_entries.append(
            (missing_positions[in_slice, 0], missing_positions[in_slice, 1])
        )

    # Work on a copy so the caller's array is left as it was passed in.
    Z_hat = np.array(Z, dtype=float)

    for t in range(T):
        print(t)

        # Update U and V from the current estimate.
        U = truncate_svd(matricize(Z_hat, 1), r)
        V = truncate_svd(matricize(Z_hat, 2), r)

        for k, (rows, cols) in enumerate(slice_entries):
            if rows.size == 0:
                continue  # nothing to impute in this slice
            # With U and V fixed, compute the core matrix of slice k and its
            # low-rank reconstruction.
            S_k = U.T @ Z_hat[:, :, k] @ V
            slice_hat = U @ S_k @ V.T
            # Refresh only the missing entries of this slice.
            Z_hat[rows, cols, k] = slice_hat[rows, cols]

    return Z_hat



In [16]:
# Run the completion on the zero-filled tensor with r=20 and T=50
# (the function prints one iteration counter per pass).
completed_Z = tensor_completion_als(warm_start, 20, 50)
# print(completed_Z)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [17]:
# Slice 0 along the last axis, intended as the 'B cells' slice (index 0 in
# `cell_type_order`) -- assumes the tensor's last axis follows that order.
B_cell = completed_Z[:,:,0]

In [18]:
# Slice 1 along the last axis, intended as the 'Myeloid cells' slice (index 1
# in `cell_type_order`) -- assumes the tensor's last axis follows that order.
Myeloid_cell = completed_Z[:,:,1]

In [19]:
# Set the row labels.
# The tensor was built from rows sorted by `sm_name` (see `result_sorted`),
# so the matching labels are the sorted names in `sm_name_order`, not the
# first-appearance order held in `sm_names`.
df_b_cell = pd.DataFrame(B_cell, index=sm_name_order)
df_myeloid_cell = pd.DataFrame(Myeloid_cell, index=sm_name_order)

In [20]:
# Display the sorted cell-type labels again, for reference when labelling the
# slices below.
cell_type_order
# sm_name_order

['B cells',
 'Myeloid cells',
 'NK cells',
 'T cells CD4+',
 'T cells CD8+',
 'T regulatory cells']

In [21]:
# Tag each per-cell-type frame with its label (added in place as a new column)
df_myeloid_cell['cell_type'] = 'Myeloid cells'
df_b_cell['cell_type'] = 'B cells'

# Stack the two frames vertically with pd.concat, copy the row labels into an
# 'sm_name' column, then replace the index with a fresh 0..N-1 range.
result_df = pd.concat([df_b_cell, df_myeloid_cell], axis=0)
result_df = result_df.assign(sm_name=result_df.index).reset_index(drop=True)


#### Read the ID map file and build the submission

In [22]:
submission = pd.read_csv('data/sample_submission.csv') #  submission template
id_pair = pd.read_csv('data/id_map.csv') #  compound and cell type associated with each id
# Key columns used to match id-map rows against the prediction table
id_col = ["cell_type", "sm_name"]

# Key columns of the id map and of the prediction table, respectively
new_id = id_pair.loc[:,id_col]
tensor_id = result_df.loc[:,id_col]
# new_id = pd.get_dummies(new_id, columns=new_id.columns)

In [23]:
# Copy the predicted values into the submission, one id-map row at a time.
# Every submission column after the first receives a value, so the number of
# value columns is taken from the submission itself.
n_value_cols = submission.shape[1] - 1
prediction_values = result_df.iloc[:, :n_value_cols].to_numpy()

# First row of `result_df` for each (cell_type, sm_name) pair; setdefault
# keeps the earliest row when a pair occurs more than once.
first_row_for_pair = {}
for j, pair in enumerate(zip(tensor_id["cell_type"], tensor_id["sm_name"])):
    first_row_for_pair.setdefault(pair, j)

# Rows of the id map without a matching pair keep the values they already
# have in `submission`.
for i, pair in enumerate(zip(new_id["cell_type"], new_id["sm_name"])):
    j = first_row_for_pair.get(pair)
    if j is not None:
        # Assign a plain NumPy row so the values are written by position.
        submission.iloc[i, 1:] = prediction_values[j]


In [24]:
# Write the filled-in submission table to disk without the row index.
submission.to_csv('tensor_3.csv', index=False)

In [25]:
# Sanity check: (rows, columns) of the submission table.
submission.shape

(255, 18212)

In [4]:
from Bio import Entrez

def search_gene(gene_name):
    """Search the Entrez "gene" database and return the matching ID list.

    Side effect: sets the module-level ``Entrez.email`` before the request.

    Args:
        gene_name: search term passed to ``Entrez.esearch`` as-is (no field
            or organism qualifier is added).

    Returns:
        The ``"IdList"`` entry of the parsed search result.
    """
    Entrez.email = "bchen342@wisc.edu"  
    handle = Entrez.esearch(db="gene", term=gene_name)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return record["IdList"]

def fetch_gene_details(gene_id):
    """Fetch the Entrez "gene" record(s) for `gene_id` and return them parsed.

    This function does not set ``Entrez.email`` itself; in this notebook it
    is set by ``search_gene``, which is called first.

    Args:
        gene_id: ID passed to ``Entrez.efetch``.

    Returns:
        The object produced by ``Entrez.read`` on the XML response.
    """
    handle = Entrez.efetch(db="gene", id=gene_id, retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    return records


# Example lookup: search the gene database for "BRCA1" and fetch the record of
# the first hit.
# NOTE(review): the term is passed unqualified, and in the recorded output the
# first ID returned belongs to a 'Neoarius graeffei' record -- consider
# restricting the query (e.g. by organism) before relying on gene_ids[0].
gene_name = "BRCA1"  
gene_ids = search_gene(gene_name)
print(f"Found Gene IDs: {gene_ids}")

if gene_ids:
    gene_details = fetch_gene_details(gene_ids[0])
    print(gene_details)


Found Gene IDs: ['132901555', '132899599', '132895711', '132895683', '132895399', '132894635', '132891449', '132874090', '132873836', '132868918', '132866052', '7157', '1956', '7422', '3569', '7040', '22059', '4524', '3091', '2064']
[{'Entrezgene_track-info': {'Gene-track': {'Gene-track_geneid': '132901555', 'Gene-track_status': StringElement('0', attributes={'value': 'live'}), 'Gene-track_create-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2023', 'Date-std_month': '11', 'Date-std_day': '13'}}}}, 'Gene-track_update-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2023', 'Date-std_month': '11', 'Date-std_day': '14'}}}}}}, 'Entrezgene_type': StringElement('6', attributes={'value': 'protein-coding'}), 'Entrezgene_source': {'BioSource': {'BioSource_genome': StringElement('1', attributes={'value': 'genomic'}), 'BioSource_origin': StringElement('1', attributes={'value': 'natural'}), 'BioSource_org': {'Org-ref': {'Org-ref_taxname': 'Neoarius graeffei', 'Org-ref_com