In [5]:
import pandas as pd

# df = pd.read_csv("GSE33000_raw_data.txt", sep="\t", comment='!', low_memory=False)
# pd.set_option('display.max_columns', None)  # Show all columns if needed

# print(df['reporterID'])        # Show first 5 rows
# print(df.shape)         # See how many rows × columns
# print(df.columns[:30])

import pandas as pd
import gzip, io, re

def _is_gzip(path):
    with open(path, 'rb') as f:
        return f.read(2) == b'\x1f\x8b'

def _find_header_index(path):
    # caută primul rând de header care începe cu "ID\t"
    if _is_gzip(path):
        opener = lambda p: io.TextIOWrapper(gzip.open(p, 'rb'), encoding='utf-8', errors='replace')
    else:
        opener = lambda p: open(p, 'r', encoding='utf-8', errors='replace')
    with opener(path) as f:
        for i, line in enumerate(f):
            if line.lstrip().startswith('ID\t'):
                return i
    return None

def read_geo_annot(path):
    header_idx = _find_header_index(path)
    if header_idx is None:
        raise RuntimeError("Nu am găsit linia de header (cea care începe cu 'ID\\t'). Verifică fișierul .annot.")

    compression = 'gzip' if _is_gzip(path) else 'infer'
    # comentariile GEO încep de obicei cu '!' — le ignorăm
    df = pd.read_csv(
        path,
        sep="\t",
        header=0,
        skiprows=header_idx,   # prima linie după skip devine header
        dtype=str,
        engine="python",
        comment="!",
        compression=compression
        # poți adăuga on_bad_lines="skip" dacă mai rămân rânduri atipice
    )

    # identifică coloanele ID și SYMBOL
    id_candidates  = ["ID"]
    sym_candidates = ["Gene symbol"]

    id_col  = next((c for c in id_candidates  if c in df.columns), None)
    sym_col = next((c for c in sym_candidates if c in df.columns), None)
    if id_col is None or sym_col is None:
        raise ValueError(f"Nu am găsit coloanele pentru ID/SYMBOL. Coloane disponibile: {list(df.columns)}")

    def clean_symbol(x):
        if pd.isna(x) or str(x).strip()=="":
            return None
        return re.split(r"\s*///\s*|;|,", str(x))[0].strip()

    df["_SYMBOL_CLEAN"] = df[sym_col].map(clean_symbol)

    map_id_to_symbol = (
        df.dropna(subset=[id_col, "_SYMBOL_CLEAN"])
          .drop_duplicates(subset=[id_col])
          .set_index(id_col)["_SYMBOL_CLEAN"]
          .to_dict()
    )
    return df, map_id_to_symbol

# utilizare:
path = "GPL4372.annot"  # pune calea ta reală
df_annot, map_id2sym = read_geo_annot(path)

# print(map_id2sym)

df_id_symbol = pd.DataFrame(
    list(map_id2sym.items()), 
    columns=['gene_id', 'symbol']
)

print(df_id_symbol.head())
df_id_symbol.to_csv("human_genes.csv", index=False)


# exemplu aplicare pe un DataFrame de expresii care are coloana "ID_REF"
# expr["SYMBOL"] = expr["ID_REF"].map(map_id2sym)

       gene_id     symbol
0  10025930335        XPA
1  10025913794  LOC439911
2  10023807248    SLC35A3
3  10023809851    KATNAL1
4  10025911312     LRRTM4


In [2]:
import pandas as pd
from pathlib import Path

def read_soft_sample_table(path):
    """
    Citește un fișier SOFT (GSM) și returnează DataFrame-ul dintre
    !sample_table_begin / !sample_table_end.
    """
    path = Path(path)
    lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    in_table = False
    table_lines = []
    for line in lines:
        if line.startswith("!sample_table_begin"):
            in_table = True
            continue
        if line.startswith("!sample_table_end"):
            break
        if in_table:
            table_lines.append(line)

    if not table_lines:
        raise RuntimeError("Nu am găsit secțiunea de tabel în fișier (verifică delimitatorii).")

    # Prima linie este headerul (ex: ID_REF\tVALUE)
    from io import StringIO
    df = pd.read_csv(StringIO("\n".join(table_lines)), sep="\t", dtype=str)
    return df

# Exemplu de utilizare:
df_mouse = read_soft_sample_table("GSM1570255.txt")
print(df_mouse.head())
print(df_mouse.shape)

          ID_REF    VALUE
0  0610005K03RIK  6.62031
1  0610006F02RIK  6.81477
2  0610006I08RIK  11.0488
3  0610006K04RIK  8.93425
4  0610007C21RIK  11.8489
(12558, 2)


In [17]:
# pip install mygene
from mygene import MyGeneInfo
import pandas as pd
import time

mg = MyGeneInfo()

# Assuming df_mouse already exists and has column ID_REF
symbols = df_mouse["ID_REF"].dropna().unique().tolist()

def batch(lst, size):
    for i in range(0, len(lst), size):
        yield lst[i:i+size]

all_results = []
for chunk in batch(symbols, 1000):
    r = mg.querymany(
        chunk,
        scopes="symbol,alias",
        fields="symbol,entrezgene,ensembl.gene",
        species="mouse",
        as_dataframe=False
    )
    all_results.extend(r)
    time.sleep(0.2)  # polite pause

# Build a dict: query -> chosen record
chosen = {}
for item in all_results:
    q = item.get("query")
    if q is None:
        continue
    if item.get("notfound"):
        # Mark explicitly as not found (unless already a better hit exists)
        if q not in chosen:
            chosen[q] = None
        continue
    # If we already have a chosen hit, decide if this one is "better"
    prev = chosen.get(q)
    if prev is None:
        # Replace a None (notfound) with a valid hit
        chosen[q] = item
        continue
    if prev:  # both are valid hits
        # Prefer one with an entrezgene
        prev_has_entrez = prev.get("entrezgene") is not None
        curr_has_entrez = item.get("entrezgene") is not None
        if curr_has_entrez and not prev_has_entrez:
            chosen[q] = item
        # (If both or neither have entrez, keep the first)
    else:
        chosen[q] = item

# Build final mapping list
mapping_rows = []
for q in symbols:
    rec = chosen.get(q)
    if rec is None:
        mapping_rows.append({"ID_REF": q, "SYMBOL_FINAL": "NONE"})
    else:
        official_symbol = rec.get("symbol")
        # If symbol somehow missing though record exists, still mark NONE
        mapping_rows.append({
            "ID_REF": q,
            "SYMBOL_FINAL": official_symbol if official_symbol else "NONE"
        })

df_final = pd.DataFrame(mapping_rows)

# (Optional) Preserve original order as in df_mouse:
df_final = df_mouse[["ID_REF"]].merge(df_final, on="ID_REF", how="left")

# If any ID_REF was not in symbols due to NaN filtering, ensure NONE
df_final["SYMBOL_FINAL"] = df_final["SYMBOL_FINAL"].fillna("NONE")
df_final.loc[df_final["SYMBOL_FINAL"] == df_final["ID_REF"], "SYMBOL_FINAL"] = "NONE"

print(df_final.head())

# Save only the two columns
df_final.to_csv("GSM1570255_id_symbol.tsv", sep="\t", index=False)
print("Saved: GSM1570255_id_symbol.tsv")

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
4 input query terms found dup hits:	[('1700001C14RIK', 2), ('1700067P10RIK', 2), ('2810002G02RIK', 2), ('2810408M09RIK', 2)]
18 input query terms found no hit:	['0610008A10RIK', '0610025P10RIK', '1110020G09RIK', '1110034A24RIK', '1110064P04RIK', '1300007L22RIK
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
21 input query terms found dup hits:	[('4921506I22RIK', 2), ('4930555G01RIK', 2), ('4933428G20RIK', 2), ('6430411K18RIK', 2), ('6430706D2
46 input query terms found no hit:	['3110048E14RIK', '4632417D23', '4632417K02', '4921511C16', '4921525H12', '4932441K18', '5330431N19R
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
42 input 

          ID_REF SYMBOL_FINAL
0  0610005K03RIK        Lypd2
1  0610006F02RIK        Tmt1b
2  0610006I08RIK      Tmem223
3  0610006K04RIK       Nudt22
4  0610007C21RIK       Atraid
Saved: GSM1570255_id_symbol.tsv


In [7]:
import requests
import pandas as pd
from io import StringIO

# Example df_mouse
# df_mouse = pd.DataFrame({'ID_REF': ['5330401P04RIK', 'AnotherID', ...]})

results = []

print(len(df_mouse))
i = 0

for gene_id in df_mouse['ID_REF']:
    # Construct the URL dynamically for each gene
    i += 1
    print(i)
    url = f"https://www.mousemine.org/mousemine/service/query/results?query=%3Cquery+name%3D%22%22+model%3D%22genomic%22+view%3D%22Gene.primaryIdentifier+Gene.symbol+Gene.organism.name+Gene.homologues.homologue.primaryIdentifier+Gene.homologues.homologue.symbol+Gene.homologues.homologue.organism.name+Gene.homologues.type+Gene.homologues.dataSets.name+Gene.name+Gene.homologues.gene.organism.name+Gene.homologues.gene.primaryIdentifier+Gene.homologues.gene.symbol%22+longDescription%3D%22Returns+homologs+of+specified+genes.%22+sortOrder%3D%22Gene.primaryIdentifier+asc%22+constraintLogic%3D%22A+and+B+and+D%22%3E%3Cconstraint+path%3D%22Gene.homologues.type%22+code%3D%22B%22+op%3D%22NONE+OF%22%3E%3Cvalue%3Ehorizontal+gene+transfer%3C%2Fvalue%3E%3Cvalue%3Eleast+diverged+horizontal+gene+transfer%3C%2Fvalue%3E%3C%2Fconstraint%3E%3Cconstraint+path%3D%22Gene%22+code%3D%22A%22+op%3D%22LOOKUP%22+value%3D%22{gene_id}%22+extraValue%3D%22%22%2F%3E%3Cconstraint+path%3D%22Gene.homologues.homologue.organism.name%22+code%3D%22D%22+op%3D%22%3D%22+value%3D%22Homo+sapiens%22%2F%3E%3C%2Fquery%3E&format=tab&size=1"
    
    resp = requests.get(url)
    
    if resp.status_code == 200 and resp.text.strip():
        data = StringIO(resp.text)
        df = pd.read_csv(data, sep="\t", header=None)
        # Keep only gene_id, symbol (df[1]), homolog_symbol (df[4])
        for _, row in df.iterrows():
            results.append({
                'gene_id': gene_id,
                'symbol': row[1],
                'homolog_symbol': row[4]
            })
    else:
        # If no response, put None
        results.append({
            'gene_id': gene_id,
            'symbol': None,
            'homolog_symbol': None
        })
    print(results[-1])

# Combine into a final DataFrame
final_df = pd.DataFrame(results)
print(final_df)


12558
1
{'gene_id': '0610005K03RIK', 'symbol': 'Lypd2', 'homolog_symbol': 'LYPD2'}
2
{'gene_id': '0610006F02RIK', 'symbol': 'Tmt1b', 'homolog_symbol': 'TMT1B'}
3
{'gene_id': '0610006I08RIK', 'symbol': 'Tmem223', 'homolog_symbol': 'TMEM223'}
4
{'gene_id': '0610006K04RIK', 'symbol': 'Nudt22', 'homolog_symbol': 'NUDT22'}
5
{'gene_id': '0610007C21RIK', 'symbol': 'Atraid', 'homolog_symbol': 'ATRAID'}
6
{'gene_id': '0610007H07RIK', 'symbol': 'Tmbim4', 'homolog_symbol': 'TMBIM4'}
7
{'gene_id': '0610007P06RIK', 'symbol': 'Hikeshi', 'homolog_symbol': 'HIKESHI'}
8
{'gene_id': '0610007P08RIK', 'symbol': 'Ercc6l2', 'homolog_symbol': 'ERCC6L2'}
9
{'gene_id': '0610007P14RIK', 'symbol': 'Erg28', 'homolog_symbol': 'ERG28'}
10
{'gene_id': '0610007P22RIK', 'symbol': 'Tsr3', 'homolog_symbol': 'TSR3'}
11
{'gene_id': '0610008A10RIK', 'symbol': 'Aph1c', 'homolog_symbol': 'APH1B'}
12
{'gene_id': '0610008C08RIK', 'symbol': 'Apoo', 'homolog_symbol': 'APOO'}
13
{'gene_id': '0610009B22RIK', 'symbol': 'Trappc2b',

In [8]:
final_df.to_csv("mouse_human_homologs.csv", index=False)

In [7]:
import pandas as pd

# Load csvs
df_human = pd.read_csv("human_genes.csv")  # columns: gene_id, symbol
df_mouse = pd.read_csv("mouse_genes_and_homologs.csv")  # columns: gene_id, symbol, homolog_symbol

# Merge where symbol from human matches homolog_symbol from mouse
df_merge = pd.merge(
    df_mouse,          # left dataframe
    df_human,          # right dataframe
    left_on="homolog_symbol",  # from mouse
    right_on="symbol",         # from human
    suffixes=("_mouse", "_human")
)

# Optional: reorder columns, for example:
# ['gene_id_mouse', 'symbol_mouse', 'homolog_symbol', 'gene_id_human', 'symbol_human']
df_merge = df_merge[
    ['gene_id_mouse', 'symbol_mouse', 'homolog_symbol', 'gene_id_human', 'symbol_human']
]

print(df_merge)
df_merge.to_csv("mouse_human_gene_mapping.csv", index=False)

       gene_id_mouse symbol_mouse homolog_symbol  gene_id_human symbol_human
0      0610005K03RIK        Lypd2          LYPD2    10031920194        LYPD2
1      0610006K04RIK       Nudt22         NUDT22    10025909135       NUDT22
2      0610007H07RIK       Tmbim4         TMBIM4    10025907946       TMBIM4
3      0610008A10RIK        Aph1c          APH1B    10023836439        APH1B
4      0610008C08RIK         Apoo           APOO    10023848086         APOO
...              ...          ...            ...            ...          ...
12439           ZW10         Zw10           ZW10    10023806899         ZW10
12440           ZXDA         ZXDA           ZXDB    10023842025         ZXDB
12441         ZYG11B       Zyg11b         ZYG11B    10023830425       ZYG11B
12442            ZYX          Zyx            ZYX    10023815205          ZYX
12443           ZZZ3         Zzz3           ZZZ3    10025912610         ZZZ3

[12444 rows x 5 columns]


In [3]:
import pandas as pd

csv1 = pd.read_csv('mouse_human_gene_mapping.csv')

# Remove duplicate rows based on gene_id_mouse, keeping the first occurrence
csv1_nodups = csv1.drop_duplicates(subset=['gene_id_mouse'], keep='first')

# Optional: Save to file
csv1_nodups.to_csv('mouse_human_gene_mapping.csv', index=False)

# Show result
print(csv1_nodups)
print(f"Number of unique gene_id_mouse: {len(csv1_nodups)}")

       gene_id_mouse symbol_mouse homolog_symbol  gene_id_human symbol_human
0      0610005K03RIK        Lypd2          LYPD2    10031920194        LYPD2
1      0610006K04RIK       Nudt22         NUDT22    10025909135       NUDT22
2      0610007H07RIK       Tmbim4         TMBIM4    10025907946       TMBIM4
3      0610008A10RIK        Aph1c          APH1B    10023836439        APH1B
4      0610008C08RIK         Apoo           APOO    10023848086         APOO
...              ...          ...            ...            ...          ...
12439           ZW10         Zw10           ZW10    10023806899         ZW10
12440           ZXDA         ZXDA           ZXDB    10023842025         ZXDB
12441         ZYG11B       Zyg11b         ZYG11B    10023830425       ZYG11B
12442            ZYX          Zyx            ZYX    10023815205          ZYX
12443           ZZZ3         Zzz3           ZZZ3    10025912610         ZZZ3

[9635 rows x 5 columns]
Number of unique gene_id_mouse: 9635
