<a href="https://colab.research.google.com/github/ergul13/mr_akgul/blob/main/BreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import os
import shutil
import pandas as pd
import cv2
import numpy as np
import glob
import pydicom
from google.colab import drive, files

# Ayarlar
base_dir = '/content/tubitak_data'
meta_dir = f'{base_dir}/meta'
images_dir = f'{base_dir}/images'

# Klasörleri kontrol et, yoksa oluştur (Silme yapmiyoruz ki metadata gitmesin)
os.makedirs(meta_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

print("----------------------------------------------------------------")
print("ADIM 0: KAGGLE API AYARLAMASI")
print("----------------------------------------------------------------")

# Kaggle API dosyasi kontrolü
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("Kaggle API anahtari bulunamadi. Lutfen 'kaggle.json' dosyasini yukleyin.")
    uploaded = files.upload()

    for fn in uploaded.keys():
        if 'kaggle.json' in fn:
            os.system('mkdir -p ~/.kaggle')
            os.system(f'mv {fn} ~/.kaggle/')
            os.system('chmod 600 ~/.kaggle/kaggle.json')
            print("Kaggle API anahtari kuruldu.")
else:
    print("Kaggle API anahtari zaten mevcut.")

print("\n----------------------------------------------------------------")
print("ADIM 1: METADATA KONTROLU")
print("----------------------------------------------------------------")

# Onceki adimda yuklenen dosyalar duruyor mu?
rsna_csv_path = f'{meta_dir}/rsna_train.csv'
inbreast_xls_path = glob.glob(f'{meta_dir}/INbreast*')

if not os.path.exists(rsna_csv_path):
    print("UYARI: RSNA 'train.csv' dosyasi bulunamadi (silinmis olabilir).")
    print("Lutfen 'train.csv' dosyasini tekrar yukleyin.")
    uploaded = files.upload()
    for fn in uploaded.keys():
        shutil.move(fn, rsna_csv_path)

if not inbreast_xls_path:
    print("UYARI: INbreast excel dosyasi bulunamadi.")
    print("Lutfen 'INbreast.xls' dosyasini tekrar yukleyin.")
    uploaded = files.upload()
    for fn in uploaded.keys():
        dest = f'{meta_dir}/INbreast_meta' + os.path.splitext(fn)[1]
        shutil.move(fn, dest)

print("Metadata dosyalari hazir.")

print("\n----------------------------------------------------------------")
print("ADIM 2: GORUNTULERI INDIRME (KAGGLE)")
print("----------------------------------------------------------------")

# RSNA
print("RSNA Goruntuleri indiriliyor...")
os.system(f'kaggle datasets download -d theoviel/rsna-breast-cancer-512-pngs --force -p {base_dir}')

if os.path.exists(f'{base_dir}/rsna-breast-cancer-512-pngs.zip'):
    print("RSNA Zip indirildi, aciliyor...")
    os.system(f'unzip -q {base_dir}/rsna-breast-cancer-512-pngs.zip -d {base_dir}/rsna_temp')
else:
    raise FileNotFoundError("HATA: RSNA zip dosyasi hala inemedi. kaggle.json dosyanizin guncel oldugundan emin olun.")

# INbreast
print("INbreast Goruntuleri indiriliyor...")
os.system(f'kaggle datasets download -d mccork12/inbreast-dcm-images --force -p {base_dir}')

if os.path.exists(f'{base_dir}/inbreast-dcm-images.zip'):
    print("INbreast Zip indirildi, aciliyor...")
    os.system(f'unzip -q {base_dir}/inbreast-dcm-images.zip -d {base_dir}/inbreast_temp')
else:
    print("UYARI: INbreast goruntuleri inemedi. Sadece RSNA ile devam edilecek.")

print("\n----------------------------------------------------------------")
print("ADIM 3: VERI ISLEME VE BIRLESTIRME")
print("----------------------------------------------------------------")

final_data = []

# --- RSNA ISLEME ---
print("RSNA verileri isleniyor...")
rsna_df = pd.read_csv(rsna_csv_path)

def get_rsna_label(row):
    if row['cancer'] == 1: return 5
    elif row['cancer'] == 0 and (row['biopsy'] == 1 or row['difficult_negative_case'] == True): return 2
    else: return 1

rsna_df['BIRADS'] = rsna_df.apply(get_rsna_label, axis=1)

rsna_files = glob.glob(f'{base_dir}/rsna_temp/*.png')
rsna_map = {os.path.basename(f): f for f in rsna_files}

count_rsna = 0
for idx, row in rsna_df.iterrows():
    fname = f"{row['patient_id']}_{row['image_id']}.png"
    if fname in rsna_map:
        new_name = f"RSNA_{fname}"
        try:
            shutil.move(rsna_map[fname], f'{images_dir}/{new_name}')
            final_data.append([new_name, row['BIRADS'], 'RSNA'])
            count_rsna += 1
        except: pass

print(f"RSNA Tamamlandi: {count_rsna} goruntu.")

# --- INbreast ISLEME ---
print("INbreast verileri isleniyor...")
meta_files = glob.glob(f'{base_dir}/meta/INbreast_meta*')
if meta_files and os.path.exists(f'{base_dir}/inbreast_temp'):
    try:
        if meta_files[0].endswith('.csv'):
            in_df = pd.read_csv(meta_files[0], sep=';')
        else:
            in_df = pd.read_excel(meta_files[0])

        in_df.columns = [c.strip() for c in in_df.columns]
        f_col = next((c for c in in_df.columns if 'file' in c.lower()), None)
        b_col = next((c for c in in_df.columns if 'bi' in c.lower()), None)

        if f_col and b_col:
            def map_ib(val):
                s = str(val).lower().replace(' ','').replace('a','').replace('b','')
                try: sc = int(float(s))
                except: return -1
                if sc == 1: return 1
                if sc in [2,3]: return 2
                if sc == 4: return 4
                if sc >= 5: return 5
                return -1

            in_df['BIRADS'] = in_df[b_col].apply(map_ib)

            dcm_files = glob.glob(f'{base_dir}/inbreast_temp/**/*.dcm', recursive=True)
            dcm_map = {os.path.splitext(os.path.basename(f))[0]: f for f in dcm_files}

            count_ib = 0
            for _, row in in_df.iterrows():
                fid = str(row[f_col])
                if fid in dcm_map and row['BIRADS'] != -1:
                    try:
                        ds = pydicom.dcmread(dcm_map[fid])
                        img = ds.pixel_array
                        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
                        img = cv2.resize(img, (512, 512))

                        new_name = f"INbreast_{fid}.png"
                        cv2.imwrite(f'{images_dir}/{new_name}', img)
                        final_data.append([new_name, row['BIRADS'], 'INbreast'])
                        count_ib += 1
                    except: pass
            print(f"INbreast Tamamlandi: {count_ib} goruntu.")
    except Exception as e:
        print(f"HATA: INbreast isleme: {e}")

# --- KAYIT ---
print("\n----------------------------------------------------------------")
print("ADIM 4: KAYIT")
print("----------------------------------------------------------------")

final_df = pd.DataFrame(final_data, columns=['filename', 'BIRADS', 'source'])
save_path = '/content/drive/MyDrive/TUBITAK_TRAIN.csv'
final_df.to_csv(save_path, index=False)

print(f"CSV Kaydedildi: {save_path}")
print(f"Toplam Goruntu Sayisi: {len(final_df)}")
print(final_df.groupby(['source', 'BIRADS']).size())

----------------------------------------------------------------
ADIM 0: KAGGLE API AYARLAMASI
----------------------------------------------------------------
Kaggle API anahtari zaten mevcut.

----------------------------------------------------------------
ADIM 1: METADATA KONTROLU
----------------------------------------------------------------
Metadata dosyalari hazir.

----------------------------------------------------------------
ADIM 2: GORUNTULERI INDIRME (KAGGLE)
----------------------------------------------------------------
RSNA Goruntuleri indiriliyor...
RSNA Zip indirildi, aciliyor...
INbreast Goruntuleri indiriliyor...
UYARI: INbreast goruntuleri inemedi. Sadece RSNA ile devam edilecek.

----------------------------------------------------------------
ADIM 3: VERI ISLEME VE BIRLESTIRME
----------------------------------------------------------------
RSNA verileri isleniyor...
RSNA Tamamlandi: 54706 goruntu.
INbreast verileri isleniyor...

-----------------------------

In [3]:
!pip install pydicom
import pandas as pd
import os
import shutil
import cv2
import numpy as np
import pydicom
import glob
from google.colab import drive

# 1. Drive Baglantisi
drive.mount('/content/drive')

# Ayarlar
base_dir = '/content/tubitak_final'
images_dir = os.path.join(base_dir, 'images')
extract_path = os.path.join(base_dir, 'temp_inbreast')

# Klasor Temizligi
if os.path.exists(extract_path):
    shutil.rmtree(extract_path)
os.makedirs(images_dir, exist_ok=True)
os.makedirs(extract_path, exist_ok=True)

print("------------------------------------------------")
print("DRIVE UZERINDEN VERI ISLEME")
print("------------------------------------------------")

# 2. Dosyalari Drive'da Bulma (Isim degisikliklerine karsi esnek arama)
print("Drive taraniyor...")
drive_root = '/content/drive/MyDrive'
found_zip = None
found_meta = None

# Drive ana dizinini tara
for f in os.listdir(drive_root):
    f_lower = f.lower()
    if 'alldicoms' in f_lower and f_lower.endswith('.zip'):
        found_zip = os.path.join(drive_root, f)
    if 'inbreast' in f_lower and (f_lower.endswith('.xls') or f_lower.endswith('.csv') or f_lower.endswith('.xlsx')):
        found_meta = os.path.join(drive_root, f)

if found_zip and found_meta:
    print(f"Zip Dosyasi Bulundu: {found_zip}")
    print(f"Etiket Dosyasi Bulundu: {found_meta}")

    # 3. Zip Acma (Drive'dan Colab temp alanina)
    print("Zip dosyasi aciliyor (Bu islem 1-2 dakika surebilir)...")
    shutil.unpack_archive(found_zip, extract_path)

    # 4. Metadata Okuma
    print("Etiketler okunuyor...")
    try:
        if found_meta.endswith('.csv'):
            # Ayirici hatasi olmamasi icin once ; sonra , dener
            try:
                df = pd.read_csv(found_meta, sep=';')
                if len(df.columns) < 2: df = pd.read_csv(found_meta, sep=',')
            except:
                df = pd.read_csv(found_meta)
        else:
            df = pd.read_excel(found_meta)

        # Sutun temizligi
        df.columns = [c.strip() for c in df.columns]

        # Gerekli sutunlari bul
        file_col = next((c for c in df.columns if 'file' in c.lower()), None)
        birads_col = next((c for c in df.columns if 'bi' in c.lower()), None)

        if file_col and birads_col:
            # BI-RADS Haritalama (1, 2, 4, 5)
            def map_birads(val):
                s = str(val).lower().replace(' ','').replace('a','').replace('b','')
                try: score = int(float(s))
                except: return -1
                if score == 1: return 1
                if score in [2,3]: return 2
                if score == 4: return 4 # Eksik olan sinif
                if score >= 5: return 5
                return -1

            df['BIRADS'] = df[birads_col].apply(map_birads)
            df = df[df['BIRADS'] != -1]

            # 5. Goruntu Isleme
            print(f"Toplam {len(df)} INbreast verisi isleniyor...")

            # Zip'ten cikan dosyalari listele
            dcm_files = glob.glob(os.path.join(extract_path, '**', '*.dcm'), recursive=True)
            dcm_map = {os.path.splitext(os.path.basename(f))[0]: f for f in dcm_files}

            processed_data = []

            for idx, row in df.iterrows():
                fid = str(row[file_col])
                if fid in dcm_map:
                    try:
                        dcm_path = dcm_map[fid]
                        ds = pydicom.dcmread(dcm_path)
                        img = ds.pixel_array
                        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
                        img = cv2.resize(img, (512, 512))

                        fname = f"INbreast_{fid}.png"
                        save_path = os.path.join(images_dir, fname)
                        cv2.imwrite(save_path, img)

                        processed_data.append([fname, row['BIRADS'], 'INbreast'])
                    except: pass

            print(f"Basariyla islenen INbreast goruntusu: {len(processed_data)}")

            # 6. Birlestirme ve Kayit
            rsna_path = '/content/drive/MyDrive/TUBITAK_TRAIN.csv'
            if os.path.exists(rsna_path):
                rsna_df = pd.read_csv(rsna_path)
                # Sadece RSNA verilerini al
                rsna_df = rsna_df[rsna_df['source'] == 'RSNA']

                inbreast_df = pd.DataFrame(processed_data, columns=['filename', 'BIRADS', 'source'])

                final_df = pd.concat([rsna_df, inbreast_df], ignore_index=True)

                final_save_path = '/content/drive/MyDrive/TUBITAK_FULL_DATASET.csv'
                final_df.to_csv(final_save_path, index=False)

                print("------------------------------------------------")
                print("ISLEM TAMAMLANDI")
                print(f"Veri seti kaydedildi: {final_save_path}")
                print("Sinif Dagilimi:")
                print(final_df.groupby(['source', 'BIRADS']).size())

                # INbreast goruntulerini de Drive'a yedekle (Bir daha ugrasmamak icin)
                print("Goruntuler Drive'a yedekleniyor...")
                shutil.make_archive("/content/drive/MyDrive/INbreast_Processed_Images", 'zip', images_dir)
                print("Yedekleme tamamlandi.")

            else:
                print("UYARI: RSNA verisi (TUBITAK_TRAIN.csv) Drive'da bulunamadi.")
                print("INbreast verisi tek basina kaydediliyor...")
                pd.DataFrame(processed_data, columns=['filename', 'BIRADS', 'source']).to_csv('/content/drive/MyDrive/INbreast_ONLY.csv', index=False)
        else:
            print("HATA: Excel dosyasinda gerekli sutunlar (File Name, Bi-Rads) bulunamadi.")
    except Exception as e:
        print(f"Islem hatasi: {e}")

else:
    print("HATA: Dosyalar Drive ana dizininde bulunamadi.")
    print("Lutfen 'AllDICOMs.zip' ve 'INbreast.xls' dosyalarini Drive ana dizinine yuklediginizden emin olun.")

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
------------------------------------------------
DRIVE UZERINDEN VERI ISLEME
------------------------------------------------
Drive taraniyor...
Zip Dosyasi Bulundu: /content/drive/MyDrive/AllDICOMs.zip
Etiket Dosyasi Bulundu: /content/drive/MyDrive/INbreast.csv
Zip dosyasi aciliyor (Bu islem 1-2 dakika surebilir)...
Etiketler okunuyor...
Toplam 388 INbreast verisi isleniyor...
Basariyla islenen INbreast goruntusu: 0
------