# **Proje Hazırlığı ve Veriyi Anlama**

## Dosya Yapısı İncelemesi

In [None]:
# Mount Google Drive so the dataset stored there is reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import json
import os
from pathlib import Path

# Directory that holds the dataset files.
# The trailing slash matters: file names are appended to this string directly.
base_path = '/content/drive/MyDrive/Milano Telecom Data Analysis ‚Äî 2013 Week Dataset/archive/'

## Trafik Verisi İncelemesi

In [None]:
# 1. Traffic data sample (first 100 rows)
print("=== TRAFƒ∞K VERƒ∞Sƒ∞ ƒ∞NCELEMESƒ∞ ===")
traffic_file = base_path + 'sms-call-internet-mi-2013-11-01.csv'

# The file is very large, so only the first 100 rows are parsed; that is
# enough to inspect the columns, dtypes and null counts.
try:
    df_traffic_sample = pd.read_csv(traffic_file, nrows=100)
except Exception as e:
    print(f"Hata: {e}")
    # Fallback: show only the raw header line. The read is guarded separately
    # because the same file may be the reason the parse failed (e.g. it is
    # missing); an unguarded open() here would raise a second, unhandled error.
    try:
        # errors='replace' keeps the header printable even with bad bytes
        with open(traffic_file, 'r', encoding='utf-8', errors='replace') as f:
            header = f.readline()
    except OSError as read_error:
        print(f"Hata: {read_error}")
    else:
        print("Header:", header)
else:
    print("√ñrnek veri (100 satƒ±r):")
    print(df_traffic_sample.head())
    print("\nS√ºtunlar:", df_traffic_sample.columns.tolist())
    print("Boyut:", df_traffic_sample.shape)

    # Data types
    print("\nVeri tipleri:")
    print(df_traffic_sample.dtypes)

    # Null counts per column
    print("\nNull deƒüer √∂zeti:")
    print(df_traffic_sample.isnull().sum())

## Hareket Verisi İncelemesi

In [None]:
print("\n=== HAREKET VERƒ∞Sƒ∞ ƒ∞NCELEMESƒ∞ ===")
movement_file = base_path + 'mi-to-provinces-2013-11-01.csv'

# Parse only the first 100 rows to inspect the structure of the movement data.
try:
    df_movement_sample = pd.read_csv(movement_file, nrows=100)
except Exception as e:
    print(f"Hata: {e}")
    # Fallback: show only the raw header line. The read is guarded separately
    # because the same file may be the reason the parse failed (e.g. it is
    # missing); an unguarded open() here would raise a second, unhandled error.
    try:
        # errors='replace' keeps the header printable even with bad bytes
        with open(movement_file, 'r', encoding='utf-8', errors='replace') as f:
            header = f.readline()
    except OSError as read_error:
        print(f"Hata: {read_error}")
    else:
        print("Header:", header)
else:
    print("√ñrnek veri (100 satƒ±r):")
    print(df_movement_sample.head())
    print("\nS√ºtunlar:", df_movement_sample.columns.tolist())
    print("Boyut:", df_movement_sample.shape)

## Census Verisi İncelemesi

In [None]:
# Census inspection: load the whole CSV and print a preview, the column
# names, the shape and the dtypes.
print("\n=== CENSUS VERƒ∞Sƒ∞ ƒ∞NCELEMESƒ∞ ===")
census_file = f"{base_path}ISTAT_census_variables_2011.csv"
df_census = pd.read_csv(census_file)

# Title and payload go on separate lines, exactly like two consecutive prints
print("ƒ∞lk 5 satƒ±r:", df_census.head(), sep="\n")
print("\nS√ºtunlar:", df_census.columns.tolist())
print("Boyut:", df_census.shape)
print("\nVeri tipleri:", df_census.dtypes, sep="\n")

## Geojson Dosyalarının İncelenmesi

In [None]:
print("\n=== GEOJSON DOSYALARI ===")
# GeoJSON is UTF-8 encoded JSON; the encoding is passed explicitly so that
# decoding does not depend on the locale of the machine running the notebook.

# Milano grid
with open(base_path + 'milano-grid.geojson', 'r', encoding='utf-8') as f:
    milano_grid = json.load(f)
print(f"Milano grid: {len(milano_grid['features'])} √∂zellik")

# Italian provinces
with open(base_path + 'Italian_provinces.geojson', 'r', encoding='utf-8') as f:
    italian_provinces = json.load(f)
print(f"ƒ∞talyan eyaletleri: {len(italian_provinces['features'])} √∂zellik")

## Tüm Dosya Boyutlarının Kontrolü

In [None]:
print("\n=== T√úM DOSYA Lƒ∞STESƒ∞ VE BOYUTLARI ===")
import os

# Collect every CSV / JSON / GeoJSON file in the dataset directory, sorted by name
all_files = sorted(
    name for name in os.listdir(base_path)
    if name.endswith(('.csv', '.json', '.geojson'))
)

# Stat each file exactly once; the per-file lines and the total below both
# reuse these values instead of hitting the filesystem a second time.
sizes_mb = [
    os.path.getsize(os.path.join(base_path, name)) / (1024 * 1024)
    for name in all_files
]

print(f"Toplam {len(all_files)} dosya bulundu:")
print("-" * 50)

for i, (name, size_mb) in enumerate(zip(all_files, sizes_mb), 1):
    print(f"{i:2d}. {name:45s} - {size_mb:8.2f} MB")

print("-" * 50)
total_size_mb = sum(sizes_mb)
print(f"Toplam boyut: {total_size_mb:.2f} MB ({total_size_mb/1024:.2f} GB)")

# **BigQuery'ye Dosya Yükleme Planı**

## Dosya Yolu ve Proje Bilgileri

In [None]:
# Run in Colab: authenticate the current user so the BigQuery client can be created
from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import pandas as pd
import geopandas as gpd
import json
from google.cloud.exceptions import NotFound

# Configuration
# NOTE(review): "YOUR_PROJECT_ID" looks like a placeholder — replace before running
PROJECT_ID = "YOUR_PROJECT_ID"
DATASET_ID = "milano_mobile_2013"
# Trailing slash matters: file names are appended to this string directly
BASE_PATH = "/content/drive/MyDrive/Milano Telecom Data Analysis ‚Äî 2013 Week Dataset/archive/"

# BigQuery client bound to PROJECT_ID; used by all cells below
client = bigquery.Client(project=PROJECT_ID)

## Dataset Varlık Kontrolü

In [None]:
# Check whether the target dataset exists.
# Client.dataset() is deprecated, so the reference is built explicitly; it
# points at the same project the client is bound to.
dataset_ref = bigquery.DatasetReference(client.project, DATASET_ID)
try:
    client.get_dataset(dataset_ref)
    print(f"‚úÖ Dataset bulundu: {DATASET_ID}")
except NotFound:
    print(f"‚ùå Dataset bulunamadƒ±: {DATASET_ID}")
    # Nothing is created here: the dataset is expected to have been created manually.

## Tabloların Yüklenmesi

In [None]:
def _load_daily_tables(file_names, table_prefix, job_config):
    """Load one CSV per day into its own ``<table_prefix>_YYYYMMDD`` table.

    Args:
        file_names: CSV file names (relative to ``BASE_PATH``) whose names end
            in ``YYYY-MM-DD.csv``.
        table_prefix: Prefix of the destination table name.
        job_config: ``bigquery.LoadJobConfig`` applied to every load job.
    """
    total_days = len(file_names)
    for i, file in enumerate(file_names, 1):
        print(f"   üìÖ G√ºn {i}/{total_days}: {file}")

        # The ISO date is the 10 characters right before ".csv".
        # (A [-15:-4] slice would also pick up the hyphen in front of the year.)
        date_str = file[-14:-4]

        df_day = pd.read_csv(BASE_PATH + file)

        # Convert the datetime column from text to real timestamps
        df_day['datetime'] = pd.to_datetime(df_day['datetime'])

        # Tag every row with the day the file belongs to
        df_day['load_date'] = pd.to_datetime(date_str).date()

        # Table name: <prefix>_YYYYMMDD
        table_name = f"{table_prefix}_{date_str.replace('-', '')}"
        table_id = f"{PROJECT_ID}.{DATASET_ID}.{table_name}"

        job = client.load_table_from_dataframe(df_day, table_id, job_config=job_config)
        job.result()
        print(f"     ‚úÖ {table_name}: {df_day.shape[0]:,} satƒ±r")


def load_all_tables():
    """Load every dataset file into BigQuery.

    Loads the census CSV, seven daily traffic CSVs, seven daily movement CSVs
    and the two GeoJSON files. Every load uses schema autodetection and
    WRITE_TRUNCATE, so existing tables with the same names are overwritten.
    """
    # Shared by all load jobs below
    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )

    # 1. CENSUS DATA
    print("üìä 1. Census verisi y√ºkleniyor...")
    census_path = BASE_PATH + "ISTAT_census_variables_2011.csv"
    df_census = pd.read_csv(census_path)

    table_id = f"{PROJECT_ID}.{DATASET_ID}.census_data"
    job = client.load_table_from_dataframe(df_census, table_id, job_config=job_config)
    job.result()
    print(f"   ‚úÖ census_data: {client.get_table(table_id).num_rows:,} satƒ±r")

    # 2. TRAFFIC DATA (7 days: 2013-11-01 .. 2013-11-07)
    print("\nüì∂ 2. Trafik verisi y√ºkleniyor (7 g√ºn)...")
    traffic_files = [
        f"sms-call-internet-mi-2013-11-{day:02d}.csv" for day in range(1, 8)
    ]
    _load_daily_tables(traffic_files, "traffic", job_config)

    # 3. MOVEMENT DATA (7 days: 2013-11-01 .. 2013-11-07)
    print("\nüó∫Ô∏è  3. Hareket verisi y√ºkleniyor (7 g√ºn)...")
    movement_files = [
        f"mi-to-provinces-2013-11-{day:02d}.csv" for day in range(1, 8)
    ]
    _load_daily_tables(movement_files, "movement", job_config)

    # 4. GEOJSON FILES
    print("\nüó∫Ô∏è  4. Coƒürafi veriler y√ºkleniyor...")

    # Milano grid
    print("   üìç Milano grid...")
    grid_path = BASE_PATH + "milano-grid.geojson"
    grid_gdf = gpd.read_file(grid_path)

    # Standardize the 'cellId' column name to 'CellID'
    grid_gdf = grid_gdf.rename(columns={'cellId': 'CellID'})

    # Keep only the needed columns; geometries are stored as their string form
    grid_df = pd.DataFrame({
        'CellID': grid_gdf['CellID'],
        'geometry': grid_gdf.geometry.astype(str)
    })

    table_id = f"{PROJECT_ID}.{DATASET_ID}.grid_locations"
    job = client.load_table_from_dataframe(grid_df, table_id, job_config=job_config)
    job.result()
    print(f"     ‚úÖ grid_locations: {grid_df.shape[0]:,} satƒ±r")
    print(f"     üîç √ñrnek CellID'ler: {grid_df['CellID'].head().tolist()}")

    # Italian provinces
    print("   üáÆüáπ ƒ∞talya eyaletleri...")
    provinces_path = BASE_PATH + "Italian_provinces.geojson"
    provinces_gdf = gpd.read_file(provinces_path)

    # Rename the source columns to standardized names
    provinces_df = pd.DataFrame({
        'province_name': provinces_gdf['PROVINCIA'],
        'province_code': provinces_gdf['SIGLA'],  # SIGLA = abbreviation (TO, MI, ...)
        'shape_area': provinces_gdf['SHAPE_AREA'],
        'geometry': provinces_gdf.geometry.astype(str)
    })

    table_id = f"{PROJECT_ID}.{DATASET_ID}.province_boundaries"
    job = client.load_table_from_dataframe(provinces_df, table_id, job_config=job_config)
    job.result()
    print(f"     ‚úÖ province_boundaries: {provinces_df.shape[0]:,} satƒ±r")
    print(f"     üîç √ñrnek eyaletler: {provinces_df['province_name'].head().tolist()}")

    print("\nüéâ T√úM TABLOLAR BA≈ûARIYLA Y√úKLENDƒ∞!")

# Run the load
load_all_tables()

## Birleşik Tablolar Oluşturma

### *7 günlük trafik ve hareket verisi için birleşik view'ler*

In [None]:
def create_combined_views():
    """Create the combined and summary views on top of the per-day tables.

    Four views are (re)created, in this order: traffic_all_days,
    movement_all_days, daily_summary and province_movement_summary.
    """
    dataset_path = f"{PROJECT_ID}.{DATASET_ID}"
    day_suffixes = [f"201311{day:02d}" for day in range(1, 8)]

    def union_view_sql(view_name, table_prefix):
        # One "SELECT *" per daily table, chained with UNION ALL
        selects = " UNION ALL\n    ".join(
            f"SELECT * FROM `{dataset_path}.{table_prefix}_{suffix}`"
            for suffix in day_suffixes
        )
        return (
            f"\n    CREATE OR REPLACE VIEW `{dataset_path}.{view_name}` AS"
            f"\n    {selects}\n    "
        )

    # Traffic totals per date and hour
    daily_summary_sql = f"""
    CREATE OR REPLACE VIEW `{PROJECT_ID}.{DATASET_ID}.daily_summary` AS
    SELECT
        DATE(datetime) as date,
        EXTRACT(HOUR FROM datetime) as hour,
        COUNT(DISTINCT CellID) as unique_cells,
        SUM(COALESCE(smsin, 0)) as total_smsin,
        SUM(COALESCE(smsout, 0)) as total_smsout,
        SUM(COALESCE(callin, 0)) as total_callin,
        SUM(COALESCE(callout, 0)) as total_callout,
        SUM(COALESCE(internet, 0)) as total_internet
    FROM `{PROJECT_ID}.{DATASET_ID}.traffic_all_days`
    GROUP BY date, hour
    ORDER BY date, hour
    """

    # Movement totals per date and province
    province_summary_sql = f"""
    CREATE OR REPLACE VIEW `{PROJECT_ID}.{DATASET_ID}.province_movement_summary` AS
    SELECT
        DATE(datetime) as date,
        provinceName,
        SUM(COALESCE(cell2Province, 0)) as total_from_milano,
        SUM(COALESCE(Province2cell, 0)) as total_to_milano,
        COUNT(DISTINCT CellID) as unique_cells_connected
    FROM `{PROJECT_ID}.{DATASET_ID}.movement_all_days`
    GROUP BY date, provinceName
    ORDER BY date, total_from_milano DESC
    """

    # (announcement, SQL, confirmation). Order matters: the summary views
    # select from the *_all_days views created first.
    steps = [
        (
            "üîÑ T√ºm trafik view'ƒ± olu≈üturuluyor...",
            union_view_sql("traffic_all_days", "traffic"),
            "   ‚úÖ traffic_all_days view olu≈üturuldu",
        ),
        (
            "üîÑ T√ºm hareket view'ƒ± olu≈üturuluyor...",
            union_view_sql("movement_all_days", "movement"),
            "   ‚úÖ movement_all_days view olu≈üturuldu",
        ),
        (
            "üîÑ G√ºnl√ºk √∂zet view'ƒ± olu≈üturuluyor...",
            daily_summary_sql,
            "   ‚úÖ daily_summary view olu≈üturuldu",
        ),
        (
            "üîÑ Eyalet hareket √∂zeti view'ƒ± olu≈üturuluyor...",
            province_summary_sql,
            "   ‚úÖ province_movement_summary view olu≈üturuldu",
        ),
    ]

    for announcement, sql, confirmation in steps:
        print(announcement)
        client.query(sql).result()
        print(confirmation)

    print("\nüéâ T√úM VIEW'LAR OLU≈ûTURULDU!")

# Create the views
create_combined_views()

## Doğrulama Sorguları

In [None]:
def verify_tables():
    """List every table in the dataset with its row count, size and type.

    Prints a per-table overview followed by dataset-wide totals.

    Returns:
        A DataFrame with one row per listed table (columns ``Table``,
        ``Rows``, ``Size (MB)``, ``Type``), or ``None`` if listing or
        fetching the tables failed.
    """

    print("üîç TABLO DOƒûRULAMASI")
    print("=" * 50)

    # Fully-qualified dataset ID; replaces the deprecated client.dataset()
    dataset_id = f"{client.project}.{DATASET_ID}"

    table_info = []
    total_rows = 0
    total_bytes = 0
    try:
        # List everything in the dataset
        tables = list(client.list_tables(dataset_id))

        for table in tables:
            # The list item carries only basic info; fetch the full metadata
            table_obj = client.get_table(table.reference)

            # Guard against missing counts (NOTE(review): may happen for
            # views — confirm), which would otherwise break the formatting
            rows = table_obj.num_rows or 0
            size_bytes = table_obj.num_bytes or 0

            # Accumulate per table; summing after the loop from `table_obj`
            # would count only the last table, once per entry.
            total_rows += rows
            total_bytes += size_bytes

            table_info.append({
                'Table': table.table_id,
                'Rows': f"{rows:,}",
                'Size (MB)': f"{size_bytes / (1024*1024):.1f}",
                'Type': table_obj.table_type
            })

        # Show as a DataFrame
        df_info = pd.DataFrame(table_info)
        print(df_info.to_string(index=False))

        # Overall statistics
        total_size_mb = total_bytes / (1024*1024)

        print(f"\nüìà TOPLAM: {len(tables)} tablo, {total_rows:,} satƒ±r, {total_size_mb:.1f} MB")

    except Exception as e:
        print(f"Hata: {e}")
        return None

    return df_info

# Run the verification
print("\n" + "="*50)
print("TABLO DOƒûRULAMA √áALI≈ûTIRILIYOR...")
print("="*50)
df_tables = verify_tables()

## **RELEASE THE KRAKEN**

In [None]:
# Run the whole pipeline: load tables, create views, verify.
import traceback

heavy_rule = "=" * 50
light_rule = "-" * 30

print(
    "üöÄ BIGQUERY VERƒ∞ Y√úKLEME BA≈ûLIYOR...",
    f"üìÇ Project: {PROJECT_ID}",
    f"üìÇ Dataset: {DATASET_ID}",
    f"üìÅ Path: {BASE_PATH}",
    heavy_rule,
    sep="\n",
)

try:
    # 1. Load all tables
    print("\nüì¶ 1. TABLO Y√úKLEME ƒ∞≈ûLEMƒ∞", light_rule, sep="\n")
    load_all_tables()
    print("\n‚úÖ TABLO Y√úKLEME TAMAMLANDI", heavy_rule, sep="\n")

    # 2. Create the views
    print("\nüîó 2. VIEW OLU≈ûTURMA ƒ∞≈ûLEMƒ∞", light_rule, sep="\n")
    create_combined_views()
    print("\n‚úÖ VIEW OLU≈ûTURMA TAMAMLANDI", heavy_rule, sep="\n")

    # 3. Verification
    print("\nüîç 3. DOƒûRULAMA ƒ∞≈ûLEMƒ∞", light_rule, sep="\n")
    df_tables = verify_tables()

    print(
        "\n" + heavy_rule,
        "üéâ PROJE HAZIR! NATURE ANALƒ∞ZLERƒ∞NE BA≈ûLAYABƒ∞Lƒ∞RSƒ∞Nƒ∞Z.",
        heavy_rule,
        sep="\n",
    )

    # Overview of what is now available in the dataset
    print("\nüìä HAZIR TABLOLAR:")
    for overview_line in (
        "   1. traffic_all_days - T√ºm trafik verisi",
        "   2. movement_all_days - T√ºm hareket verisi",
        "   3. daily_summary - G√ºnl√ºk trafik √∂zeti",
        "   4. province_movement_summary - Eyalet hareket √∂zeti",
        "   5. census_data - Demografik veri",
        "   6. grid_locations - Milano grid h√ºcreleri",
        "   7. province_boundaries - ƒ∞talya eyalet sƒ±nƒ±rlarƒ±",
    ):
        print(overview_line)

except Exception as e:
    # Top-level boundary: report the failure together with the full traceback
    print(f"\n‚ùå HATA OLU≈ûTU: {e}")
    print("\nHata detayƒ±:")
    traceback.print_exc()

 *  *BigQuery'de 17 tablo oluşturuldu ve veri yüklendi (4 view ile birlikte toplam 21 nesne)*
 *  *4 birleşik view oluşturuldu (traffic_all_days, movement_all_days, daily_summary, province_movement_summary)*
 *  *Nature metodolojisine tam uyumlu veri yapısı kuruldu*
 *  *Gerçek veri seti olduğu keşfedildi (sentetik değil, Telecom Italia Milano 2013 verisi)*

# **Özet — Gün 1 (02.12.2025 Salı): PROJE HAZIRLIĞI VE VERİYİ ANLAMA - TAMAMLANDI**

* ✓ Makale ve veri seti incelemesi tamamlandı
* ✓ Proje hedefleri netleştirildi
* ✓ Çalışma ortamı kuruldu