In [1]:
import os
import sys
import pandas as pd
import argparse
from datetime import datetime
import pathlib

# # Setup Django environment
# # Alternative 1: Use absolute path construction
# current_dir = os.path.abspath('')
# parent_dir = os.path.dirname(current_dir)
# sys.path.insert(0, parent_dir)

# # Alternative 2: Use pathlib for more modern path handling
# parent_path = pathlib.Path().absolute().parent
# sys.path.insert(0, str(parent_path))

# Put the parent of the current working directory at the front of sys.path
# (presumably the Django project root that holds the packages imported below).
parent_path = pathlib.Path.cwd().parent
sys.path.insert(0, os.fspath(parent_path))

# Select the project settings module unless one is already set in the
# environment, then initialise Django.
if 'DJANGO_SETTINGS_MODULE' not in os.environ:
    os.environ['DJANGO_SETTINGS_MODULE'] = 'website_configs.settings'
import django
django.setup()
# Important: set this environment variable so that synchronous operations are
# allowed inside Jupyter's asynchronous environment.
os.environ.update(DJANGO_ALLOW_ASYNC_UNSAFE="true")

# With Django initialised, the models can now be imported.
from app_user_keyword_db.models import NewsData

In [4]:
# Load the news CSV; its fields are separated by '|'.
csv_file_path = '../app_user_keyword/dataset/ttv_news_preprocessed.csv'
# Alternative input file:
# csv_file_path = '../app_user_keyword/dataset/cna_news_preprocessed_12weeks.csv'
df = pd.read_csv(csv_file_path, delimiter='|')
# Display only the first row as a quick look at the loaded columns.
df.iloc[:1]

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link,sentiment2
0,%E6%94%BF%E6%B2%BB_20250407_1,2025-04-07,政治,關稅風暴哀鴻遍野！蔣萬安致電3首長　9日開「基北...,美國對等關稅來勢洶洶，衝擊北部科技重鎮，台北市長蔣萬安今（7）日下午率隊拜會產業界了解需求，...,0.89,暫無,"[('關稅', 5), ('蔣萬安', 5), ('首長', 5), ('科技', 4), ...","['美國', '對等', '關稅', '來勢洶洶', '，', '衝擊', '北部', '科...","['美國', '關稅', '科技', '重鎮', '台北', '市長', '蔣萬安', '產...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('對等', 'A'), ('關稅', 'Na'), ('來勢...",https://news.ttv.com.tw/news/11404070013100I,https://cdn.ttv.com.tw/manasystem/FileData/New...,


In [5]:

# Upsert every row of `df` into NewsData, matching existing records on item_id.
# A row that raises is reported (message plus the row itself) and skipped, so
# the loop continues with the remaining rows.

# Columns copied unchanged from the CSV row into the model defaults.
# The 'summary' column is deliberately not written (its mapping is disabled).
passthrough_columns = (
    'category',
    'title',
    'content',
    'sentiment',
    'top_key_freq',
    'tokens',
    'tokens_v2',
    'entities',
    'token_pos',
    'link',
)

for idx, row in df.iterrows():
    try:
        # Parse the 'YYYY-MM-DD' text in the date column into a date object.
        date_obj = datetime.strptime(row['date'], '%Y-%m-%d').date()
        item_key = row['item_id']

        field_values = {
            'date': date_obj,
            **{column: row[column] for column in passthrough_columns},
        }

        # photo_link may hold any of the following:
        #   - an actual URL string
        #   - an empty string ("")
        #   - a pandas NaN (when the field is empty in the CSV file)
        #   - None
        # Anything other than a real value is stored as None.
        photo_link = row['photo_link']
        if photo_link == "" or pd.isna(photo_link):
            photo_link = None
        field_values['photo_link'] = photo_link

        # Create the record, or update it when the item_id already exists.
        news_data, created = NewsData.objects.update_or_create(
            item_id=item_key,
            defaults=field_values,
        )
        if not created:
            print(f"Updated existing NewsData object with item_id: {row['item_id']}")
        else:
            print(f"Created new NewsData object with item_id: {row['item_id']}")
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        print(row)

Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250407_1
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250407_2
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250407_3
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_4
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_5
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_6
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_7
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_8
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250406_9
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250405_10
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250405_11
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250405_12
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB_20250405_13
Created new NewsData object with item_id: %E6%94%BF%E6%B2%BB

In [6]:
# Show the `created` flag left by the most recent successful update_or_create
# call in the import loop (True means that row was inserted, not updated).
created

True