In [1]:
# Cell 1: Import các thư viện cần thiết

import json
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.notebook import tqdm
import pandas as pd
import time
import asyncio

# Cấu hình hiển thị
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100
sns.set_theme(style="whitegrid")

In [2]:
# Cell 2: Hàm đọc file .jsonl
def read_jsonl(file_path, max_records=None): 
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f, desc=f"Reading {os.path.basename(file_path)}")):
            if max_records and i >= max_records:
                break
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"JSON error at line {i}: {e}")
    return data


In [3]:
# Cell 3: Đường dẫn tới thư mục chứa các file JSONL
folder_path = r'G:\Phân tích CTG\New\BTL\data'

# Lấy danh sách file .jsonl
jsonl_files = [f for f in os.listdir(folder_path) if f.endswith('.jsonl')]

# Kiểm tra file
print("Các file JSONL có trong thư mục:")
for f in jsonl_files:
    print("-", f)


Các file JSONL có trong thư mục:
- enwiki_namespace_0_0.jsonl
- enwiki_namespace_0_1.jsonl


### Lấy file

#### Hàm skip dòng

In [4]:
def read_jsonl(filepath, max_records=None, skip=0):
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Bỏ qua số dòng đầu tiên nếu cần
        for _ in range(skip):
            next(f, None)

        # Đọc số lượng dòng mong muốn
        for i, line in enumerate(f):
            if max_records is not None and i >= max_records:
                break
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return records


#### Đọc file

In [5]:
# === Phần 1: Đọc .jsonl và ghi thành các file CSV (chưa lọc cột) ===
import os
import json
import pandas as pd

# Hàm đọc từng batch từ file JSONL
def read_jsonl(path, max_records, skip):
    results = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < skip:
                continue
            if i >= skip + max_records:
                break
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Lỗi dòng {i + 1}: {e}")
    return results

# Cấu hình đường dẫn
folder_path = r'G:\Phân tích CTG\New\BTL\data'
jsonl_file = 'enwiki_namespace_0_0.jsonl'
file_to_read = os.path.join(folder_path, jsonl_file)

# Tạo thư mục lưu file CSV thô
raw_base_folder = "Doc_file_Jsonl"
raw_sub_folder = "File_tho"
raw_output_dir = os.path.join(raw_base_folder, raw_sub_folder)
os.makedirs(raw_output_dir, exist_ok=True)

# Cấu hình batch
records_per_file = 15000
batch_number = 1
skip = 0
csv_raw_files = []

while True:
    data = read_jsonl(file_to_read, max_records=records_per_file, skip=skip)
    if not data:
        break
    
    df = pd.DataFrame(data)
    output_path = os.path.join(raw_output_dir, f"File_tho_batch_{batch_number}.csv")
    df.to_csv(output_path, index=False)
    csv_raw_files.append(output_path)
    print(f"✅ Ghi {len(df)} dòng vào {output_path}")
    
    batch_number += 1
    skip += records_per_file

if not csv_raw_files:
    print("❌ Không có dữ liệu được đọc.")
else:
    print(f"🔥 Tổng số file thô đã tạo: {len(csv_raw_files)}")


✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_1.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_2.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_3.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_4.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_5.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_6.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_7.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_8.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_9.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_10.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_11.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_12.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_13.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_14.csv
✅ Ghi 15000 dòng vào Doc_file_Jsonl\File_tho\File_tho_batch_15.csv
✅ Gh

In [6]:
# === Phần 2: Lọc dữ liệu từ CSV thô và ghi ra CSV mới ===
filtered_base_folder = "Doc_file_Jsonl"
filtered_sub_folder = "File_loc"
filtered_output_dir = os.path.join(filtered_base_folder, filtered_sub_folder)
os.makedirs(filtered_output_dir, exist_ok=True)

columns_to_keep = ['name', 'abstract', 'sections']
csv_filtered_files = []
filtered_batch = 1

for raw_file in csv_raw_files:
    df = pd.read_csv(raw_file)
    
    # Lọc các cột cần thiết
    df_filtered = df[columns_to_keep].copy()
    df_filtered = df_filtered.dropna(subset=['name'])
    df_filtered = df_filtered[df_filtered['name'].str.strip() != '']
    
    # Ghi ra CSV mới
    output_path = os.path.join(filtered_output_dir, f"File_loc_batch_{filtered_batch}.csv")
    df_filtered.to_csv(output_path, index=False)
    csv_filtered_files.append(output_path)
    
    print(f"✅ Đã lưu {len(df_filtered)} dòng đã lọc vào: {output_path}")
    filtered_batch += 1

print(f"🎉 Hoàn tất lọc! Tổng số file đã tạo: {len(csv_filtered_files)}")


✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_1.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_2.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_3.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_4.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_5.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_6.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_7.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_8.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_9.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_10.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_11.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_12.csv
✅ Đã lưu 15000 dòng đã lọc vào: Doc_file_Jsonl\File_loc\File_loc_batch_13

### Tạo file crawl

In [1]:
# ==== Cấu hình ====
START_DATE = '20200101'
END_DATE = '20241231'
LANG = 'en'
MAX_CONCURRENT = 12
REQUESTS_PER_MINUTE = 100
BATCH_SIZE = 100
# ===================

import os
import time
import aiohttp
import asyncio
import pandas as pd
import gc
from aiohttp import TCPConnector
from tqdm.asyncio import tqdm

# Bộ giới hạn tốc độ
class RateLimiter:
    def __init__(self, calls: int, period: float):
        self.calls = calls
        self.period = period
        self.requests = []
    
    async def __aenter__(self):
        now = time.time()
        self.requests = [t for t in self.requests if now - t < self.period]
        if len(self.requests) >= self.calls:
            await asyncio.sleep(self.period - (now - self.requests[0]))
        self.requests.append(time.time())
        return self
    
    async def __aexit__(self, exc_type, exc, tb):
        pass

async def get_pageviews(session: aiohttp.ClientSession, article_title: str, start_date: str, end_date: str, lang: str, max_retry: int = 3) -> list:
    article_slug = article_title.replace(" ", "_")
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{lang}.wikipedia.org/all-access/user/{article_slug}/daily/{start_date}/{end_date}"
    headers = {"User-Agent": "MyWikiCrawler/1.0 (+mailto:cgdlddhm@gmail.com)"}
    retries = 0
    while retries < max_retry:
        async with RateLimiter(calls=REQUESTS_PER_MINUTE, period=60):
            try:
                async with session.get(url, headers=headers, timeout=10) as response:
                    if response.status == 200:
                        data = (await response.json()).get('items', [])
                        return [{
                            'title': article_title,
                            'date': item['timestamp'][:8],
                            'views': item['views']
                        } for item in data]
                    elif response.status == 429:
                        wait_time = int(response.headers.get("Retry-After", 10))
                        await asyncio.sleep(wait_time)
                        retries += 1
                    else:
                        return None
            except Exception as e:
                await asyncio.sleep(5)
                retries += 1
    return None

async def crawl_one(session: aiohttp.ClientSession, title: str) -> tuple:
    result = await get_pageviews(session, title, start_date=START_DATE, end_date=END_DATE, lang=LANG)
    return title, result

async def crawl_for_csv(csv_file: str, batch_number: int):
    crawl_output_dir = os.path.join("Crawl(2)", "Crawl_raw")
    os.makedirs(crawl_output_dir, exist_ok=True)

    output_csv = os.path.join(crawl_output_dir, f"Crawl_raw_batch_{batch_number}.csv")

    try:
        df_filtered = pd.read_csv(csv_file)
    except Exception as e:
        return
    
    titles = df_filtered['name'].dropna().unique().tolist()
    if not titles:
        return

    connector = TCPConnector(limit=MAX_CONCURRENT, force_close=True)
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)

    async with aiohttp.ClientSession(connector=connector) as session:
        async def limited_crawl(title):
            async with semaphore:
                return await crawl_one(session, title)

        tasks = [limited_crawl(title) for title in titles]
        results = []

        for i, future in enumerate(tqdm(asyncio.as_completed(tasks), total=len(tasks), desc=f"Crawling {csv_file}")):
            title, result = await future
            if result:
                results.append(pd.DataFrame(result))

            if (i + 1) % BATCH_SIZE == 0 or i == len(tasks) - 1:
                if results:
                    batch_df = pd.concat(results, ignore_index=True)
                    if os.path.exists(output_csv):
                        batch_df.to_csv(output_csv, mode='a', index=False, header=False)
                    else:
                        batch_df.to_csv(output_csv, index=False)

                    del results
                    del batch_df
                    results = []
                    gc.collect()

    print(f"✅ Crawl xong {csv_file}!")

async def main():
    filtered_dir = os.path.join("Doc_file_Jsonl", "File_loc")
    if not os.path.exists(filtered_dir):
        return

    csv_files = [os.path.join(filtered_dir, f) for f in os.listdir(filtered_dir) if f.endswith(".csv")]
    if not csv_files:
        return
    
    for batch_number, csv_file in enumerate(csv_files, 1):
        print(f"\n📄 Processing CSV file: {csv_file}")
        await crawl_for_csv(csv_file, batch_number)
    
    print("✅ Crawl xong toàn bộ các file CSV!")

# === Chạy trong Jupyter Notebook hoặc Script ===
await main()



📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_1.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_1.csv: 100%|██████████| 14941/14941 [19:58<00:00, 12.47it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_1.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_10.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_10.csv: 100%|██████████| 15000/15000 [24:58<00:00, 10.01it/s] 


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_10.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_11.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_11.csv: 100%|██████████| 15000/15000 [23:19<00:00, 10.72it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_11.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_12.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_12.csv: 100%|██████████| 15000/15000 [21:04<00:00, 11.87it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_12.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_13.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_13.csv: 100%|██████████| 15000/15000 [19:57<00:00, 12.53it/s] 


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_13.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_14.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_14.csv: 100%|██████████| 15000/15000 [18:57<00:00, 13.18it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_14.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_15.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_15.csv: 100%|██████████| 15000/15000 [19:52<00:00, 12.58it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_15.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_16.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_16.csv: 100%|██████████| 15000/15000 [24:41<00:00, 10.12it/s] 


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_16.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_17.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_17.csv: 100%|██████████| 15000/15000 [23:29<00:00, 10.64it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_17.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_18.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_18.csv: 100%|██████████| 15000/15000 [23:29<00:00, 10.64it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_18.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_19.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_19.csv: 100%|██████████| 15000/15000 [23:30<00:00, 10.63it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_19.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_2.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_2.csv: 100%|██████████| 15000/15000 [23:47<00:00, 10.51it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_2.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_20.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_20.csv: 100%|██████████| 15000/15000 [23:30<00:00, 10.63it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_20.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_21.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_21.csv: 100%|██████████| 9509/9509 [14:52<00:00, 10.65it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_21.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_3.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_3.csv: 100%|██████████| 15000/15000 [23:19<00:00, 10.72it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_3.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_4.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_4.csv: 100%|██████████| 15000/15000 [23:23<00:00, 10.69it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_4.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_5.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_5.csv: 100%|██████████| 15000/15000 [23:31<00:00, 10.63it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_5.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_6.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_6.csv: 100%|██████████| 15000/15000 [23:19<00:00, 10.72it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_6.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_7.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_7.csv: 100%|██████████| 15000/15000 [23:14<00:00, 10.76it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_7.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_8.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_8.csv: 100%|██████████| 15000/15000 [23:02<00:00, 10.85it/s]


✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_8.csv!

📄 Processing CSV file: Doc_file_Jsonl\File_loc\File_loc_batch_9.csv


Crawling Doc_file_Jsonl\File_loc\File_loc_batch_9.csv: 100%|██████████| 15000/15000 [23:08<00:00, 10.80it/s]

✅ Crawl xong Doc_file_Jsonl\File_loc\File_loc_batch_9.csv!
✅ Crawl xong toàn bộ các file CSV!



