# 01_data_acquisition

本笔记本用于执行电影数据的获取操作，包括从Kaggle和pandas GitHub下载电影数据集。

In [1]:
# 导入必要的库
import os
import sys
import pandas as pd
import subprocess

## 1. 检查项目结构

In [2]:
# 检查当前工作目录
print(f'当前工作目录: {os.getcwd()}')

# 检查项目结构
project_dir = '..'
print('项目结构:')
for root, dirs, files in os.walk(project_dir):
    level = root.replace(project_dir, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # 只显示前5个文件
        print(f'{subindent}{file}')
    if len(files) > 5:
        print(f'{subindent}... 等 {len(files) - 5} 个文件')

当前工作目录: C:\羊驼\pro\analysis\movie_box_office_analysis\notebooks
项目结构:
../
  check_encoding.py
  operation_manual.md
  README.md
  requirements.txt
  .ruff_cache/
    .gitignore
    CACHEDIR.TAG
    0.14.8/
      4880803238353314185
  data/
    external/
    processed/
    raw/
      tmdb_5000_credits.csv
      tmdb_5000_movies.csv
      tmdb_merged.csv
  notebooks/
    01_data_acquisition.html
    01_data_acquisition.ipynb
    01_data_acquisition.nbconvert.ipynb
    01_data_acquisition.py
    02_data_preprocessing.ipynb
    ... 等 5 个文件
  results/
    charts/
    models/
    reports/
  src/
    api_deployment.py
    data_acquisition.py
    data_preprocessing.py
    deep_learning.py
    eda_analysis.py
    ... 等 6 个文件
    .ruff_cache/
      .gitignore
      CACHEDIR.TAG
      0.14.8/
        3845085135956808529
    __pycache__/
      api_deployment.cpython-313.pyc
      data_acquisition.cpython-313.pyc


## 2. 执行数据获取脚本

In [3]:
# 执行数据获取脚本
src_dir = os.path.join(project_dir, 'src')
os.chdir(src_dir)

print('正在执行数据获取脚本...')
result = subprocess.run(['python', 'data_acquisition.py', '--skip-download'], capture_output=True, text=True)

# 打印脚本输出
print('脚本输出:')
print('-' * 50)
print(result.stdout)

if result.stderr:
    print('错误信息:')
    print('-' * 50)
    print(result.stderr)

print('返回代码:', result.returncode)

正在执行数据获取脚本...


Exception in thread Thread-3 (_readerthread):
Traceback (most recent call last):
  File [35m"C:\Users\32248\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m1043[0m, in [35m_bootstrap_inner[0m
    [31mself.run[0m[1;31m()[0m
    [31m~~~~~~~~[0m[1;31m^^[0m
  File [35m"C:\Users\32248\AppData\Local\Programs\Python\Python313\Lib\site-packages\ipykernel\ipkernel.py"[0m, line [35m772[0m, in [35mrun_closure[0m
    [31m_threading_Thread_run[0m[1;31m(self)[0m
    [31m~~~~~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^[0m
  File [35m"C:\Users\32248\AppData\Local\Programs\Python\Python313\Lib\threading.py"[0m, line [35m994[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"C:\Users\32248\AppData\Local\Programs\Python\Python313\Lib\subprocess.py"[0m, line [35m1615[0m, in [35m_readerthread[0m
    buffer.append([31mfh.read[0m[1;31m()[0m)
   

脚本输出:
--------------------------------------------------
None
返回代码: 0


## 3. 查看获取的数据

In [4]:
# 查看raw目录下的文件
raw_dir = os.path.join(project_dir, 'data', 'raw')
print('raw目录下的文件:')
for file in os.listdir(raw_dir):
    print(f'  - {file}')

# 查看processed目录下的文件
processed_dir = os.path.join(project_dir, 'data', 'processed')
print('processed目录下的文件:')
for file in os.listdir(processed_dir):
    print(f'  - {file}')

raw目录下的文件:
  - tmdb_5000_credits.csv
  - tmdb_5000_movies.csv
  - tmdb_merged.csv
processed目录下的文件:


## 4. 加载并查看数据示例

In [5]:
# 加载并查看TMDB电影数据
tmdb_movies_path = os.path.join(raw_dir, 'tmdb_5000_movies.csv')
if os.path.exists(tmdb_movies_path):
    tmdb_movies = pd.read_csv(tmdb_movies_path)
    print(f'TMDB电影数据形状: {tmdb_movies.shape}')
    print('\nTMDB电影数据示例:')
    display(tmdb_movies.head())
else:
    print('TMDB电影数据文件不存在')

# 加载并查看TMDB演职员数据
tmdb_credits_path = os.path.join(raw_dir, 'tmdb_5000_credits.csv')
if os.path.exists(tmdb_credits_path):
    tmdb_credits = pd.read_csv(tmdb_credits_path)
    print(f'\nTMDB演职员数据形状: {tmdb_credits.shape}')
    print('\nTMDB演职员数据示例:')
    display(tmdb_credits.head())
else:
    print('TMDB演职员数据文件不存在')

# 加载并查看pandas电影数据
pandas_movies_path = os.path.join(raw_dir, 'movies.csv')
if os.path.exists(pandas_movies_path):
    pandas_movies = pd.read_csv(pandas_movies_path)
    print(f'\npandas电影数据形状: {pandas_movies.shape}')
    print('\npandas电影数据示例:')
    display(pandas_movies.head())
else:
    print('pandas电影数据文件不存在')

TMDB电影数据形状: (4803, 20)

TMDB电影数据示例:


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124



TMDB演职员数据形状: (4803, 4)

TMDB演职员数据示例:


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


pandas电影数据文件不存在


## 5. 结论

In [6]:
# 总结数据获取结果
print('数据获取操作完成！')
print('\n下一步操作:')
print('1. 运行02_data_preprocessing.ipynb进行数据预处理')
print('2. 或直接执行src/data_preprocessing.py脚本')

数据获取操作完成！

下一步操作:
1. 运行02_data_preprocessing.ipynb进行数据预处理
2. 或直接执行src/data_preprocessing.py脚本
