# 划分训练集和测试集

同济子豪兄 https://space.bilibili.com/1900783

代码运行[云GPU平台](https://featurize.cn/?s=d7ce99f842414bfcaea5662a97581bd1)

2022-7-22

## 导入工具包

In [1]:
import os
import shutil
import random
import pandas as pd

## 获得所有类别名称

In [2]:
# 指定数据集路径
dataset_path = 'D:\dataset\OR'

In [3]:
dataset_name = dataset_path.split('_')[0]
print('数据集', dataset_name)

数据集 D:\dataset\OR


In [4]:
classes = os.listdir(dataset_path)

In [5]:
len(classes)

2

In [6]:
classes

['Parasitized', 'Uninfected']

## 创建训练集文件夹和测试集文件夹

In [14]:
# 创建 train 文件夹
os.mkdir(os.path.join(dataset_path, 'train'))

# 创建 test 文件夹
os.mkdir(os.path.join(dataset_path, 'val'))

# 在 train 和 test 文件夹中创建各类别子文件夹
for fruit in classes:
    os.mkdir(os.path.join(dataset_path, 'train', fruit))
    os.mkdir(os.path.join(dataset_path, 'val', fruit))

## 划分训练集、测试集，移动文件

In [15]:
test_frac = 0.2  # 测试集比例
random.seed(123) # 随机数种子，便于复现

In [9]:
df = pd.DataFrame()

print('{:^18} {:^18} {:^18}'.format('类别', '训练集数据个数', '测试集数据个数'))

for fruit in classes: # 遍历每个类别

    # 读取该类别的所有图像文件名
    old_dir = os.path.join(dataset_path, fruit)
    images_filename = os.listdir(old_dir)
    random.shuffle(images_filename) # 随机打乱

    # 划分训练集和测试集
    testset_numer = int(len(images_filename) * test_frac) # 测试集图像个数
    testset_images = images_filename[:testset_numer]      # 获取拟移动至 test 目录的测试集图像文件名
    trainset_images = images_filename[testset_numer:]     # 获取拟移动至 train 目录的训练集图像文件名

    # 移动图像至 test 目录
    for image in testset_images:
        old_img_path = os.path.join(dataset_path, fruit, image)         # 获取原始文件路径
        new_test_path = os.path.join(dataset_path, 'val', fruit, image) # 获取 test 目录的新文件路径
        shutil.move(old_img_path, new_test_path) # 移动文件

    # 移动图像至 train 目录
    for image in trainset_images:
        old_img_path = os.path.join(dataset_path, fruit, image)           # 获取原始文件路径
        new_train_path = os.path.join(dataset_path, 'train', fruit, image) # 获取 train 目录的新文件路径
        shutil.move(old_img_path, new_train_path) # 移动文件
    
    # 删除旧文件夹
    assert len(os.listdir(old_dir)) == 0 # 确保旧文件夹中的所有图像都被移动走
    shutil.rmtree(old_dir) # 删除文件夹
    
    # 工整地输出每一类别的数据个数
    print('{:^18} {:^18} {:^18}'.format(fruit, len(trainset_images), len(testset_images)))
    
    # 保存到表格中
    df = df.append({'class':fruit, 'trainset':len(trainset_images), 'testset':len(testset_images)}, ignore_index=True)

# 重命名数据集文件夹
shutil.move(dataset_path, dataset_name+'_split')

# 数据集各类别数量统计表格，导出为 csv 文件
df['total'] = df['trainset'] + df['testset']
df.to_csv('数据量统计.csv', index=False)

        类别              训练集数据个数            测试集数据个数      
   Parasitized           11024               2755       


AttributeError: 'DataFrame' object has no attribute 'append'

In [17]:
import os
import shutil
import random
import pandas as pd

# 初始设置
dataset_path = r'D:\dataset\OR'  # 替换为数据集路径
dataset_name = 'or2'  # 替换为数据集名称
test_frac = 0.2  # 测试集所占比例
classes = ['Parasitized', 'Uninfected']  # 替换为实际的类别名称列表

# 用于存储每个类别的统计数据
data_list = []

print('{:^18} {:^18} {:^18}'.format('类别', '训练集数据个数', '测试集数据个数'))

for fruit in classes:
    old_dir = os.path.join(dataset_path, fruit)
    images_filename = os.listdir(old_dir)
    random.shuffle(images_filename)

    testset_number = int(len(images_filename) * test_frac)
    testset_images = images_filename[:testset_number]
    trainset_images = images_filename[testset_number:]

    for image in testset_images:
        old_img_path = os.path.join(old_dir, image)
        new_test_path = os.path.join(dataset_path, 'val', fruit, image)
        os.makedirs(os.path.dirname(new_test_path), exist_ok=True)
        shutil.copy(old_img_path, new_test_path)

    for image in trainset_images:
        old_img_path = os.path.join(old_dir, image)
        new_train_path = os.path.join(dataset_path, 'train', fruit, image)
        os.makedirs(os.path.dirname(new_train_path), exist_ok=True)
        shutil.copy(old_img_path, new_train_path)

    print('{:^18} {:^18} {:^18}'.format(fruit, len(trainset_images), len(testset_images)))

    data_list.append({'class': fruit, 'trainset': len(trainset_images), 'testset': len(testset_images)})

# 创建 DataFrame
df = pd.concat([pd.DataFrame([data]) for data in data_list], ignore_index=True)

# 重命名数据集文件夹
new_dataset_path = dataset_name + '_split'
shutil.move(dataset_path, new_dataset_path)

# 添加总数列和保存 CSV
df['total'] = df['trainset'] + df['testset']
df.to_csv('数据量统计.csv', index=False)



        类别              训练集数据个数            测试集数据个数      
   Parasitized           11024               2755       
    Uninfected           11024               2755       


KeyboardInterrupt: 

In [20]:
df

Unnamed: 0,class,trainset,testset
0,parasitized,11023,2756
1,uninfected,11023,2756


## 查看文件目录结构

In [16]:
!sudo snap install tree

In [27]:
!tree fruit81_split -L 2

[01;34mfruit81_split[00m
├── [01;34mtrain[00m
│   ├── [01;34m人参果[00m
│   ├── [01;34m佛手瓜[00m
│   ├── [01;34m哈密瓜[00m
│   ├── [01;34m圣女果[00m
│   ├── [01;34m山楂[00m
│   ├── [01;34m山竹[00m
│   ├── [01;34m无花果[00m
│   ├── [01;34m木瓜[00m
│   ├── [01;34m李子[00m
│   ├── [01;34m杏[00m
│   ├── [01;34m杨桃[00m
│   ├── [01;34m杨梅[00m
│   ├── [01;34m枇杷[00m
│   ├── [01;34m枣[00m
│   ├── [01;34m柚子[00m
│   ├── [01;34m柠檬[00m
│   ├── [01;34m柿子[00m
│   ├── [01;34m树莓[00m
│   ├── [01;34m桂圆[00m
│   ├── [01;34m桑葚[00m
│   ├── [01;34m梨[00m
│   ├── [01;34m椰子[00m
│   ├── [01;34m榴莲[00m
│   ├── [01;34m樱桃[00m
│   ├── [01;34m橘子[00m
│   ├── [01;34m毛丹[00m
│   ├── [01;34m水蜜桃[00m
│   ├── [01;34m沃柑[00m
│   ├── [01;34m沙果[00m
│   ├── [01;34m沙棘[00m
│   ├── [01;34m油桃[00m
│   ├── [01;34m牛油果[00m
│   ├── [01;34m猕猴桃[00m
│   ├── [01;34m甘蔗[00m
│   ├── [01;34m甜瓜-伊丽莎白[00m
│   ├── [01;34m甜瓜-白[00m
│   ├── [01;34m甜瓜-绿[00m
│   ├── [01;34m甜瓜-金[00m
│   ├── [01;34m番石榴-百