In [2]:
import os
import shutil

def create_directory_if_not_exists(directory):
    """Create ``directory`` (and any missing parents) unless it already exists.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True lets os.makedirs do the existence check itself, so there is
    # no window between "check" and "create" in which another process could
    # create the directory and make this call fail.
    os.makedirs(directory, exist_ok=True)

def move_files(src_folder, dst_folder):
    """Copy files from per-rating folders into three coarse category folders.

    Sub-folders of ``src_folder`` named '1'..'9' are regrouped under
    ``dst_folder`` as ``less_than_5`` ('1'-'4'), ``exactly_5`` ('5') and
    ``more_than_5`` ('6'-'9'). Entries with any other name, and entries that
    are not directories, are ignored. Only regular files directly inside a
    rating folder are copied; nested directories are skipped.

    Despite the name, files are *copied* (``shutil.copy``), so the source tree
    is left untouched. Files are copied under their original base name, so two
    files with the same name that land in the same category overwrite each
    other.

    Args:
        src_folder: Directory containing the rating sub-folders.
        dst_folder: Directory under which the category folders are created.
    """
    # Define the new categories
    categories = {
        'less_than_5': {'1', '2', '3', '4'},
        'exactly_5': {'5'},
        'more_than_5': {'6', '7', '8', '9'}
    }

    # Invert the mapping once (rating folder name -> category) so the table
    # above is the single source of truth for which folders are processed.
    category_of = {
        rating: category
        for category, ratings in categories.items()
        for rating in ratings
    }

    # All three category folders are created up front, even if one stays empty.
    for category in categories:
        os.makedirs(os.path.join(dst_folder, category), exist_ok=True)

    for folder in os.listdir(src_folder):
        category = category_of.get(folder)
        folder_path = os.path.join(src_folder, folder)
        if category is None or not os.path.isdir(folder_path):
            continue

        target_folder = os.path.join(dst_folder, category)
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):
                shutil.copy(file_path, target_folder)

def process_dataset(src_folder, dst_folder):
    """Regroup one dataset split from ``src_folder`` into ``dst_folder``.

    Validates the source first, then creates ``dst_folder`` and delegates the
    copying to ``move_files``.

    Args:
        src_folder: Directory containing the per-rating sub-folders.
        dst_folder: Directory that receives the category folders.

    Raises:
        FileNotFoundError: If ``src_folder`` does not exist.
        NotADirectoryError: If ``src_folder`` exists but is not a directory.
    """
    # Check the source before touching the filesystem so a mistyped path does
    # not leave an empty destination directory behind. The exception types
    # match what os.listdir() would raise on the same input.
    if not os.path.isdir(src_folder):
        if os.path.exists(src_folder):
            raise NotADirectoryError(f"Source is not a directory: {src_folder!r}")
        raise FileNotFoundError(f"Source folder not found: {src_folder!r}")

    create_directory_if_not_exists(dst_folder)
    move_files(src_folder, dst_folder)

# Source folders (one sub-folder per rating) and the destinations that receive
# the regrouped copies.
training_data_src = 'training_data/ratings_female'
testing_data_src = 'testing_data/ratings_female'
training_data_dst = 'training_data/ratings_female_simplified'
testing_data_dst = 'testing_data/ratings_female_simplified'

# Regroup each split in turn: training first, then testing.
for dataset_src, dataset_dst in (
    (training_data_src, training_data_dst),
    (testing_data_src, testing_data_dst),
):
    process_dataset(dataset_src, dataset_dst)

print("Data processing complete.")


Data processing complete.


In [1]:
import os
import random
import shutil

# Input directories: the regrouped ("simplified") training and testing sets.
# These are the same locations the regrouping cell writes to, here spelled
# with a trailing slash.
training_data_path = "training_data/ratings_female_simplified/"
testing_data_path = "testing_data/ratings_female_simplified/"

# Output directories for the randomly subsampled copies of each split.
# NOTE(review): the "_80p" suffix presumably refers to the default 0.8
# retention fraction of create_reduced_dataset; rename if that fraction changes.
new_training_data_path = "training_data/ratings_female_simplified_80p/"
new_testing_data_path = "testing_data/ratings_female_simplified_80p/"

# Function to create a randomly subsampled copy of a categorised dataset
def create_reduced_dataset(source_path, destination_path, reduction_percentage=0.8, seed=None):
    """Copy a random fraction of each category's files into a new directory tree.

    Every sub-directory of ``source_path`` is treated as a category. For each
    one, a same-named directory is created under ``destination_path`` and
    ``int(n_files * reduction_percentage)`` randomly chosen files are copied
    into it with ``shutil.copy2``. Non-directory entries in ``source_path``
    and sub-directories inside a category are skipped.

    Existing files in the destination are never removed, so repeated unseeded
    runs into the same destination accumulate different samples. Pass ``seed``
    (or use a fresh destination) to make re-runs select the same files.

    Args:
        source_path: Directory containing one sub-directory per category.
        destination_path: Directory that receives the reduced copy.
        reduction_percentage: Fraction of files to *retain* per category,
            between 0 and 1 inclusive. Defaults to 0.8.
        seed: Optional seed for a private random generator. When ``None``
            (the default) the global ``random`` module state is used, as
            before.

    Raises:
        ValueError: If ``reduction_percentage`` is outside ``[0, 1]``.
    """
    # Fail before creating any directory; random.sample would otherwise raise
    # the same exception type only after the destination tree was started.
    if not 0 <= reduction_percentage <= 1:
        raise ValueError(
            f"reduction_percentage must be between 0 and 1, got {reduction_percentage!r}"
        )

    # A private generator keeps a seeded run from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    # Ensure the destination directory exists
    os.makedirs(destination_path, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes a seeded run reproducible.
    for category in sorted(os.listdir(source_path)):
        category_path = os.path.join(source_path, category)
        if not os.path.isdir(category_path):
            # Stray files next to the category folders are not categories.
            continue

        new_category_path = os.path.join(destination_path, category)
        os.makedirs(new_category_path, exist_ok=True)

        # Only regular files are candidates; copy2 cannot copy directories.
        files = sorted(
            name
            for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )

        # Number of files to retain, rounded down
        num_files_to_retain = int(len(files) * reduction_percentage)

        # Copy the randomly selected files to the new directory
        for name in rng.sample(files, num_files_to_retain):
            shutil.copy2(
                os.path.join(category_path, name),
                os.path.join(new_category_path, name),
            )

# Build the reduced copy of each split: training first, then testing.
for split_source, split_destination in [
    (training_data_path, new_training_data_path),
    (testing_data_path, new_testing_data_path),
]:
    create_reduced_dataset(split_source, split_destination)

print("Data reduction complete.")


Data reduction complete.
