In [8]:
import pandas as pd
import numpy as np

# Read the training table, then discard the 'Unnamed: 133' column when it is
# present (errors='ignore' makes the drop a no-op if the column is missing).
data = pd.read_csv('Training.csv')
data = data.drop(columns=['Unnamed: 133'], errors='ignore')

# Preview the first rows to get a feel for the column layout.
print("Original Data:", data.head(), sep="\n")

Original Data:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0        

In [11]:
# Augment the dataset so that every disease category has at least
# `target_count` rows. Extra rows are drawn (with replacement) from the
# category's own rows and then perturbed by flipping a small fraction of the
# binary values, column by column.

# Tunable parameters
target_count = 1000   # desired number of rows per disease category
flip_rate = 0.1       # per-value probability of flipping a binary cell in a synthetic row
random_seed = 42      # seeds both the row sampling and the flip masks

# A dedicated, seeded generator makes the flips reproducible on
# Restart & Run All (the global np.random state is left untouched).
rng = np.random.default_rng(random_seed)

# Get a list of unique diseases
unique_diseases = data['prognosis'].unique()

# Collect one frame per disease and concatenate once at the end: this avoids
# re-copying all accumulated rows on every iteration and avoids concatenating
# onto an empty placeholder frame.
augmented_parts = []

for disease in unique_diseases:
    # Rows belonging to the current disease
    disease_data = data[data['prognosis'] == disease]
    current_count = len(disease_data)

    # How many synthetic rows are required to reach the target
    additional_rows_needed = target_count - current_count

    if additional_rows_needed > 0:
        # Sampling with replacement yields duplicate index labels; reset the
        # index so the sampled frame has a clean positional index.
        sampled_data = disease_data.sample(
            n=additional_rows_needed, replace=True, random_state=random_seed
        ).reset_index(drop=True)

        for column in sampled_data.columns:
            # Only columns that contain BOTH 0 and 1 within this sample are
            # perturbed; columns that are constant for this disease (and the
            # non-binary 'prognosis' label) are left as they are.
            if set(sampled_data[column].unique()) == {0, 1}:
                flip_mask = rng.random(len(sampled_data)) < flip_rate
                values = sampled_data[column].to_numpy()
                # Positional flip: no label alignment involved.
                sampled_data[column] = np.where(flip_mask, 1 - values, values)

        # Original rows first, followed by the perturbed samples
        disease_augmented = pd.concat([disease_data, sampled_data], ignore_index=True)
    else:
        # Category already meets the target: keep its rows unchanged
        disease_augmented = disease_data

    augmented_parts.append(disease_augmented)

if augmented_parts:
    augmented_data = pd.concat(augmented_parts, ignore_index=True)
else:
    # No diseases at all (empty input): keep an empty frame with the same columns
    augmented_data = pd.DataFrame(columns=data.columns)

# Save the augmented dataset to a new CSV file
output_path = 'Augmented_Data.csv'
augmented_data.to_csv(output_path, index=False)

print(f"Data augmentation complete. Augmented dataset saved as {output_path}")

Data augmentation complete. Augmented dataset saved as Augmented_Data.csv
