## Code to Determine and Correct the Class-Imbalance Problem through Bootstrapping

In [1]:
import csv

# Path to the CSV file and the header name of the column holding the class label
file_path = 'tab_data.csv'
label_column = 'clusterid - Prediction/OutPut'

# Collects every distinct class label found in the label column
unique_classes = set()

# newline='' is what the csv module expects from a file object, so that
# line breaks embedded inside quoted fields are parsed correctly.
with open(file_path, mode='r', newline='') as file:
    # DictReader keys each row by the header names, so the label is looked up
    # by column name rather than by position.
    csv_reader = csv.DictReader(file)

    for row in csv_reader:
        unique_classes.add(row[label_column])

# Print the number of unique classes
print(f'Number of unique classes: {len(unique_classes)}')

# Print the unique class labels (labels only; this cell does not count rows).
# A set has no defined iteration order, so the labels are sorted to keep the
# listing identical from one kernel run to the next.
for unique_class in sorted(unique_classes):
    print(unique_class)

Number of unique classes: 6
4
1
2
3
6
5


In [2]:
import csv
from collections import Counter

# Path to the CSV file and the header name of the column holding the class label
file_path = 'tab_data.csv'
label_column = 'clusterid - Prediction/OutPut'

# Tally of how many rows carry each class label
class_counts = Counter()

# newline='' is what the csv module expects from a file object, so that
# line breaks embedded inside quoted fields are parsed correctly.
with open(file_path, mode='r', newline='') as file:
    # DictReader consumes the header row itself and skips completely blank
    # lines, so the label is read by column name instead of assuming it is the
    # first field, and a stray empty line no longer raises IndexError.
    csv_reader = csv.DictReader(file)

    for row in csv_reader:
        class_counts[row[label_column]] += 1

# Print the unique classes and their counts (in order of first appearance)
for class_id, count in class_counts.items():
    print(f'Class {class_id}: {count}')

Class 5: 96
Class 3: 1133
Class 6: 1036
Class 4: 130
Class 1: 32
Class 2: 49


In [3]:
import csv
import random
from collections import defaultdict

# Path to the original and new CSV files
original_file_path = 'tab_data.csv'
new_file_path = 'Modified_tab_data.csv'

# Header name of the column holding the class label
label_column = 'clusterid - Prediction/OutPut'

# Number of rows every class should have after resampling
target_count = 1200

# Fixed seed and a dedicated generator so that re-running the notebook from a
# fresh kernel writes the same resampled file every time.
random_seed = 42
rng = random.Random(random_seed)

# Read the original CSV file. newline='' is what the csv module expects from a
# file object, so that line breaks inside quoted fields are parsed correctly.
with open(original_file_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    header = next(csv_reader)  # Capture the header
    # csv.reader yields an empty list for a blank line; drop those so the
    # label lookup below cannot fail on them.
    data = [row for row in csv_reader if row]

# Locate the label column by name instead of assuming it is the first column
label_index = header.index(label_column)

# Separate data by class
data_by_class = defaultdict(list)
for row in data:
    data_by_class[row[label_index]].append(row)

# Resample so that each class occurs exactly target_count times:
#   - a class with at least target_count rows is down-sampled without replacement
#   - a smaller class keeps all of its rows and is topped up with rows drawn
#     with replacement from that same class
# NOTE(review): the topped-up classes contain exact duplicate rows; if this file
# is later split into train/test sets, duplicates can land on both sides.
# Confirm how the downstream split is done.
bootstrapped_data = []
for class_label, rows in data_by_class.items():
    if len(rows) >= target_count:
        bootstrapped_data.extend(rng.sample(rows, target_count))
    else:
        bootstrapped_data.extend(rows)
        additional_rows_needed = target_count - len(rows)
        bootstrapped_data.extend(rng.choices(rows, k=additional_rows_needed))

# Shuffle the bootstrapped data to mix the classes well
rng.shuffle(bootstrapped_data)

# Write the bootstrapped data to a new CSV file
with open(new_file_path, mode='w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(header)  # Write the header first
    csv_writer.writerows(bootstrapped_data)

# Report the configured target rather than a hard-coded number
print(f'New file "{new_file_path}" created with each class occurring {target_count} times.')

New file "Modified_tab_data.csv" created with each class occurring 1200 times.


In [4]:
import csv
from collections import Counter

# Path to the resampled CSV file and the header name of the class-label column
file_path = 'Modified_tab_data.csv'
label_column = 'clusterid - Prediction/OutPut'

# Tally of how many rows carry each class label
class_counts = Counter()

# newline='' is what the csv module expects from a file object, so that
# line breaks embedded inside quoted fields are parsed correctly.
with open(file_path, mode='r', newline='') as file:
    # DictReader consumes the header row itself and looks the label up by
    # column name instead of assuming it is the first field.
    csv_reader = csv.DictReader(file)

    for row in csv_reader:
        class_counts[row[label_column]] += 1

# Print the unique classes and their counts (in order of first appearance)
for class_id, count in class_counts.items():
    print(f'Class {class_id}: {count}')

# This cell exists to verify the resampling, so fail loudly if the classes are
# not balanced instead of relying on someone reading the printout.
assert len(set(class_counts.values())) <= 1, (
    f'Classes are not balanced: {dict(class_counts)}'
)

Class 1: 1200
Class 6: 1200
Class 5: 1200
Class 2: 1200
Class 4: 1200
Class 3: 1200
