# In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
df = pd.read_csv('E:/MBTI 500.csv/MBTI 500.csv')  # Replace with your CSV file path

# Check the original class distribution
class_counts = Counter(df['type'])
print("Original class distribution:", class_counts)

# Resample row labels instead of the text itself: the samplers only need
# *some* feature matrix, and carrying the index through lets us look the
# original rows up afterwards (no synthetic samples are generated).
df['index'] = df.index
X = df[['index']]  # Use index as placeholder feature
y = df['type']     # Labels

# Define sampling strategy: every class ends up with exactly target_count rows
target_count = 1000
sampling_strategy = {label: target_count for label in y.unique()}

# Split the target into the part each sampler is responsible for.
# Over-sampling everything up to the majority size first (the 'auto'
# behaviour) would blow the data up to n_classes * majority rows and make the
# later under-sampling draw from a pool full of duplicates, so classes that
# already have enough unique rows would still end up with repeated posts.
over_strategy = {
    label: count
    for label, count in sampling_strategy.items()
    if class_counts[label] < count
}
under_strategy = {
    label: count
    for label, count in sampling_strategy.items()
    if class_counts[label] > count
}

# Create the resampling steps: grow the small classes, shrink the large ones
over = RandomOverSampler(sampling_strategy=over_strategy, random_state=42)
under = RandomUnderSampler(sampling_strategy=under_strategy, random_state=42)
steps = []
if over_strategy:
    steps.append(('over', over))
if under_strategy:
    steps.append(('under', under))

# Apply resampling (nothing to do when every class already has target_count rows)
if steps:
    pipeline = Pipeline(steps=steps)
    X_resampled, y_resampled = pipeline.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y

# Check resampled class distribution
print("Resampled class distribution:", Counter(y_resampled))

# Get original indices from X_resampled
resampled_indices = X_resampled['index']

# Use original indices to extract 'posts' and 'type' from df
resampled_df = df.loc[resampled_indices, ['posts', 'type']].reset_index(drop=True)

# Save the result to a new CSV file
resampled_df.to_csv('balanced_dataset2.csv', index=False)
print("Balanced dataset saved as 'balanced_dataset2.csv'")

# Tally how many rows each MBTI type has in the resampled data
mbti_counts = resampled_df['type'].value_counts()

# Show the tally as text
print(mbti_counts)

# Draw the same tally as a bar chart, then label it through the Axes object
# that the plot call hands back
plt.figure(figsize=(10, 6))
ax = mbti_counts.plot(kind='bar')
ax.set_title('MBTI Posts Count')
ax.set_xlabel('MBTI Type')
ax.set_ylabel('Number of Posts')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Load the balanced dataset
df = pd.read_csv('balanced_dataset2.csv')
# Drop NaN values
df = df.dropna(subset=['type'])

# The four MBTI dimensions, in the order their letters appear in a type code
categories = ['I/E', 'N/S', 'T/F', 'J/P']
letters = ['I', 'E', 'N', 'S', 'T', 'F', 'J', 'P']

# Keep only well-formed four-letter codes; anything else would make the
# per-position lookup below meaningless. Report how many rows were dropped
# rather than discarding them silently.
mbti_types = df['type'].astype(str)
is_valid = mbti_types.str.fullmatch(r'[IE][NS][TF][JP]')
skipped = int((~is_valid).sum())
if skipped:
    print("Skipped rows with malformed MBTI type:", skipped)
mbti_types = mbti_types[is_valid]

# Count for each MBTI category dimension: the letter at position i belongs to
# categories[i]. Every dimension keeps an entry for all eight letters (zero
# where the letter cannot occur) so the resulting table has a fixed shape.
counts = {
    category: (
        mbti_types.str[position]
        .value_counts()
        .reindex(letters, fill_value=0)
        .to_dict()
    )
    for position, category in enumerate(categories)
}

# Convert count dictionary to DataFrame for plotting
df_counts = pd.DataFrame(counts)

# Plot category-wise bar chart. Each letter has a non-zero count in exactly
# one dimension, so stacking draws one full-width bar per letter instead of
# reserving three empty slots next to it.
df_counts.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('MBTI Category Counts')
plt.xlabel('Category')
plt.ylabel('Number of Posts')
plt.xticks(rotation=0)  # Do not rotate x-axis labels
plt.show()