# Create 20% Sample Training Dataset

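This notebook creates a smaller training dataset by drawing a stratified 20% sample of the original training data (`dataset/train.csv`) and saving it to `dataset/train_20percent.csv`.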

In [1]:
import pandas as pd
import numpy as np

## Load the Original Dataset

In [2]:
# Read the full training set from disk; later cells derive everything from `df`.
df = pd.read_csv('dataset/train.csv')

# Report the dimensions and column names, then preview the first rows.
original_shape = df.shape
column_names = df.columns.tolist()
print(f"Original dataset shape: {original_shape}")
print(f"\nColumns: {column_names}")
print(f"\nFirst few rows:")
df.head()

Original dataset shape: (1913, 14)

Columns: ['participant_id', 'age_group', 'identity_code', 'cultural_background', 'upbringing_influence', 'focus_intensity', 'consistency_score', 'external_guidance_usage', 'support_environment_score', 'hobby_engagement_level', 'physical_activity_index', 'creative_expression_index', 'altruism_score', 'personality_cluster']

First few rows:


Unnamed: 0,participant_id,age_group,identity_code,cultural_background,upbringing_influence,focus_intensity,consistency_score,external_guidance_usage,support_environment_score,hobby_engagement_level,physical_activity_index,creative_expression_index,altruism_score,personality_cluster
0,643,18,1,0,4,18.118879,24,1,0,1,1,0,0,Cluster_E
1,1753,16,1,3,2,2.469271,19,0,1,0,0,0,1,Cluster_E
2,1402,17,0,2,3,3.456401,25,0,0,0,0,0,0,Cluster_E
3,2033,17,1,0,1,15.838131,25,1,2,0,0,0,0,Cluster_E
4,991,16,0,0,0,1.045373,19,1,2,0,0,1,0,Cluster_E


## Check Class Distribution

In [3]:
# Tally rows per personality cluster: raw counts first, then percentages.
original_labels = df['personality_cluster']

print("Original class distribution:")
print(original_labels.value_counts())

print(f"\nClass distribution (percentages):")
print(original_labels.value_counts(normalize=True) * 100)

Original class distribution:
personality_cluster
Cluster_E    974
Cluster_D    328
Cluster_C    306
Cluster_B    220
Cluster_A     85
Name: count, dtype: int64

Class distribution (percentages):
personality_cluster
Cluster_E    50.914794
Cluster_D    17.145844
Cluster_C    15.995818
Cluster_B    11.500261
Cluster_A     4.443283
Name: proportion, dtype: float64


## Create 20% Stratified Sample

We'll use stratified sampling — drawing 20% of the rows within each `personality_cluster` — so that the sampled dataset keeps approximately the same class distribution as the original.

In [4]:
# Sampling parameters. SEED is passed explicitly to every per-class draw, so
# the sample is reproducible regardless of the global NumPy RNG state.
SAMPLE_FRAC = 0.2
SEED = 42

# Also seed the legacy global NumPy RNG, as before; the sampling below does
# not depend on it because random_state is given explicitly.
np.random.seed(SEED)

# Create a stratified sample: draw SAMPLE_FRAC of the rows within each class.
#
# The groups are iterated and concatenated instead of using
# DataFrameGroupBy.apply. Calling apply with a function that operates on the
# grouping column is deprecated: pandas warns that a future version will
# exclude the grouping columns from the operation, which would drop
# 'personality_cluster' from the sample. Groups are visited in sorted label
# order and every draw uses the same seed, so the selected rows and their
# order are the same as with the apply-based version.
sampled_groups = [
    group.sample(frac=SAMPLE_FRAC, random_state=SEED)
    for _, group in df.groupby('personality_cluster')
]

# pd.concat raises on an empty list (no groups at all), so fall back to an
# empty frame that keeps the original columns.
df_sample = (
    pd.concat(sampled_groups) if sampled_groups else df.iloc[0:0]
).reset_index(drop=True)

print(f"Sampled dataset shape: {df_sample.shape}")
print(f"Sample represents {(len(df_sample) / len(df)) * 100:.2f}% of original data")

Sampled dataset shape: (383, 14)
Sample represents 20.02% of original data


  df_sample = df.groupby('personality_cluster', group_keys=False).apply(


## Verify Sample Distribution

In [5]:
# Repeat the per-cluster tally on the sample so it can be compared by eye
# with the distribution of the original data.
sample_labels = df_sample['personality_cluster']

print("Sampled class distribution:")
print(sample_labels.value_counts())

print(f"\nSampled class distribution (percentages):")
print(sample_labels.value_counts(normalize=True) * 100)

Sampled class distribution:
personality_cluster
Cluster_E    195
Cluster_D     66
Cluster_C     61
Cluster_B     44
Cluster_A     17
Name: count, dtype: int64

Sampled class distribution (percentages):
personality_cluster
Cluster_E    50.913838
Cluster_D    17.232376
Cluster_C    15.926893
Cluster_B    11.488251
Cluster_A     4.438642
Name: proportion, dtype: float64


## Compare Original vs Sample

In [6]:
# Side-by-side table of per-cluster counts and percentages for the original
# data and the sample. The Series are aligned on the cluster label when the
# DataFrame is built.
labels_original = df['personality_cluster']
labels_sample = df_sample['personality_cluster']

comparison = pd.DataFrame({
    'Original_Count': labels_original.value_counts(),
    'Sample_Count': labels_sample.value_counts(),
    'Original_%': labels_original.value_counts(normalize=True) * 100,
    'Sample_%': labels_sample.value_counts(normalize=True) * 100,
})

# Percentage-point gap per cluster: positive means the cluster is
# over-represented in the sample.
comparison['Difference_%'] = comparison['Sample_%'].sub(comparison['Original_%'])

print("\nComparison of Original vs Sample:")
print(comparison)


Comparison of Original vs Sample:
                     Original_Count  Sample_Count  Original_%   Sample_%  \
personality_cluster                                                        
Cluster_E                       974           195   50.914794  50.913838   
Cluster_D                       328            66   17.145844  17.232376   
Cluster_C                       306            61   15.995818  15.926893   
Cluster_B                       220            44   11.500261  11.488251   
Cluster_A                        85            17    4.443283   4.438642   

                     Difference_%  
personality_cluster                
Cluster_E               -0.000955  
Cluster_D                0.086532  
Cluster_C               -0.068925  
Cluster_B               -0.012011  
Cluster_A               -0.004641  


## Save the Sampled Dataset

In [7]:
# Write the sample to CSV without the positional index, so the file has the
# same columns as the sampled frame. `output_file` is reused by the
# verification cell.
output_file = 'dataset/train_20percent.csv'
df_sample.to_csv(path_or_buf=output_file, index=False)

sample_row_count = len(df_sample)
print(f"Sampled dataset saved to: {output_file}")
print(f"Total rows in sample: {sample_row_count}")

Sampled dataset saved to: dataset/train_20percent.csv
Total rows in sample: 383


## Summary Statistics

In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# sample; the table is the cell's displayed result.
summary_stats = df_sample.describe()
print("\nSummary statistics for the 20% sample:")
summary_stats


Summary statistics for the 20% sample:


Unnamed: 0,participant_id,age_group,identity_code,cultural_background,upbringing_influence,focus_intensity,consistency_score,external_guidance_usage,support_environment_score,hobby_engagement_level,physical_activity_index,creative_expression_index,altruism_score
count,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0,383.0
mean,1188.449086,16.425587,0.490862,0.851175,1.754569,10.003206,14.415144,0.284595,2.120104,0.417755,0.326371,0.201044,0.156658
std,683.867549,1.145934,0.50057,1.02139,0.944715,5.803829,8.569853,0.451812,1.133122,0.493834,0.469498,0.401305,0.363953
min,4.0,15.0,0.0,0.0,0.0,0.008031,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,608.5,15.0,0.0,0.0,1.0,5.239126,7.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,1199.0,16.0,0.0,0.0,2.0,9.962593,14.0,0.0,2.0,0.0,0.0,0.0,0.0
75%,1789.5,17.0,1.0,2.0,2.0,14.808663,22.0,1.0,3.0,1.0,1.0,0.0,0.0
max,2387.0,18.0,1.0,3.0,4.0,19.968425,29.0,1.0,4.0,1.0,1.0,1.0,1.0


In [9]:
# Round-trip check: reload the CSV that was just written and compare it with
# the in-memory sample.
df_verify = pd.read_csv(output_file)
roundtrip_matches = df_verify.equals(df_sample)

print(f"\nVerification - Loaded saved file shape: {df_verify.shape}")
print(f"Data integrity check: {roundtrip_matches}")


Verification - Loaded saved file shape: (383, 14)
Data integrity check: True
