<a href="https://colab.research.google.com/github/41371125h-chinrouzhen/DSCP-Final-Project/blob/main/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np

try:
    # 1. Load Dataset
    df = pd.read_csv('mbti_1.csv')

    # 2. Feature Engineering (Text Analysis)
    # Calculate word count (Activity Level)
    df['word_count'] = df['posts'].apply(lambda x: len(x.split(' ')))

    # Calculate punctuation usage (Emotion Indicators)
    df['exclaim_count'] = df['posts'].apply(lambda x: x.count('!'))
    df['question_count'] = df['posts'].apply(lambda x: x.count('?'))

    # Calculate link usage (Sharing behavior)
    df['link_count'] = df['posts'].apply(lambda x: x.count('http'))

    # 3. Dimensionality Reduction
    # Create binary categories for analysis
    df['E/I'] = df['type'].apply(lambda x: 'Extraversion (E)' if 'E' in x else 'Introversion (I)')
    df['T/F'] = df['type'].apply(lambda x: 'Thinking (T)' if 'T' in x else 'Feeling (F)')

    print("✅ Data Loaded and Processed Successfully.")

except FileNotFoundError:
    print("❌ Error: 'mbti_1.csv' not found. Please upload the file.")

✅ Data Loaded and Processed Successfully.


In [24]:
# ==========================================
# DATASET OVERVIEW & STATISTICS
# ==========================================

# 1. Dataset Shape
print("="*60)
print(f"【DATASET OVERVIEW】")
print(f"Total Users: {df.shape[0]}")
print(f"Total Features: {df.shape[1]}")
print(f"Columns: {list(df.columns)}")
print("="*60)

# 2. Population Stats
print(f"【POPULATION DISTRIBUTION】")
print(df['type'].value_counts().to_string())
print("-" * 30)
print(f"Most Common: {df['type'].value_counts().idxmax()} ({df['type'].value_counts().max()} users)")
print(f"Least Common: {df['type'].value_counts().idxmin()} ({df['type'].value_counts().min()} users)")

# 3. Behavioral Stats
print("="*60)
print(f"【ACTIVITY & BEHAVIOR STATISTICS】")
stats_df = df[['word_count', 'link_count', 'exclaim_count', 'question_count']]
stats_df.columns = ['Word Count', 'Link Count', 'Exclamation (!)', 'Question (?)']
print(stats_df.describe().round(1))
print("="*60)

【DATASET OVERVIEW】
Total Users: 8675
Total Features: 8
Columns: ['type', 'posts', 'word_count', 'exclaim_count', 'question_count', 'link_count', 'E/I', 'T/F']
【POPULATION DISTRIBUTION】
type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
------------------------------
Most Common: INFP (1832 users)
Least Common: ESTJ (39 users)
【ACTIVITY & BEHAVIOR STATISTICS】
       Word Count  Link Count  Exclamation (!)  Question (?)
count      8675.0      8675.0           8675.0        8675.0
mean       1262.7         3.3              8.5          10.7
std         317.3         5.8             11.6           7.0
min           4.0         0.0              0.0           0.0
25%        1081.0         0.0              2.0           6.0
50%        1314.0         1.0              5.0          10.0
75%        1497.0         4.0             11.0    

In [25]:
import plotly.express as px

# ==========================================
# VISUALIZATION GENERATION
# ==========================================

# --- Chart 1: Ecological Distribution (Population) ---
pop_data = df['type'].value_counts().sort_values(ascending=True)

fig1 = px.bar(x=pop_data.values, y=pop_data.index, orientation='h',
              title="Fig 1: Ecological Distribution (Introverts vs Extroverts)",
              labels={'x': 'User Count', 'y': 'Personality Type'},
              template="plotly_white")
fig1.update_traces(marker_color='#2A9D8F') # Teal color
fig1.show()

# --- Chart 2: Activity Level (E vs I) ---
fig2 = px.box(df, x='E/I', y='word_count',
              title="Fig 2: Activity Level (Word Count Distribution)",
              labels={'word_count': 'Total Words', 'E/I': 'Personality Dimension'},
              color='E/I',
              template="plotly_white",
              color_discrete_map={'Introversion (I)': '#264653', 'Extraversion (E)': '#E9C46A'})
fig2.show()

# --- Chart 3: Logic vs Emotion (Punctuation) ---
# Prepare data for grouped bar chart
tf_stats = df.groupby('T/F')[['exclaim_count', 'question_count']].mean().reset_index()
tf_melted = tf_stats.melt(id_vars='T/F', var_name='Symbol', value_name='Count')

# Rename symbols for better legend readability
tf_melted['Symbol'] = tf_melted['Symbol'].replace({
    'exclaim_count': 'Exclamation (!)',
    'question_count': 'Question (?)'
})

fig3 = px.bar(tf_melted, x='T/F', y='Count', color='Symbol', barmode='group',
              title="Fig 3: Logic vs Emotion (Punctuation Usage)",
              template="plotly_white",
              color_discrete_map={'Exclamation (!)': '#E76F51', 'Question (?)': '#2A9D8F'})
fig3.show()