# MentalChat16K - Data Exploration
## CMPE 255 - Data Mining Assignment

This notebook explores the MentalChat16K dataset for conversational mental health assistance.

In [None]:
# Install required packages
!pip install datasets pandas matplotlib seaborn wordcloud

In [None]:
# Import libraries
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from wordcloud import WordCloud
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

## 1. Load the Dataset

In [None]:
# Fetch the MentalChat16K dataset from HuggingFace and show what was loaded
dataset = load_dataset(path="ShenLab/MentalChat16K")
print(f"Dataset loaded: {dataset}")

In [None]:
# Convert the 'train' split to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; it avoids having pandas
# iterate over the split one row (dict) at a time.
df = dataset['train'].to_pandas()
print(f"Shape: {df.shape}")
df.head()

## 2. Basic Statistics

In [None]:
# Column overview: the column names, then the dtype pandas holds for each one
print("Columns:", list(df.columns))
print("\nData Types:", df.dtypes, sep="\n")

In [None]:
# Calculate word counts (whitespace-separated tokens) for questions and responses.
# Missing values are replaced with an empty string first so they count as 0 words,
# rather than being stringified (e.g. to "None") and counted as one word.
df['input_words'] = df['input'].fillna('').astype(str).str.split().str.len()
df['output_words'] = df['output'].fillna('').astype(str).str.split().str.len()

print("Input (Question) Statistics:")
print(df['input_words'].describe())
print("\nOutput (Response) Statistics:")
print(df['output_words'].describe())

## 3. Visualizations

In [None]:
# Dataset Composition
# NOTE: the per-source pair counts below are hard-coded; they are not computed from `df`.
source_names = ['Synthetic Data', 'Interview Data']
sizes = [9775, 6338]
colors = ['#3498db', '#2ecc71']

# Labels and the title total are built from `sizes`, so the text on the figure
# cannot drift from the values that are actually plotted.
labels = [f'{name}\n({count:,} pairs)' for name, count in zip(source_names, sizes)]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
       shadow=True, startangle=90, textprops={'fontsize': 12})
ax.set_title(f'MentalChat16K Dataset Composition\n(Total: {sum(sizes):,} QA Pairs)',
             fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories, so make sure ../images exists first
output_path = Path('../images/dataset_composition.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Word length distributions
# One histogram panel per word-count column: (column, bar colour, panel title)
panels = [
    ('input_words', 'steelblue', 'Distribution of Input (Question) Lengths'),
    ('output_words', 'forestgreen', 'Distribution of Output (Response) Lengths'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(14, 5))

for panel_ax, (column, bar_color, panel_title) in zip(axes, panels):
    mean_words = df[column].mean()
    panel_ax.hist(df[column], bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of Words')
    panel_ax.set_ylabel('Frequency')
    # Dashed red line marks the mean word count of this column
    panel_ax.axvline(mean_words, color='red', linestyle='--',
                     label=f'Mean: {mean_words:.1f}')
    panel_ax.legend()

plt.tight_layout()

# savefig does not create missing directories, so make sure ../images exists first
output_path = Path('../images/length_distributions.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Word Cloud for Questions
# All questions are joined into a single string; WordCloud does its own tokenising.
text = ' '.join(df['input'].astype(str).tolist())

# random_state fixes the otherwise random word placement and colours, so the
# saved image is the same on every Restart & Run All.
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      colormap='viridis', max_words=100,
                      random_state=42).generate(text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud - Mental Health Questions', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories, so make sure ../images exists first
output_path = Path('../images/wordcloud_questions.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Sample Conversations

In [None]:
# Display sample conversations
NUM_SAMPLES = 3       # how many leading rows to print
PREVIEW_CHARS = 500   # maximum characters shown per question / response


def preview_text(value, limit=PREVIEW_CHARS):
    """Return `value` as text, shortened to `limit` characters.

    '...' is appended only when the text was actually cut, so the marker never
    suggests truncation that did not happen. Missing values (None or float NaN)
    yield an empty string instead of raising on the slice.
    """
    # `value != value` is True only for NaN
    if value is None or (isinstance(value, float) and value != value):
        return ''
    text = str(value)
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


# min() keeps the loop in range when the frame holds fewer than NUM_SAMPLES rows
for i in range(min(NUM_SAMPLES, len(df))):
    row = df.iloc[i]
    print(f"\n{'='*60}")
    print(f"SAMPLE {i+1}")
    print(f"{'='*60}")
    print(f"\nQuestion: {preview_text(row['input'])}")
    print(f"\nResponse: {preview_text(row['output'])}")

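## 5. Evaluation Metrics Visualization

**Note:** the scores plotted in this section are illustrative placeholder values, not measured results.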

In [None]:
# 7 Therapeutic Evaluation Metrics (illustrative data)
# The scores below are hard-coded placeholder values, not computed from `df`.
metrics = [
    'Active Listening',
    'Empathy & Validation',
    'Safety & Trustworthiness',
    'Open-mindedness',
    'Clarity & Encouragement',
    'Boundaries & Ethics',
    'Holistic Approach'
]

base_scores = [6.2, 6.5, 6.0, 6.3, 6.4, 6.1, 6.2]
finetuned_scores = [8.1, 8.3, 7.9, 8.0, 8.2, 7.8, 8.0]

x = range(len(metrics))
width = 0.35  # bar width; each pair is offset by half a width around its tick

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar([i - width/2 for i in x], base_scores, width,
               label='Base Model', color='#e74c3c', alpha=0.8)
bars2 = ax.bar([i + width/2 for i in x], finetuned_scores, width,
               label='Fine-tuned Model', color='#27ae60', alpha=0.8)

ax.set_ylabel('Score (1-10)', fontsize=12)
# The metric count in the title is taken from the list so the two cannot disagree
ax.set_title(f'Model Performance Across {len(metrics)} Therapeutic Metrics',
             fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics, rotation=45, ha='right', fontsize=10)
ax.legend()
ax.set_ylim(0, 10)

plt.tight_layout()

# savefig does not create missing directories, so make sure ../images exists first
output_path = Path('../images/evaluation_metrics.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 6. Summary

### Key Findings:
1. **Dataset Size**: 16,113 QA pairs total
2. **Composition**: 60.7% synthetic, 39.3% real interview data
3. **Coverage**: 33 mental health topics
4. **Quality**: Evaluated across 7 therapeutic metrics

### Significance for Data Mining:
- Novel approach to combining real and synthetic data
- Privacy-preserving data collection pipeline
- Multi-evaluator benchmarking framework