<a href="https://colab.research.google.com/github/ChiroDeniro/ai-ml-projecten/blob/main/Full_Psychometric_AI_Analysis_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Neural Psychometric Embeddings & Trait Prediction Pipeline
===========================================================

16-11-2025

AI applications in psychometrics, combining:
- Transformer models (BERT, Sentence-BERT)
- Classical ML and Deep Learning
- Interpretable AI (SHAP)
- Interactive deployment (Gradio)

Perfect for demonstrating skills for AI/ML roles.

CV-Ready Skills Demonstrated:
- ü§ó Transformer Models & Transfer Learning
- üìä Advanced Data Visualization (UMAP, t-SNE, Plotly)
- üß† Deep Learning with PyTorch
- üìà Classical ML (XGBoost, Ensemble Methods)
- üîç Model Interpretability (SHAP, Attention Analysis)
- üéØ Psychometric Validation & Statistical Analysis
- üöÄ MLOps & Production Patterns
- üé® Interactive ML Demos
"""

# ============================================================================
# SECTION 0: SETUP & INSTALLATION
# ============================================================================

# Run this in Google Colab first:
"""
!pip install -q transformers==4.35.0
!pip install -q sentence-transformers==2.2.2
!pip install -q datasets==2.14.6
!pip install -q umap-learn==0.5.5
!pip install -q plotly==5.18.0
!pip install -q shap==0.43.0
!pip install -q xgboost==2.0.2
!pip install -q scikit-learn==1.3.2
!pip install -q gradio==4.7.1
!pip install -q seaborn==0.13.0
"""

# ============================================================================
# SECTION 1: IMPORTS
# ============================================================================

import numpy as np
import pandas as pd
import re
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
import warnings
import logging
from datetime import datetime
import pickle
import json

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ML & Deep Learning
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb

# Dimensionality reduction
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.manifold import TSNE

# Interpretability
import shap

# Utilities
from tqdm.auto import tqdm

# Configuration
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üñ•Ô∏è  Using device: {device}")

# ============================================================================
# SECTION 2: DATA LOADING
# ============================================================================

def download_dataset():
    """Download the Essays with Big Five personality dataset"""
    import os
    if not os.path.exists('essays.csv'):
        # Try primary source
        os.system('wget -q https://raw.githubusercontent.com/SenticNet/personality-detection/master/essays.csv -O essays.csv')

    if os.path.exists('essays.csv'):
        print("‚úÖ Dataset downloaded!")
        return True
    else:
        print("‚ùå Download failed. Please download manually from:")
        print("   https://github.com/SenticNet/personality-detection/blob/master/essays.csv")
        return False

def load_and_prepare_data():
    """Load and prepare the dataset"""
    df = pd.read_csv('essays.csv', encoding='latin-1')

    # Standardize column names
    df.columns = df.columns.str.strip().str.lower()

    # Identify columns
    text_col = 'text' if 'text' in df.columns else df.columns[0]

    # Rename trait columns for clarity
    trait_mapping = {
        'cext': 'extraversion',
        'cagr': 'agreeableness',
        'ccon': 'conscientiousness',
        'cneuro': 'neuroticism',
        'copn': 'openness'
    }

    df = df.rename(columns=trait_mapping)
    trait_cols = list(trait_mapping.values())

    # Data quality checks
    df['text_length'] = df[text_col].astype(str).apply(len)
    df['word_count'] = df[text_col].astype(str).apply(lambda x: len(x.split()))

    # Clean data
    df_clean = df[df['word_count'] >= 10].copy()
    df_clean = df_clean.dropna(subset=trait_cols)

    print(f"üìä Dataset: {len(df_clean)} samples")
    print(f"üìù Average words: {df_clean['word_count'].mean():.0f}")
    print(f"üéØ Traits: {trait_cols}")

    return df_clean, text_col, trait_cols

# ============================================================================
# SECTION 3: TEXT PREPROCESSING
# ============================================================================

class TextPreprocessor:
    """Handles all text preprocessing operations"""

    def __init__(self, lowercase: bool = True, remove_urls: bool = True):
        self.lowercase = lowercase
        self.remove_urls = remove_urls

    def clean_text(self, text: str) -> str:
        """Basic text cleaning"""
        if not isinstance(text, str):
            return ""

        # Remove URLs
        if self.remove_urls:
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Lowercase
        if self.lowercase:
            text = text.lower()

        return text.strip()

    def extract_linguistic_features(self, text: str) -> Dict[str, float]:
        """
        Extract psychological linguistic features.

        Research shows these correlate with personality:
        - Pronoun usage ‚Üí Self-reference, social orientation
        - Punctuation ‚Üí Emotional expressiveness
        - Word length ‚Üí Cognitive complexity
        """
        words = text.split()
        n_words = len(words)

        if n_words == 0:
            return {}

        features = {
            'word_count': n_words,
            'char_count': len(text),
            'avg_word_length': np.mean([len(w) for w in words]),

            # Pronoun usage (personality markers)
            'first_person_singular': sum(1 for w in words if w in ['i', "i'm", "i've", 'me', 'my', 'mine']) / n_words,
            'first_person_plural': sum(1 for w in words if w in ['we', "we're", "we've", 'us', 'our']) / n_words,
            'second_person': sum(1 for w in words if w in ['you', "you're", 'your']) / n_words,
            'third_person': sum(1 for w in words if w in ['he', 'she', 'they', 'his', 'her', 'their']) / n_words,

            # Punctuation (emotional expressiveness)
            'exclamation_count': text.count('!') / n_words,
            'question_count': text.count('?') / n_words,
            'comma_count': text.count(',') / n_words,
            'period_count': text.count('.') / n_words,

            # Cognitive complexity
            'unique_word_ratio': len(set(words)) / n_words,
        }

        return features

    def process_dataframe(self, df: pd.DataFrame, text_col: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Process entire dataframe and extract features"""
        logger.info("Processing text data...")

        # Clean text
        df['cleaned_text'] = df[text_col].apply(self.clean_text)

        # Extract linguistic features
        features_list = []
        for text in tqdm(df['cleaned_text'], desc="Extracting features"):
            features_list.append(self.extract_linguistic_features(text))

        features_df = pd.DataFrame(features_list)

        logger.info(f"Extracted {len(features_df.columns)} linguistic features")
        return df, features_df

# ============================================================================
# SECTION 4: EMBEDDING GENERATION
# ============================================================================

class EmbeddingGenerator:
    """
    Generates embeddings using pre-trained transformer models.

    Why Sentence-BERT?
    - Pre-trained on semantic similarity tasks
    - Efficient: Single forward pass
    - 384 or 768-dimensional semantic vectors
    - State-of-the-art for sentence-level tasks
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize embedding model.

        Model choices:
        - 'all-MiniLM-L6-v2': Fast, 384-dim (recommended for CPU)
        - 'all-mpnet-base-v2': Best quality, 768-dim (recommended for GPU)
        """
        logger.info(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name

        if torch.cuda.is_available():
            self.model = self.model.to(device)

        logger.info(f"‚úÖ Model loaded on {device}")

    def encode_texts(self, texts: List[str], batch_size: int = 32, show_progress: bool = True) -> np.ndarray:
        """Generate embeddings for a list of texts"""
        logger.info(f"Encoding {len(texts)} texts...")

        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
            normalize_embeddings=True  # L2 normalization
        )

        logger.info(f"‚úÖ Generated embeddings: {embeddings.shape}")
        return embeddings

    def get_embedding_dim(self) -> int:
        """Return the dimensionality of embeddings"""
        return self.model.get_sentence_embedding_dimension()

# ============================================================================
# SECTION 5: MODEL TRAINING PIPELINE
# ============================================================================

@dataclass
class ModelConfig:
    """Configuration for model training"""
    test_size: float = 0.2
    random_state: int = RANDOM_SEED
    cv_folds: int = 5

class PersonalityPredictor:
    """
    Unified interface for training and evaluating personality prediction models.

    Demonstrates:
    - Separation of concerns
    - Reproducibility
    - Extensibility
    """

    def __init__(self, config: ModelConfig = None):
        self.config = config or ModelConfig()
        self.models = {}
        self.scalers = {}
        self.results = {}

    def prepare_data(self, embeddings: np.ndarray, linguistic_features: pd.DataFrame,
                    targets: pd.DataFrame, feature_type: str = 'embeddings'):
        """
        Prepare train/test splits.

        Args:
            feature_type: 'embeddings', 'linguistic', or 'combined'
        """
        # Choose features
        if feature_type == 'embeddings':
            X = embeddings
        elif feature_type == 'linguistic':
            X = linguistic_features.values
        elif feature_type == 'combined':
            X = np.concatenate([embeddings, linguistic_features.values], axis=1)
        else:
            raise ValueError(f"Unknown feature_type: {feature_type}")

        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, targets,
            test_size=self.config.test_size,
            random_state=self.config.random_state
        )

        self.trait_names = targets.columns.tolist()
        self.feature_type = feature_type

        logger.info(f"Data prepared: {self.X_train.shape[0]} train, {self.X_test.shape[0]} test samples")

    def train_model(self, model_name: str, model, trait: str):
        """Train a model for a specific trait"""
        trait_idx = self.trait_names.index(trait)

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(self.X_train)
        X_test_scaled = scaler.transform(self.X_test)

        # Train
        model.fit(X_train_scaled, self.y_train.iloc[:, trait_idx])

        # Predict
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)

        # Evaluate
        results = {
            'train_r2': r2_score(self.y_train.iloc[:, trait_idx], y_pred_train),
            'test_r2': r2_score(self.y_test.iloc[:, trait_idx], y_pred_test),
            'train_rmse': np.sqrt(mean_squared_error(self.y_train.iloc[:, trait_idx], y_pred_train)),
            'test_rmse': np.sqrt(mean_squared_error(self.y_test.iloc[:, trait_idx], y_pred_test)),
            'test_mae': mean_absolute_error(self.y_test.iloc[:, trait_idx], y_pred_test),
            'predictions': y_pred_test
        }

        # Store
        key = f"{model_name}_{trait}"
        self.models[key] = model
        self.scalers[key] = scaler
        self.results[key] = results

        return results

    def train_all_traits(self, model_name: str, model_class, **model_params):
        """Train model for all personality traits"""
        logger.info(f"Training {model_name} for all traits...")

        all_results = {}
        for trait in tqdm(self.trait_names, desc=model_name):
            model = model_class(**model_params)
            results = self.train_model(model_name, model, trait)
            all_results[trait] = results

        return all_results

    def get_results_summary(self) -> pd.DataFrame:
        """Create summary table of all results"""
        summary = []
        for key, results in self.results.items():
            model_name, trait = key.rsplit('_', 1)
            summary.append({
                'Model': model_name,
                'Trait': trait.capitalize(),
                'Train R¬≤': results['train_r2'],
                'Test R¬≤': results['test_r2'],
                'Test RMSE': results['test_rmse'],
                'Test MAE': results['test_mae']
            })

        return pd.DataFrame(summary)

# ============================================================================
# SECTION 6: VISUALIZATION FUNCTIONS
# ============================================================================

def visualize_trait_distributions(df, trait_cols):
    """Create comprehensive visualization of trait distributions"""
    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=trait_cols + ['Trait Correlations'],
        specs=[[{}, {}, {}], [{}, {}, {'type': 'heatmap'}]]
    )

    colors = px.colors.qualitative.Set2
    for idx, trait in enumerate(trait_cols):
        row = idx // 3 + 1
        col = idx % 3 + 1

        fig.add_trace(
            go.Histogram(
                x=df[trait],
                name=trait.capitalize(),
                marker_color=colors[idx],
                showlegend=False,
                nbinsx=30
            ),
            row=row, col=col
        )

    # Correlation heatmap
    corr_matrix = df[trait_cols].corr()
    fig.add_trace(
        go.Heatmap(
            z=corr_matrix.values,
            x=[t.capitalize() for t in trait_cols],
            y=[t.capitalize() for t in trait_cols],
            colorscale='RdBu',
            zmid=0,
            text=corr_matrix.values.round(2),
            texttemplate='%{text}',
            showscale=True
        ),
        row=2, col=3
    )

    fig.update_layout(
        height=800,
        title_text="<b>Big Five Personality Trait Distributions & Correlations</b>",
        showlegend=False
    )

    return fig

def visualize_embedding_space_2d(embeddings_2d, df, trait_cols):
    """Create 2D visualization of embedding space colored by traits"""
    viz_df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'text_preview': df['cleaned_text'].str[:100] + '...',
        'word_count': df['word_count']
    })

    for trait in trait_cols:
        viz_df[trait] = df[trait].values

    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=[t.capitalize() for t in trait_cols] + ['Word Count'],
        specs=[[{'type': 'scatter'}]*3, [{'type': 'scatter'}]*3]
    )

    for idx, trait in enumerate(trait_cols + ['word_count']):
        row = idx // 3 + 1
        col = idx % 3 + 1

        fig.add_trace(
            go.Scatter(
                x=viz_df['x'],
                y=viz_df['y'],
                mode='markers',
                marker=dict(
                    size=4,
                    color=viz_df[trait],
                    colorscale='Viridis',
                    showscale=(col == 3),
                    opacity=0.6,
                    colorbar=dict(title=trait.capitalize())
                ),
                text=viz_df['text_preview'],
                hovertemplate=f'<b>{trait.capitalize()}: %{{marker.color:.2f}}</b><br>' +
                             'Text: %{text}<br><extra></extra>',
                showlegend=False
            ),
            row=row, col=col
        )

    fig.update_layout(
        height=900,
        title_text="<b>Semantic Embedding Space Colored by Personality Traits</b>",
    )

    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    return fig

def visualize_model_comparison(results_summary):
    """Visualize model performance comparison"""
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=['R¬≤ Score by Trait', 'RMSE by Trait']
    )

    for model in results_summary['Model'].unique():
        model_data = results_summary[results_summary['Model'] == model]

        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data['Trait'],
                y=model_data['Test R¬≤'],
                text=model_data['Test R¬≤'].round(3),
                textposition='auto',
            ),
            row=1, col=1
        )

        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data['Trait'],
                y=model_data['Test RMSE'],
                text=model_data['Test RMSE'].round(3),
                textposition='auto',
                showlegend=False
            ),
            row=1, col=2
        )

    fig.update_layout(
        height=500,
        title_text="<b>Model Performance Comparison</b>",
        barmode='group'
    )

    return fig

# ============================================================================
# SECTION 7: MAIN EXECUTION PIPELINE
# ============================================================================

def main():
    """Main execution pipeline"""

    print("="*80)
    print("NEURAL PSYCHOMETRIC EMBEDDINGS & TRAIT PREDICTION")
    print("="*80)

    # Step 1: Download and load data
    print("\nüì• STEP 1: Loading Data...")
    if not download_dataset():
        return

    df_clean, text_col, trait_cols = load_and_prepare_data()

    # Step 2: Preprocess text
    print("\nüîÑ STEP 2: Preprocessing Text...")
    preprocessor = TextPreprocessor()
    df_clean, linguistic_features = preprocessor.process_dataframe(df_clean, text_col)

    # Step 3: Generate embeddings
    print("\nüß† STEP 3: Generating Embeddings...")
    embedding_generator = EmbeddingGenerator('all-MiniLM-L6-v2')
    text_embeddings = embedding_generator.encode_texts(
        df_clean['cleaned_text'].tolist(),
        batch_size=64
    )

    # Step 4: Dimensionality reduction for visualization
    print("\nüìä STEP 4: Dimensionality Reduction...")
    reducer_2d = UMAP(n_components=2, random_state=RANDOM_SEED)
    embeddings_2d = reducer_2d.fit_transform(text_embeddings)

    # Step 5: Train models
    print("\nüèãÔ∏è STEP 5: Training Models...")
    predictor = PersonalityPredictor()
    predictor.prepare_data(
        embeddings=text_embeddings,
        linguistic_features=linguistic_features,
        targets=df_clean[trait_cols],
        feature_type='embeddings'
    )

    # Train multiple models
    ridge_results = predictor.train_all_traits('Ridge', Ridge, alpha=1.0, random_state=RANDOM_SEED)
    rf_results = predictor.train_all_traits('RandomForest', RandomForestRegressor,
                                            n_estimators=100, max_depth=10,
                                            random_state=RANDOM_SEED, n_jobs=-1)
    xgb_results = predictor.train_all_traits('XGBoost', xgb.XGBRegressor,
                                             n_estimators=100, max_depth=6,
                                             learning_rate=0.1, random_state=RANDOM_SEED)

    # Step 6: Evaluate and compare
    print("\nüìà STEP 6: Model Evaluation...")
    results_summary = predictor.get_results_summary()
    print("\n" + results_summary.to_string(index=False))

    avg_performance = results_summary.groupby('Model')['Test R¬≤'].mean().sort_values(ascending=False)
    print("\nüèÜ Average R¬≤ by Model:")
    print(avg_performance)

    # Step 7: Save results
    print("\nüíæ STEP 7: Saving Results...")

    # Save models
    best_model_name = avg_performance.index[0]
    save_dir = 'models'
    import os
    os.makedirs(save_dir, exist_ok=True)

    for trait in trait_cols:
        key = f"{best_model_name}_{trait}"
        with open(f"{save_dir}/{key}.pkl", 'wb') as f:
            pickle.dump({
                'model': predictor.models[key],
                'scaler': predictor.scalers[key],
                'results': predictor.results[key]
            }, f)

    # Save embeddings
    np.save(f"{save_dir}/text_embeddings.npy", text_embeddings)

    # Save metadata
    metadata = {
        'model_name': best_model_name,
        'embedding_model': embedding_generator.model_name,
        'traits': trait_cols,
        'n_samples': len(df_clean),
        'avg_r2': float(avg_performance.iloc[0]),
        'timestamp': datetime.now().isoformat()
    }

    with open(f"{save_dir}/metadata.json", 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"‚úÖ Models saved to {save_dir}/")

    print("\n" + "="*80)
    print("‚úÖ PIPELINE COMPLETE!")
    print("="*80)

    return {
        'df': df_clean,
        'embeddings': text_embeddings,
        'embeddings_2d': embeddings_2d,
        'predictor': predictor,
        'results': results_summary,
        'trait_cols': trait_cols
    }

if __name__ == "__main__":
    results = main()

üñ•Ô∏è  Using device: cpu
NEURAL PSYCHOMETRIC EMBEDDINGS & TRAIT PREDICTION

üì• STEP 1: Loading Data...
‚úÖ Dataset downloaded!


KeyError: ['neuroticism']