# Personality Prediction: Full Pipeline Serialization

This notebook builds and serializes the full pipeline (preprocessing + trained model) for deployment or inference, using the same logic as the main model notebook.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

## Load Data

We use the same data and features as in the main notebook.

In [2]:
# Read the training set from the working directory.
train_df = pd.read_csv('train.csv')

# Numeric inputs: every int64/float64 column except the row identifier and
# the target. This mirrors the feature extraction of the main notebook.
numerical_features = [
    column
    for column in train_df.select_dtypes(include=['int64', 'float64']).columns
    if column not in ('id', 'Personality')
]

# Categorical inputs: every object-dtype column except the target.
# Note that 'id' is only filtered from the numeric list, so an object-typed
# 'id' column would still be treated as a categorical feature here.
categorical_features = [
    column
    for column in train_df.select_dtypes(include=['object']).columns
    if column != 'Personality'
]

# Feature matrix (numeric columns first, then categorical) and target vector.
X = train_df[numerical_features + categorical_features]
y = train_df['Personality']

## Build Preprocessing and Model Pipeline

In [3]:
# Classifier that sits at the end of the pipeline.
# If you have a tuned model, load it here. Otherwise, fit a new one.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Categorical branch: missing values are replaced by the literal string
# 'missing', then the column is one-hot encoded into a dense array.
# NOTE(review): with drop='first' together with handle_unknown='ignore',
# scikit-learn encodes a category unseen during fit as all zeros, which is
# indistinguishable from the dropped first category -- confirm that this is
# acceptable for inference on new data.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        (
            'onehot',
            OneHotEncoder(
                drop='first',
                sparse_output=False,
                handle_unknown='ignore',
            ),
        ),
    ]
)

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group to its branch; columns listed in neither group are
# dropped by ColumnTransformer's default remainder setting.
column_branches = [
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Preprocessing and model bundled into one estimator, so that a single
# object can be fitted, serialized and later used for prediction.
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rf_model),
    ]
)

## Train and Serialize the Pipeline

In [4]:
# Destination of the serialized pipeline (relative to the working directory).
pipeline_filename = 'full_personality_prediction_pipeline.pkl'

# Fit preprocessing and classifier together on the full training set.
full_pipeline.fit(X, y)

# Persist the fitted pipeline so it can be reloaded for inference.
joblib.dump(full_pipeline, pipeline_filename)

print(f"Pipeline has been saved to {pipeline_filename}")

Pipeline has been saved to full_personality_prediction_pipeline.pkl


## Load and Use the Saved Pipeline

In [5]:
# Restore the fitted pipeline from disk.
# joblib.load unpickles the file, so only load artifacts you trust.
loaded_pipeline = joblib.load('full_personality_prediction_pipeline.pkl')

# Example: Predict on new data (replace with your actual data structure).
# Here a one-row frame is assembled from the first training row, keeping the
# same column names and order as the training features.
first_row_values = {name: [X[name].iloc[0]] for name in X.columns}
example_data = pd.DataFrame(first_row_values)

# The pipeline applies its fitted preprocessing before classifying.
predictions = loaded_pipeline.predict(example_data)
print("Example predictions:", predictions)

Example predictions: ['Extrovert']
