# Predictive Modeling on Survey Data

This notebook builds and evaluates classification models to predict overall satisfaction ratings from survey responses, using MCQ answers and text embeddings as features.

## 1. Import Required Libraries
We will use pandas, numpy, scikit-learn, matplotlib, seaborn, and optionally XGBoost/LightGBM for modeling and evaluation.

In [10]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import json
import warnings
warnings.filterwarnings('ignore')

# Optional imports
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None
try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBMClassifier = None

## 2. Load Survey Responses from JSON
We will load the survey responses from the JSON file (`survey-results.json`).

In [11]:
# Load the survey data from JSON file
with open('survey-results-f5ae24e1-9985-450b-b36c-878ffa7f471d.json', 'r', encoding='utf-8') as f:
    survey_data = json.load(f)

# Preview the structure of the JSON data
print('Type:', type(survey_data))
if isinstance(survey_data, dict):
    for k, v in survey_data.items():
        print(f'{k}:', str(v)[:300], '\n')
        break
elif isinstance(survey_data, list):
    print('Sample record:', survey_data[0])

Type: <class 'dict'>
survey: {'survey_id': 'f5ae24e1-9985-450b-b36c-878ffa7f471d', 'topic': 'Student AI/ML session review', 'audience': 'College Student', 'created_at': '2025-08-30T16:56:55.019098+00:00', 'questions_count': 5, 'responses_count': 5} 



## 3. Build Dataset: Target and Features
We will extract the target variable (overall rating) and use other MCQ answers and text embeddings as features.

In [12]:
# Extract records and build DataFrame
def extract_records(survey_data):
    if isinstance(survey_data, dict):
        for key in ['responses', 'data', 'results', 'answers']:
            if key in survey_data:
                records = survey_data[key]
                break
        else:
            records = list(survey_data.values())
    elif isinstance(survey_data, list):
        records = survey_data
    else:
        records = []
    return pd.DataFrame(records)

df = extract_records(survey_data)
print('Columns:', df.columns.tolist())
display(df.head())

# Identify target and feature columns
target_col = None
for col in df.columns:
    if 'overall' in col.lower() and 'rating' in col.lower():
        target_col = col
        break
if not target_col:
    print('Please update the code with the correct target column name.')

# Example: text_col = 'open_feedback' (update as needed)
text_col = None
for col in df.columns:
    if 'text' in col.lower() or 'feedback' in col.lower() or 'comment' in col.lower():
        text_col = col
        break
print('Target column:', target_col)
print('Text column:', text_col)

Columns: ['response_id', 'survey_id', 'user_id', 'user_name', 'user_email', 'responses', 'submitted_at', 'completion_time', 'user_ip']


Unnamed: 0,response_id,survey_id,user_id,user_name,user_email,responses,submitted_at,completion_time,user_ip
0,4a9fb051-773a-42f5-aba0-8f9aed440cef,f5ae24e1-9985-450b-b36c-878ffa7f471d,4a9fb051-773a-42f5-aba0-8f9aed440cef,Vinod,null@gmail.com,{'252bb7b5-f70c-4a85-a2ab-2ae6b35bb524': 'Exce...,2025-09-02T16:22:28.297071+00:00,315,
1,51d4b21f-cb53-4592-8f2b-4228d6d2d668,f5ae24e1-9985-450b-b36c-878ffa7f471d,51d4b21f-cb53-4592-8f2b-4228d6d2d668,Vicky,,{'252bb7b5-f70c-4a85-a2ab-2ae6b35bb524': 'Exce...,2025-09-01T09:06:02.399015+00:00,265,unknown
2,90a0bddd-820d-47f6-9239-6333808ab425,f5ae24e1-9985-450b-b36c-878ffa7f471d,90a0bddd-820d-47f6-9239-6333808ab425,Nutan,nutan@gmail.com,{'252bb7b5-f70c-4a85-a2ab-2ae6b35bb524': 'Good...,2025-08-30T17:14:57.538162+00:00,262,
3,657bfbf0-f8e4-43a6-aab7-97ff455c82ba,f5ae24e1-9985-450b-b36c-878ffa7f471d,657bfbf0-f8e4-43a6-aab7-97ff455c82ba,Vicky,vi@gmail.com,{'252bb7b5-f70c-4a85-a2ab-2ae6b35bb524': 'Fair...,2025-08-30T17:11:53.569637+00:00,227,
4,d05de1e4-86cb-4aca-a685-027f983ae041,f5ae24e1-9985-450b-b36c-878ffa7f471d,d05de1e4-86cb-4aca-a685-027f983ae041,Aman,aman@gmail.com,{'252bb7b5-f70c-4a85-a2ab-2ae6b35bb524': 'Good...,2025-08-30T16:58:42.383539+00:00,158,


Please update the code with the correct target column name.
Target column: None
Text column: None


## 4. Encode Categorical Features
We will encode categorical MCQ features using LabelEncoder or OneHotEncoder.

In [13]:
# Encode categorical features
feature_cols = [col for col in df.columns if col != target_col and col != text_col and df[col].dtype == 'object']

# Label encode target
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str)) if target_col else None

# One-hot encode features
X_cat = pd.get_dummies(df[feature_cols], dummy_na=True) if feature_cols else pd.DataFrame()
print('Categorical feature shape:', X_cat.shape)
display(X_cat.head())

TypeError: unhashable type: 'dict'

## 5. Convert Text Answers into Features
We will use TF-IDF vectorization or sentence embeddings for text answers.

In [None]:
# Convert text answers into features using TF-IDF
if text_col:
    tfidf = TfidfVectorizer(max_features=100)
    X_text = tfidf.fit_transform(df[text_col].fillna('')).toarray()
    print('TF-IDF feature shape:', X_text.shape)
else:
    X_text = np.empty((len(df), 0))

# Combine all features
from numpy import hstack
X = hstack([X_cat.values, X_text]) if X_cat.shape[1] > 0 or X_text.shape[1] > 0 else None
print('Final feature matrix shape:', X.shape if X is not None else None)

## 6. Train Classification Models
We will train Logistic Regression, Random Forest, and Gradient Boosting models to predict satisfaction.

In [None]:
# Train/test split
if X is not None and y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
else:
    print('Missing features or target.')

results = {}

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
results['Logistic Regression'] = y_pred_lr

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results['Random Forest'] = y_pred_rf

# Gradient Boosting (XGBoost or LightGBM if available)
if XGBClassifier:
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)
    results['XGBoost'] = y_pred_xgb
elif LGBMClassifier:
    lgbm = LGBMClassifier()
    lgbm.fit(X_train, y_train)
    y_pred_lgbm = lgbm.predict(X_test)
    results['LightGBM'] = y_pred_lgbm
else:
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    y_pred_gbc = gbc.predict(X_test)
    results['Gradient Boosting'] = y_pred_gbc

## 7. Model Evaluation
We will evaluate each model using accuracy, F1-score, and confusion matrix.

In [None]:
# Evaluate models
for model_name, y_pred in results.items():
    print(f'\nModel: {model_name}')
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1-score:', f1_score(y_test, y_pred, average='weighted'))
    print('Classification Report:')
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(f'Confusion Matrix: {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

## 8. Feature Importance
We will show feature importance for tree-based models to understand which factors best predict satisfaction.

In [None]:
# Show feature importance for tree-based models
feature_names = list(X_cat.columns) + (tfidf.get_feature_names_out().tolist() if text_col else [])

for model, name in zip([rf, gbc if 'gbc' in locals() else None], ['Random Forest', 'Gradient Boosting']):
    if model is not None:
        importances = model.feature_importances_
        indices = np.argsort(importances)[-10:][::-1]
        plt.figure(figsize=(8, 4))
        plt.title(f'Top 10 Feature Importances: {name}')
        plt.barh(range(len(indices)), importances[indices], align='center')
        plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
        plt.xlabel('Importance')
        plt.gca().invert_yaxis()
        plt.show()

## 9. Interpretation and Insights

- The most important features (from tree-based models) indicate which MCQ answers and text-derived features best predict overall satisfaction.
- Review the top features and their values to understand what drives high or low ratings.
- Use these insights to improve future surveys or interventions.