In [1]:
'''
************************************************************************************************
************************************************************************************************
Script for making TF-IDF vectors 
** IMPORTANT: This is for JSON input, not CSV --> only for training (filtered_pandora)

*** You can experiment with max_features, e.g. 1000 or 5000 (I wouldn't go above 5000)

INPUT: change the path used for input_folder so it points at your filtered_pandora.json
OUTPUT ("tfidf_author_data.csv"): saved in the same folder as the input (input_folder)

************************************************************************************************
************************************************************************************************

'''

import os
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Paths ------------------------------------------------------------------
# The output CSV is written into the same folder the input JSON is read from.
input_filename = "filtered_pandora.json"
input_folder = os.path.dirname("/Users/arashalborz/Desktop/Data/filtered_pandora.json")  # edit this path for your machine
input_path = os.path.join(input_folder, input_filename)
save_path = os.path.join(input_folder, "tfidf_author_data.csv")

# --- Load the raw author records --------------------------------------------
with open(input_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# --- Collect one (id, document, trait scores) triple per author --------------
# Order of the Big Five traits; it fixes the column order of `labels` below.
trait_names = (
    "openness",
    "conscientiousness",
    "extraversion",
    "agreeableness",
    "neuroticism",
)

# Each author's comments are joined with single spaces into one document.
# An author without a "comments" key contributes an empty document.
records = [
    (
        entry["id"],
        " ".join(entry.get("comments", [])),
        [entry["labels"][trait] for trait in trait_names],
    )
    for entry in data["authors"]
]

author_ids = [author_id for author_id, _, _ in records]
texts = [document for _, document, _ in records]
labels = np.array([scores for _, _, scores in records])

print(f"Number of authors: {len(texts)}")
print(f"Labels shape: {labels.shape}")

# --- TF-IDF features -----------------------------------------------------------
# Vocabulary is capped at 5000 terms; the sparse result is densified so it can
# be placed directly into a DataFrame.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_sparse = vectorizer.fit_transform(texts)
X = tfidf_sparse.toarray()

print(f"TF-IDF features shape: {X.shape}")

# --- Assemble and save ---------------------------------------------------------
# Feature columns keep their default positional labels; the five trait columns
# and the author id are appended after them, in that order.
df = pd.DataFrame(X)
for trait_idx, trait in enumerate(trait_names):
    df[trait] = labels[:, trait_idx]
df["author_id"] = author_ids

df.to_csv(save_path, index=False)
print(f"Saved TF-IDF data to {save_path}")

Number of authors: 1568
Labels shape: (1568, 5)
TF-IDF features shape: (1568, 5000)
Saved TF-IDF data to /Users/arashalborz/Desktop/Data/tfidf_author_data.csv


In [5]:
# Sanity check: reload the CSV produced by the TF-IDF cell and inspect its columns.
# Read from `save_path` (defined with the other paths) instead of a hard-coded
# file name: the notebook writes "tfidf_author_data.csv", so a literal path to a
# differently named file either fails on a fresh Restart & Run All or silently
# inspects a stale file that this notebook did not create.
df = pd.read_csv(save_path)
df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '4996', '4997', '4998', '4999', 'openness', 'conscientiousness',
       'extraversion', 'agreeableness', 'neuroticism', 'author_id'],
      dtype='object', length=5006)