In [5]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss, precision_score, recall_score
import numpy as np

df = pd.read_csv('sample_portfolio_data.csv')

df['tags'] = df['tags'].apply(ast.literal_eval)

print(df.head())

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X = tfidf.fit_transform(df['description'])

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['tags'])

print(f"Number of samples: {X.shape[0]}, Number of features: {X.shape[1]}, Number of tags: {y.shape[1]}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=2000))
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

micro_f1 = f1_score(y_test, y_pred, average='micro')
macro_f1 = f1_score(y_test, y_pred, average='macro')
h_loss = hamming_loss(y_test, y_pred)
micro_precision = precision_score(y_test, y_pred, average='micro')
micro_recall = recall_score(y_test, y_pred, average='micro')

print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Hamming Loss: {h_loss:.4f}")
print(f"Micro Precision: {micro_precision:.4f}")
print(f"Micro Recall: {micro_recall:.4f}")

def precision_at_k(y_true, y_scores, k):
    precisions = []
    for true_labels, scores in zip(y_true, y_scores):
        top_k_indices = np.argsort(scores)[-k:]
        pred_labels = np.zeros_like(true_labels)
        pred_labels[top_k_indices] = 1
        if pred_labels.sum() > 0:
            precisions.append((true_labels * pred_labels).sum() / pred_labels.sum())
        else:
            precisions.append(0)
    return np.mean(precisions)

def recall_at_k(y_true, y_scores, k):
    recalls = []
    for true_labels, scores in zip(y_true, y_scores):
        if true_labels.sum() == 0:
            continue
        top_k_indices = np.argsort(scores)[-k:]
        pred_labels = np.zeros_like(true_labels)
        pred_labels[top_k_indices] = 1
        recalls.append((true_labels * pred_labels).sum() / true_labels.sum())
    return np.mean(recalls)

print(f"Precision@3: {precision_at_k(y_test, y_pred_proba, 3):.4f}")
print(f"Recall@3: {recall_at_k(y_test, y_pred_proba, 3):.4f}")

def predict_tags(text, top_k=3):
    text_vec = tfidf.transform([text])
    proba = clf.predict_proba(text_vec)[0]
    top_indices = np.argsort(proba)[-top_k:][::-1]
    return [(mlb.classes_[i], proba[i]) for i in top_indices]

test_samples = [
    "Wedding photographer specializing in candid moments and natural lighting",
    "Corporate headshot photographer with studio expertise",
    "Fine art landscape photographer creating dramatic black and white images"
]

for desc in test_samples:
    preds = predict_tags(desc, top_k=3)
    print(f"\nDescription: {desc}")
    print("Top predicted tags:")
    for tag, conf in preds:
        print(f"  - {tag}: {conf:.2%}")


                                         description  \
0  Creative corporate photographer with expertise...   
1  Professional photographer specializing in roma...   
2  Creative dramatic photographer with expertise ...   
3  Experienced in beauty photography and romantic...   
4  Experienced in outdoor photography and romanti...   

                                        tags  
0  [corporate, black-white, candid, product]  
1              [romantic, fashion, portrait]  
2                 [dramatic, model, outdoor]  
3               [beauty, romantic, artistic]  
4                        [outdoor, romantic]  
Number of samples: 500, Number of features: 434, Number of tags: 24
Micro F1 Score: 0.8296
Macro F1 Score: 0.8186
Hamming Loss: 0.0479
Micro Precision: 0.9180
Micro Recall: 0.7568
Precision@3: 0.9333
Recall@3: 0.8030

Description: Wedding photographer specializing in candid moments and natural lighting
Top predicted tags:
  - candid: 80.71%
  - wedding: 76.48%
  - natural-light:

In [6]:


def predict_tags(text, top_k=3):
    text_vec = tfidf.transform([text])          
    proba = clf.predict_proba(text_vec)[0]     
    top_indices = np.argsort(proba)[-top_k:][::-1] 
    return [(mlb.classes_[i], proba[i]) for i in top_indices]

test_descriptions = [
    "Wedding photographer specializing in candid moments and natural lighting",
    "Corporate headshot photographer with studio expertise",
    "Fine art landscape photographer creating dramatic black and white images",
    "Portrait photographer with strong editorial and fashion experience",
]
for desc in test_descriptions:
    predicted_tags = predict_tags(desc, top_k=3)
    print(f"\nDescription: {desc}")
    print("Predicted Tags:")
    for tag, confidence in predicted_tags:
        print(f"  - {tag}: {confidence:.2%}")



Description: Wedding photographer specializing in candid moments and natural lighting
Predicted Tags:
  - candid: 80.71%
  - wedding: 76.48%
  - natural-light: 44.66%

Description: Corporate headshot photographer with studio expertise
Predicted Tags:
  - corporate: 92.69%
  - studio: 92.23%
  - dramatic: 33.92%

Description: Fine art landscape photographer creating dramatic black and white images
Predicted Tags:
  - fine-art: 95.76%
  - black-white: 94.83%
  - dramatic: 74.00%

Description: Portrait photographer with strong editorial and fashion experience
Predicted Tags:
  - portrait: 78.99%
  - editorial: 78.37%
  - fashion: 73.10%
