<a href="https://colab.research.google.com/github/ChinieHan/face-rocognize/blob/main/taxonomy_classification_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch torchvision tqdm

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [2]:

!pip install transformers torch
!pip install transformers datasets sklearn datasets
!pip install datasets
!pip install fasttext transformers datasets sklearn tqdm
!pip install fasttext


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m770.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import json
from tqdm import tqdm
import fasttext
import fasttext.util

# Step 1: Data loading and cleaning
data = pd.read_csv('/content/taxonomy_engagement_data.csv')

# Function to handle metadata extraction
def extract_controller(metadata):
    if pd.isnull(metadata):
        return "UNKNOWN"
    try:
        metadata_dict = json.loads(metadata)
        return metadata_dict.get("controller", "UNKNOWN")
    except json.JSONDecodeError:
        return "UNKNOWN"

# Step 2: Processing the dataset
data['controller'] = data['engagement_metadata'].apply(extract_controller)

# Combine all text fields, including for images
data['combined_text'] = data['prompt'].fillna('') + ' ' + data['original_prompt'].fillna('') + ' ' + data['artist_style'].fillna('') + ' ' + data['text'].fillna('')

# Fill missing engagement data
data['msEngagement'] = data['msEngagement'].fillna(0)
data['is_like'] = data['is_like'].fillna(0)
data['is_dislike'] = data['is_dislike'].fillna(0)

# Step 3: Vectorization using FastText
# Load pre-trained FastText model
fasttext_model_path = '/content/drive/MyDrive/cc.en.300.bin'
ft = fasttext.load_model(fasttext_model_path)

def get_fasttext_vector(text):
    words = text.split()
    word_vectors = [ft.get_word_vector(word) for word in words]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(300)

# Show progress for vector generation
tqdm.pandas()
data['vector'] = data['combined_text'].progress_apply(lambda x: get_fasttext_vector(x))

# Step 4: Generate labels using KMeans clustering
num_text_clusters = 10
num_image_clusters = 10

# Split the data based on media_type
text_data = data[data['media_type'] == 'Text']
image_data = data[data['media_type'] == 'Image']

# Use KMeans for text classification (0-9)
from sklearn.cluster import MiniBatchKMeans
kmeans_text = MiniBatchKMeans(n_clusters=num_text_clusters, batch_size=1000, random_state=42)
text_data['label'] = kmeans_text.fit_predict(list(text_data['vector']))

# Use KMeans for image classification (10-19)
kmeans_image = MiniBatchKMeans(n_clusters=num_image_clusters, batch_size=1000, random_state=42)
image_data['label'] = kmeans_image.fit_predict(list(image_data['vector'])) + 10  # Shift labels to 10-19

# Combine the text and image data back together
data = pd.concat([text_data, image_data])

# Step 5: Split dataset (8:1:1)
train_df, temp_df = train_test_split(data[['combined_text', 'label', 'is_like', 'is_dislike', 'msEngagement']], test_size=0.2, random_state=42)
test_df, val_df = train_test_split(temp_df[['combined_text', 'label', 'is_like', 'is_dislike', 'msEngagement']], test_size=0.5, random_state=42)

# Step 6: Convert the DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

# Step 7: Tokenization using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples['combined_text'], padding='max_length', truncation=True, max_length=128)
    inputs['is_like'] = examples['is_like']
    inputs['is_dislike'] = examples['is_dislike']
    inputs['msEngagement'] = examples['msEngagement']
    return inputs

# Apply tokenization to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Step 8: Set dataset format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'is_like', 'is_dislike', 'msEngagement'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'is_like', 'is_dislike', 'msEngagement'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'is_like', 'is_dislike', 'msEngagement'])

# Step 9: Define model and training parameters
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20)  # 20 classes (0-19)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  #2
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Step 10: Train the model
trainer.train()

# Step 11: Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Step 12: Save the model
model.save_pretrained('./content_classification_model')
tokenizer.save_pretrained('./content_classification_model')

# Step 13: Predict on validation set
predictions = trainer.predict(val_dataset)
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Step 14: Save the predictions
val_df['predicted_label'] = predicted_labels
val_df.to_csv('classification_results_with_engagement.csv', index=False)

100%|██████████| 480571/480571 [05:59<00:00, 1337.77it/s]
  super()._check_params_vs_input(X, default_n_init=3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['label'] = kmeans_text.fit_predict(list(text_data['vector']))
  super()._check_params_vs_input(X, default_n_init=3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_data['label'] = kmeans_image.fit_predict(list(image_data['vector'])) + 10  # Shift labels to 10-19
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your s

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/384456 [00:00<?, ? examples/s]

Map:   0%|          | 0/48057 [00:00<?, ? examples/s]

Map:   0%|          | 0/48058 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0801,0.154572
2,0.0041,0.064662
3,0.0014,0.051338


Evaluation results: {'eval_loss': 0.051338255405426025, 'eval_runtime': 80.9382, 'eval_samples_per_second': 593.75, 'eval_steps_per_second': 37.115, 'epoch': 3.0}


In [6]:
from google.colab import files

files.download('/content/classification_results_with_engagement.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# !zip -r results.zip /content/results
!zip -r content_classification_model.zip /content/content_classification_model

# files.download('results.zip')
# files.download('content_classification_model.zip')

updating: content/content_classification_model/ (stored 0%)
updating: content/content_classification_model/model.safetensors (deflated 7%)
updating: content/content_classification_model/special_tokens_map.json (deflated 42%)
updating: content/content_classification_model/tokenizer_config.json (deflated 75%)
updating: content/content_classification_model/vocab.txt (deflated 53%)
updating: content/content_classification_model/config.json (deflated 62%)


In [1]:
# 安装 Git
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
