### import data

In [49]:
import pandas as pd

file_path = "/Users/bryant_lue/Downloads/113-2-data-mining-homework-2"

df_train = pd.read_json(f"{file_path}/train_data.json")
df_test = pd.read_json(f"{file_path}/test_data.json")
df_train_label = pd.read_csv(f"{file_path}/train_label.csv")

In [50]:
df_train = df_train.merge(df_train_label, on="Pid")
df_train["Postdate"] = pd.to_datetime(df_train["Postdate"], unit='s')
df_test["Postdate"] = pd.to_datetime(df_test["Postdate"], unit='s')

In [51]:
df_train.head(5)

Unnamed: 0,Pid,Uid,Title,Alltags,Category,Concept,Subcategory,Postdate,img_filepath,label
0,149005,22687@N84,having a drink,life county wild bird water animal closeup fau...,Food,thirsty,Drinks,2015-03-13 03:21:30,train/22687@N84/149005.jpg,10.07
1,149948,17614@N19,"Foto Agne Sterberg, Destination Hga Kusten, AG...",hav mitt hga kusten blsippor nordingr klippor ...,Travel&Active&Sports,mitt,Baseball,2015-03-17 02:05:20,train/17614@N19/149948.jpg,6.27
2,151388,17614@N19,"Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...",is sweden sverige hav soluppgng mitt vr hga ku...,Travel&Active&Sports,mitt,Baseball,2015-03-24 21:29:17,train/17614@N19/151388.jpg,5.46
3,151389,17614@N19,"Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...",is sweden sverige hav soluppgng mitt vr hga ku...,Travel&Active&Sports,mitt,Baseball,2015-03-24 10:18:36,train/17614@N19/151389.jpg,5.39
4,151390,17614@N19,"Foto Agne Sterberg, AGMA Forntid & ventyr AB, ...",is sweden sverige hav soluppgng mitt vr hga ku...,Travel&Active&Sports,mitt,Baseball,2015-03-24 21:55:46,train/17614@N19/151390.jpg,5.36


### EDA
- train Category.unique == test Category.unique?
- train Concept.unique == test Concept.unique?
- train Category.unique == test Category.unique?

- count
- 直接找差集

In [75]:
df_train[~df_train['Category'].isin(df_test['Category'])]['Category']

Series([], Name: Category, dtype: object)

In [76]:
df_test[~df_test['Category'].isin(df_train['Category'])]['Category']

Series([], Name: Category, dtype: object)

In [77]:
df_train[~df_train['Concept'].isin(df_test['Concept'])]['Concept']

26          skywatcher
28          skywatcher
29          skywatcher
30              repeat
31            tumbling
             ...      
14988          blessed
14990    unforgettable
14993             fans
14997           repeat
14999    modifications
Name: Concept, Length: 7674, dtype: object

In [78]:
df_test[~df_test['Concept'].isin(df_train['Concept'])]['Concept']

146      cocktails
147      cocktails
302         jacket
382      motorbike
511      motorbike
           ...    
4755    gymnastics
4872      swimming
4894     earlybird
4897     earlybird
4925          shoe
Name: Concept, Length: 694, dtype: object

### Extract image features

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm

transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

# set path
os.chdir(file_path)

# Load ResNet model
resnet = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
resnet.fc = torch.nn.Identity()
resnet.eval()

if torch.cuda.is_available() == True:
    device = torch.device("cuda")
    print("Using CUDA GPU")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using M1 Pro GPU via MPS")
else:
    device = torch.device("cpu")
    print("MPS not available; using CPU")
resnet.to(device)



def extract_features(img_paths):
    features = []
    for path in tqdm(img_paths):
        try:
            img = Image.open(path).convert('RGB')
            img = transform(img).unsqueeze(0)
            img = img.to(device)
            with torch.no_grad():
                feature = resnet(img).squeeze().cpu().numpy()
            features.append(feature)
        except Exception as e:
            print(f"Error with image {path}: {e}")
            features.append(np.zeros(2048))
    return np.array(features)

image_features_train = extract_features(df_train['img_filepath'])
print("Image features extracted for training data.")
image_features_test = extract_features(df_test['img_filepath'])
print("Image features extracted for test data.")


Using M1 Pro GPU via MPS


100%|██████████| 15000/15000 [08:43<00:00, 28.67it/s]


Image features extracted for training data.


100%|██████████| 5000/5000 [02:56<00:00, 28.30it/s]

Image features extracted for test data.





### Feature Engineering: Concating image featruesm TFIDF and onehotencoding

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np


# ----------------------------------------------------------------
# Convert image features to DataFrame
df_image_features_train = pd.DataFrame(image_features_train, index=df_train.index)
df_image_features_train.columns = [f"img_feat_{i}" for i in range(image_features_train.shape[1])]
print(image_features_train.shape)

df_image_features_test = pd.DataFrame(image_features_test, index=df_test.index)
df_image_features_test.columns = [f"img_feat_{i}" for i in range(image_features_test.shape[1])]
print(image_features_test.shape)

# Define Concatenate DataFrames
df_train_concat = pd.concat([df_train, df_image_features_train], axis=1)
df_test_concat = pd.concat([df_test, df_image_features_test], axis=1)

# ----------------------------------------------------------------
# TF-IDF Vectorization
# Combine relevant text columns
df_train['text'] = df_train['Title'].astype(str) + " " + df_train['Alltags'].astype(str)
df_test['text'] = df_test['Title'].astype(str) + " " + df_test['Alltags'].astype(str)

# TF-IDF Vectorizer for the combined text
tfidf = TfidfVectorizer(max_features=100)  # You can adjust max_features

# Fit and transform
tfidf_matrix_train = tfidf.fit_transform(df_train['text']).toarray()
tfidf_matrix_test = tfidf.fit_transform(df_test['text']).toarray()

df_ifidf_train = pd.DataFrame(tfidf_matrix_train, index=df_train.index)
df_ifidf_test = pd.DataFrame(tfidf_matrix_test, index=df_test.index)

# Optionally, rename columns
df_ifidf_train.columns = [f"tfidf_feat_{i}" for i in range(tfidf_matrix_train.shape[1])]
df_ifidf_test.columns = [f"tfidf_feat_{i}" for i in range(tfidf_matrix_test.shape[1])]
df_train_concat = pd.concat([df_train_concat, df_ifidf_train], axis=1)
df_test_concat = pd.concat([df_test_concat, df_ifidf_test], axis=1)


# You can also vectorize 'Category' (if categorical, use OneHotEncoder instead)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

category_encoded_train = ohe.fit_transform(df_train[['Category']])
df_category_train = pd.DataFrame(category_encoded_train, index=df_train.index)
df_category_train.columns = ohe.categories_[0]

category_encoded_test = ohe.fit_transform(df_test[['Category']])
df_category_test = pd.DataFrame(category_encoded_test, index=df_test.index)
df_category_test.columns = ohe.categories_[0]

# Concept
concept_encoded_train = ohe.fit_transform(df_train[['Concept']])
df_concept_train = pd.DataFrame(concept_encoded_train, index=df_train.index)
df_concept_train.columns = ohe.get_feature_names_out(['Concept'])

concept_encoded_test = ohe.transform(df_test[['Concept']])
df_concept_test = pd.DataFrame(concept_encoded_test, index=df_test.index)
df_concept_test.columns = ohe.get_feature_names_out(['Concept'])

# Align test columns to match train columns, fill missing ones with 0
df_concept_test = df_concept_test.reindex(columns=df_concept_train.columns, fill_value=0)

# Subcategory
subcat_encoded_train = ohe.fit_transform(df_train[['Subcategory']])
df_subcat_train = pd.DataFrame(subcat_encoded_train, index=df_train.index)
df_subcat_train.columns = ohe.get_feature_names_out(['Subcategory'])

subcat_encoded_test = ohe.transform(df_test[['Subcategory']])
df_subcat_test = pd.DataFrame(subcat_encoded_test, index=df_test.index)
df_subcat_test.columns = ohe.get_feature_names_out(['Subcategory'])

df_subcat_test = df_subcat_test.reindex(columns=df_subcat_train.columns, fill_value=0)


# Combine all features (text, categorical, etc.)
df_train_concat = pd.concat([df_train_concat, df_category_train, df_concept_train, df_subcat_train], axis=1)
df_test_concat = pd.concat([df_test_concat, df_category_test, df_concept_test, df_subcat_test], axis=1)

print(df_train_concat.shape)
print(df_test_concat.shape)

tfidf_category_features_train = np.hstack([tfidf_matrix_train, category_encoded_train, concept_encoded_train, subcat_encoded_train])
tfidf_category_features_test = np.hstack([tfidf_matrix_test, category_encoded_test, concept_encoded_test, subcat_encoded_test])

# Additional simple features
df_train['Posthour'] = pd.to_datetime(df_train['Postdate']).dt.hour
df_test['Posthour'] = pd.to_datetime(df_test['Postdate']).dt.hour
text_features_train = pd.get_dummies(df_train[['Posthour']], drop_first=True).values
text_features_test = pd.get_dummies(df_test[['Posthour']], drop_first=True).values

# Combine features
X = np.concatenate((image_features_train, text_features_train, tfidf_category_features_train), axis=1)
y = df_train['label']



df_hour_train = pd.DataFrame(text_features_train, index=df_train.index)
df_hour_test = pd.DataFrame(text_features_test, index=df_test.index)


df_train_concat = pd.concat([df_train_concat, df_hour_train], axis=1)
df_test_concat = pd.concat([df_test_concat, df_hour_test], axis=1)
print(df_train_concat.shape)
print(df_test_concat.shape)

df_train_concat.to_csv("train_concat.csv", index=False) 
df_test_concat.to_csv("test_concat.csv", index=False)

(15000, 2048)
(5000, 2048)
(15000, 2878)
(5000, 2878)
(15000, 2879)
(5000, 2879)


### Random Forest

In [110]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R²:', r2_score(y_test, y_pred))

MAE: 1.6532830666666665
R²: 0.2895075785925194


In [None]:
# Predict on test set
X_test_final = np.concatenate((image_features_test, text_features_test, tfidf_category_features_test), axis=1)
y_test_pred = model.predict(X_test_final)
df_test['Predicted'] = y_test_pred
df_test[['Pid', 'Predicted']].to_csv("submission_random.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv


### XGBoost

In [114]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

params = {
    'objective': 'reg:absoluteerror',
    'device': 'cuda',
    'max_depth': 10,
    'learning_rate': 0.01,
    'colsample_bytree': 0.8,
    'random_state': 42,
}

evals = [(dtrain, 'train'), (dtest, 'validation')]

model = xgb.train(
    params = params,
    dtrain = dtrain,
    num_boost_round = 10000,
    evals = evals,
    verbose_eval = 500,
    early_stopping_rounds = 100,
)

preds = model.predict(dtest)
rmse = root_mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)\

print(f'RMSE: {rmse:.3f}')
print(f'MAE: {mae:.3f}')

  self.starting_round = model.num_boosted_rounds()


[0]	train-mae:1.97111	validation-mae:2.01304
[500]	train-mae:1.30303	validation-mae:1.65284
[1000]	train-mae:0.92448	validation-mae:1.60689
[1372]	train-mae:0.74287	validation-mae:1.60555
RMSE: 2.124
MAE: 1.606


In [115]:
# Predict on test set
X_test_final = np.concatenate((image_features_test, text_features_test, tfidf_category_features_test), axis=1)
dtest = xgb.DMatrix(X_test_final)
y_test_pred = model.predict(dtest)
df_test['Predicted'] = y_test_pred
df_test[['Pid', 'Predicted']].to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
