In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install permetrics
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import pandas as pd
import numpy as np
import shutil
from PIL import Image

In [4]:
filename = 'petfinder_multimodal_10000.zip'

In [5]:
shutil.copy('./drive/MyDrive/aj_blogs/multimodal/'+filename, 
            './'+filename)

'./petfinder_multimodal_10000.zip'

In [6]:
shutil.unpack_archive(filename)

In [7]:
df = pd.read_csv('./df_train_10000.csv')
# df['Description'] = df.Description.apply(lambda x:' '.join(str(x).split(' ')[0:80]))
df.fillna(0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type           5000 non-null   int64  
 1   Name           5000 non-null   object 
 2   Age            5000 non-null   int64  
 3   Breed1         5000 non-null   int64  
 4   Breed2         5000 non-null   int64  
 5   Gender         5000 non-null   int64  
 6   Color1         5000 non-null   int64  
 7   Color2         5000 non-null   int64  
 8   Color3         5000 non-null   int64  
 9   MaturitySize   5000 non-null   int64  
 10  FurLength      5000 non-null   int64  
 11  Vaccinated     5000 non-null   int64  
 12  Dewormed       5000 non-null   int64  
 13  Sterilized     5000 non-null   int64  
 14  Health         5000 non-null   int64  
 15  Quantity       5000 non-null   int64  
 16  Fee            5000 non-null   int64  
 17  State          5000 non-null   int64  
 18  RescuerI

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['RescuerID'] = le.fit_transform(df['RescuerID'])

In [9]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline, AutoImageProcessor, SwinModel, ResNetForImageClassification
from sentence_transformers import SentenceTransformer


def get_text_features_archive(texts):
  model = "bert-base-uncased" #"prajjwal1/bert-tiny" ##
  feature_extractor = pipeline("feature-extraction", framework="pt", model=model, max_length=500)

  feats = feature_extractor(texts, return_tensors = "pt", max_length=500)
  hf_feats = [feat.numpy().mean(axis=1) for feat in feats]
  X_text = np.concatenate(hf_feats,axis=0)
  X_text = pd.DataFrame(X_text, columns = ['text_feats_'+str(i) for i in range(X_text.shape[1])])
  
  return X_text


def get_text_features(texts):
  model = SentenceTransformer('all-MiniLM-L6-v2')
  X_text = model.encode(texts)
  X_text = pd.DataFrame(X_text, columns = ['text_feats_'+str(i) for i in range(X_text.shape[1])])
  return X_text

def get_image_features(files):
  result = []
  image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
  model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

  for file in files:
      image = Image.open('./images/'+file)
      ## fix for grayscale images
      if(len(np.array(image).shape)<=2):
        image = image.convert('RGB')

      inputs = image_processor(image, return_tensors="pt")
      with torch.no_grad():
          outputs = model(**inputs)
          result.append(outputs.last_hidden_state.numpy().mean(axis=1))
  X_image = np.concatenate(result,axis=0)
  X_image = pd.DataFrame(X_image, columns = ['image_feats_'+str(i) for i in range(X_image.shape[1])])

  return X_image

def get_image_features_archive(files):
  result = []
  image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
  model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

  for file in files:
      image = Image.open('./images/'+file)
      ## fix for grayscale images
      if(len(np.array(image).shape)<=2):
        image = image.convert('RGB')

      inputs = image_processor(image, return_tensors="pt")
      with torch.no_grad():
          outputs = model(**inputs)
          result.append(outputs.logits)
  X_image = np.concatenate(result,axis=0)
  X_image = pd.DataFrame(X_image, columns = ['image_feats_'+str(i) for i in range(X_image.shape[1])])

  return X_image

In [10]:
X_text = get_text_features(df['Description'].apply(str).tolist())
X_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 384 entries, text_feats_0 to text_feats_383
dtypes: float32(384)
memory usage: 7.3 MB


In [11]:
X_text.head()

Unnamed: 0,text_feats_0,text_feats_1,text_feats_2,text_feats_3,text_feats_4,text_feats_5,text_feats_6,text_feats_7,text_feats_8,text_feats_9,...,text_feats_374,text_feats_375,text_feats_376,text_feats_377,text_feats_378,text_feats_379,text_feats_380,text_feats_381,text_feats_382,text_feats_383
0,-0.055864,0.036113,-0.042333,0.040533,-0.002074,-0.032626,0.028647,-0.054291,-0.067406,0.018985,...,-0.006435,0.01493,0.018306,-0.016367,-0.015084,0.032759,-0.020093,-0.016441,-0.055311,0.068471
1,0.131873,-0.027114,0.072861,-0.013741,-0.0787,0.048676,0.016105,-0.070175,-0.046369,-0.024615,...,0.022209,0.035246,-0.020841,-0.075633,0.062586,0.064207,-0.00918,0.024257,-0.050916,0.081012
2,0.009019,0.001301,0.054341,0.060835,-0.075739,-0.016929,-0.007254,-0.058762,-0.027672,0.039672,...,-0.005494,-0.008231,-0.087117,-0.027242,0.056372,0.072889,0.022801,0.001415,-0.001804,0.063265
3,0.026912,-5.6e-05,0.098001,0.017524,-0.018072,0.00638,0.022057,0.018562,-0.10936,0.059209,...,-0.042486,-0.010978,0.059167,-0.060073,0.023529,0.050049,0.029518,-0.029244,-0.05395,0.075529
4,0.004821,0.042974,0.077345,0.063499,-0.032919,0.017724,-0.040053,-0.020195,-0.082335,0.081807,...,0.0479,-0.009566,0.019203,-0.012007,0.100258,0.104059,0.033488,-0.006173,-0.00994,0.085693


In [12]:
X_image = get_image_features_archive(df.image.tolist())
X_image.info()

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 1000 entries, image_feats_0 to image_feats_999
dtypes: float32(1000)
memory usage: 19.1 MB


In [13]:
X_image.head()

Unnamed: 0,image_feats_0,image_feats_1,image_feats_2,image_feats_3,image_feats_4,image_feats_5,image_feats_6,image_feats_7,image_feats_8,image_feats_9,...,image_feats_990,image_feats_991,image_feats_992,image_feats_993,image_feats_994,image_feats_995,image_feats_996,image_feats_997,image_feats_998,image_feats_999
0,-10.499865,-9.520895,-10.757131,-9.699095,-9.028884,-10.106402,-8.991809,-8.697716,-7.185675,-11.444414,...,-8.668247,-10.53858,-10.937279,-11.859535,-9.846773,-9.661818,-10.278477,-11.559929,-9.149862,-8.739675
1,-10.622572,-8.168238,-10.377387,-9.689456,-10.481196,-9.909303,-7.874952,-10.960871,-10.367256,-11.200552,...,-6.771517,-9.789744,-9.654822,-9.866491,-9.942434,-9.38368,-10.089779,-9.12589,-7.917606,-5.553684
2,-10.297453,-10.135164,-9.294044,-8.123441,-10.017184,-7.407616,-9.190135,-8.128235,-10.749225,-10.825787,...,-10.714853,-9.25855,-10.169351,-12.359625,-10.064281,-9.956339,-10.373635,-11.407882,-9.214044,-6.789722
3,-10.877998,-12.164392,-10.898578,-10.897161,-12.012531,-9.979495,-11.546069,-9.748964,-9.766516,-10.405624,...,-9.675813,-10.422476,-12.287981,-10.509497,-10.177658,-11.04265,-9.191194,-10.008609,-8.518277,-6.168948
4,-10.93565,-10.22855,-10.379015,-10.572523,-10.185298,-10.051803,-9.995068,-8.863029,-8.379204,-10.323561,...,-7.792486,-11.019537,-11.394686,-12.69593,-11.154307,-10.13538,-9.946896,-10.006126,-9.492619,-6.583462


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
structured_cols = ['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
                   'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
                   'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
                   'PhotoAmt', 'Desc_len']

In [16]:
X_full = pd.concat([df, X_text, X_image], axis=1)
X_full.shape

(5000, 1410)

In [17]:
X_train, X_test = train_test_split(X_full, test_size=0.2)
X_train.shape, X_test.shape

((4000, 1410), (1000, 1410))

In [18]:
from xgboost import XGBClassifier
clf = XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, objective='binary:logistic')
clf.fit(X_train[structured_cols], X_train['AdoptionSpeed'])

In [19]:
preds = clf.predict(X_test[structured_cols])

In [20]:
from permetrics.classification import ClassificationMetric
eval = ClassificationMetric(X_test['AdoptionSpeed'].values, preds)

In [21]:
eval.f1_score(), eval.accuracy_score()*100

(0.63114, 63.4)

In [22]:
text_cols = [col for col in X_train.columns if col.find('text')!=-1]
image_cols = [col for col in X_train.columns if col.find('image_feats')!=-1]

In [28]:
clf_full = XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.1, objective='binary:logistic')
clf_full.fit(X_train[structured_cols+text_cols
                     +image_cols
                     ], X_train['AdoptionSpeed'])

In [29]:
preds_full = clf_full.predict(X_test[structured_cols+text_cols
                                     +image_cols
                                     ])

In [30]:
eval_full = ClassificationMetric(X_test['AdoptionSpeed'].values, preds_full)
eval_full.f1_score(), eval_full.accuracy_score()*100, eval.f1_score(), eval.accuracy_score()*100

(0.6432, 64.5, 0.63114, 63.4)

In [31]:
(eval_full.f1_score()-eval.f1_score())*100/eval.f1_score()

1.9108280254777006

In [32]:
feature_importance  = pd.DataFrame({'Feature':clf_full.feature_names_in_,
                                    'Importance':clf_full.feature_importances_})
feature_importance.sort_values(['Importance'], ascending=False, inplace=True)
feature_importance

Unnamed: 0,Feature,Importance
631,image_feats_227,0.005149
1117,image_feats_713,0.004096
1203,image_feats_799,0.003915
572,image_feats_168,0.003190
2,Breed1,0.003103
...,...,...
1158,image_feats_754,0.000000
11,Dewormed,0.000000
13,Health,0.000000
940,image_feats_536,0.000000
