 # requirements

In [4]:
# base
import sys
import os
# cleaning
import numpy as np
import pandas as pd
import random
import re
#graph
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
# store
import pickle
# embbeds
import torch
import torch.nn.functional as F
import open_clip
# modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
# ensamble
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
MODEL_ID = "hf-hub:timm/ViT-B-16-SigLIP-i18n-256"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, preprocess = open_clip.create_model_from_pretrained(
    MODEL_ID,
    device=device,
    precision='fp16'
)
model.to(device)
model.eval()
tokenizer = open_clip.get_tokenizer(MODEL_ID)

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [7]:
def get_prompt_embeddings(prompt: str):
    text_input = tokenizer(prompt).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_input)
        text_features = text_features/text_features.norm(dim=1, keepdim=True)
    return text_features

 # data

In [8]:
df = pd.read_csv('../data/train.csv')
df

Unnamed: 0,uid,city,description,homeType,latitude,longitude,garageSpaces,hasSpa,yearBuilt,numOfPatioAndPorchFeatures,lotSizeSqFt,avgSchoolRating,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,priceRange
0,1748,austin,MULTIPLE OFFERS submit best & final to Agent b...,Single Family,30.380089,-97.800621,0,False,1988,0,102366.0,7.000000,17,4.0,4,650000+
1,13380,austin,"4644 Hoffman Dr, Austin, TX 78749 is a single ...",Single Family,30.199486,-97.859947,0,False,1997,0,6534.0,6.666667,16,3.0,4,350000-450000
2,4115,austin,"6804 Canal St, Austin, TX 78741 is a single fa...",Single Family,30.227398,-97.696083,0,False,1952,0,5619.0,3.333333,11,1.0,2,0-250000
3,6926,austin,Beautiful large lot with established trees. Lo...,Single Family,30.205469,-97.792351,4,False,1976,0,6416.0,4.000000,14,2.0,4,0-250000
4,14480,austin,Stunning NW Hills designer remodel by Cedar an...,Single Family,30.345106,-97.767426,2,False,1984,0,10759.0,7.000000,16,3.0,5,650000+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8614,austin,Updated Great Hills Opportunity! Large corner ...,Single Family,30.409927,-97.763390,2,False,1982,3,15246.0,7.000000,17,3.0,3,650000+
9996,10505,austin,Adorable 3/2 in the heart of South Austin! Sit...,Single Family,30.206074,-97.779472,0,False,1970,0,6577.0,4.000000,14,2.0,3,250000-350000
9997,6942,austin,"7322 Gaines Mill Ln, Austin, TX 78745 is a sin...",Single Family,30.198936,-97.810913,0,False,1981,1,7623.0,4.000000,13,2.0,3,350000-450000
9998,1845,austin,"5213 Doe Valley Ln, Austin, TX 78759 is a sing...",Single Family,30.391880,-97.761292,0,False,1979,2,13939.2,7.000000,17,3.0,3,250000-350000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   uid                         10000 non-null  int64  
 1   city                        10000 non-null  object 
 2   description                 9999 non-null   object 
 3   homeType                    10000 non-null  object 
 4   latitude                    10000 non-null  float64
 5   longitude                   10000 non-null  float64
 6   garageSpaces                10000 non-null  int64  
 7   hasSpa                      10000 non-null  bool   
 8   yearBuilt                   10000 non-null  int64  
 9   numOfPatioAndPorchFeatures  10000 non-null  int64  
 10  lotSizeSqFt                 10000 non-null  float64
 11  avgSchoolRating             10000 non-null  float64
 12  MedianStudentsPerTeacher    10000 non-null  int64  
 13  numOfBathrooms              1000

In [11]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
uid,10000.0,7626.6219,4380.486852,1.0,3838.75,7603.5,11435.75,15170.0
latitude,10000.0,30.291191,0.097075,30.08503,30.202516,30.283664,30.366375,30.51732
longitude,10000.0,-97.778476,0.084543,-98.020477,-97.838594,-97.76968,-97.718313,-97.57063
garageSpaces,10000.0,1.2296,1.328179,0.0,0.0,1.0,2.0,22.0
yearBuilt,10000.0,1988.5704,21.515272,1905.0,1975.0,1993.0,2006.0,2020.0
numOfPatioAndPorchFeatures,10000.0,0.6672,0.986378,0.0,0.0,0.0,1.0,8.0
lotSizeSqFt,10000.0,20710.28398,448833.815153,100.0,6534.0,8189.0,10890.0,34154520.0
avgSchoolRating,10000.0,5.766236,1.86194,2.333333,4.0,5.666667,7.0,9.5
MedianStudentsPerTeacher,10000.0,14.8577,1.748473,10.0,14.0,15.0,16.0,19.0
numOfBathrooms,10000.0,2.6921,0.979206,1.0,2.0,3.0,3.0,10.0


 # feature selection
 ## objective variable

In [21]:
priceRange_map = {i: priceRange for priceRange, i in df['priceRange'].value_counts(dropna=False).sort_values().reset_index().to_dict()['priceRange'].items()}
df['priceRange'] = df['priceRange'].map(priceRange_map)
df[['priceRange']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
priceRange,10000.0,2.2696,1.334726,0.0,1.0,2.0,3.0,4.0


 ## predictive features

In [22]:
df['description']

0       MULTIPLE OFFERS submit best & final to Agent b...
1       4644 Hoffman Dr, Austin, TX 78749 is a single ...
2       6804 Canal St, Austin, TX 78741 is a single fa...
3       Beautiful large lot with established trees. Lo...
4       Stunning NW Hills designer remodel by Cedar an...
                              ...                        
9995    Updated Great Hills Opportunity! Large corner ...
9996    Adorable 3/2 in the heart of South Austin! Sit...
9997    7322 Gaines Mill Ln, Austin, TX 78745 is a sin...
9998    5213 Doe Valley Ln, Austin, TX 78759 is a sing...
9999    Beautiful single-story home with open floor pl...
Name: description, Length: 10000, dtype: object

 ## predictive features

In [23]:
df['description'][0]

'MULTIPLE OFFERS submit best & final to Agent by Mon 21st - 5pm. Appt with Agent.  RARE PANORAMIC VIEW LOT IN JESTER ESTATES SEE FOR MILES!!  Home sits on Cul-de-sac & backs to a Preserve.  Stunning remodeled Kitchen & Bathrooms. Master suite is a private sanctuary with chic master bath, huge bedroom, walk-in closet & private deck.  Jester has a pool, park, tennis courts & feeds into Anderson High.  This home has been well loved & features 3 living areas, an office, & 3 car garage.'

 ## predictive features

In [24]:
get_prompt_embeddings(df['description'][0])[0]

tensor([-2.1191e-03,  1.2028e-04,  3.1757e-03,  8.7433e-03, -2.4185e-02,
        -2.8777e-04,  2.1240e-02, -2.6917e-02, -3.5156e-02,  1.8692e-02,
         7.9269e-03, -6.3210e-03, -2.8946e-02, -5.0201e-02, -2.6688e-02,
        -3.8662e-03,  1.3809e-02, -3.1204e-03, -1.5503e-02,  1.3412e-02,
         3.8940e-02, -1.4221e-02,  9.7046e-03,  1.9836e-02,  2.5787e-02,
         3.2593e-02, -2.1301e-02,  3.1464e-02,  3.4210e-02, -6.1150e-03,
         8.5831e-03,  1.8417e-02,  3.8433e-03,  3.2104e-02,  1.4210e-03,
         4.7424e-02, -9.6436e-03,  2.1774e-02, -9.4147e-03, -4.0283e-03,
         4.5807e-02,  2.0813e-02, -3.9703e-02, -1.5778e-02,  2.2491e-02,
         4.7073e-03, -4.8676e-03, -4.6196e-03,  3.3112e-02,  1.3161e-02,
         1.4725e-02,  2.9175e-02, -6.5269e-03, -3.0960e-02,  4.2939e-04,
        -3.4790e-02, -3.9825e-03, -3.8071e-03, -1.8387e-02,  1.3781e-03,
        -3.6652e-02,  3.7140e-02,  7.0190e-03, -5.9662e-03, -2.5009e-02,
         4.0855e-03,  2.9968e-02,  4.0100e-02, -2.1

 ## predictive features

In [25]:
len(get_prompt_embeddings(df['description'][0])[0])

768

In [26]:
[get_prompt_embeddings(x) for x in df['description'][:10]]

[tensor([[-2.1191e-03,  1.2028e-04,  3.1757e-03,  8.7433e-03, -2.4185e-02,
          -2.8777e-04,  2.1240e-02, -2.6917e-02, -3.5156e-02,  1.8692e-02,
           7.9269e-03, -6.3210e-03, -2.8946e-02, -5.0201e-02, -2.6688e-02,
          -3.8662e-03,  1.3809e-02, -3.1204e-03, -1.5503e-02,  1.3412e-02,
           3.8940e-02, -1.4221e-02,  9.7046e-03,  1.9836e-02,  2.5787e-02,
           3.2593e-02, -2.1301e-02,  3.1464e-02,  3.4210e-02, -6.1150e-03,
           8.5831e-03,  1.8417e-02,  3.8433e-03,  3.2104e-02,  1.4210e-03,
           4.7424e-02, -9.6436e-03,  2.1774e-02, -9.4147e-03, -4.0283e-03,
           4.5807e-02,  2.0813e-02, -3.9703e-02, -1.5778e-02,  2.2491e-02,
           4.7073e-03, -4.8676e-03, -4.6196e-03,  3.3112e-02,  1.3161e-02,
           1.4725e-02,  2.9175e-02, -6.5269e-03, -3.0960e-02,  4.2939e-04,
          -3.4790e-02, -3.9825e-03, -3.8071e-03, -1.8387e-02,  1.3781e-03,
          -3.6652e-02,  3.7140e-02,  7.0190e-03, -5.9662e-03, -2.5009e-02,
           4.0855e-03,  2

In [27]:
pathfile = '../data/1738232105_tuple_embbeds.pkl'
with open(pathfile, 'rb') as file:
    uploaded_embbeds = pickle.load(file)

In [28]:
type(uploaded_embbeds)

dict

In [30]:
list(uploaded_embbeds.keys())[:10]

[1748, 13380, 4115, 6926, 14480, 13448, 1996, 11353, 4643, 3118]

In [31]:
df['uid'][:10]

0     1748
1    13380
2     4115
3     6926
4    14480
5    13448
6     1996
7    11353
8     4643
9     3118
Name: uid, dtype: int64

In [32]:
list(uploaded_embbeds.values())[:1][-760:]

[[-0.00212,
  0.0001203,
  0.003176,
  0.00874,
  -0.02419,
  -0.0002878,
  0.02124,
  -0.02692,
  -0.03516,
  0.01869,
  0.00793,
  -0.00632,
  -0.02895,
  -0.0502,
  -0.02669,
  -0.003866,
  0.01381,
  -0.00312,
  -0.0155,
  0.01341,
  0.03894,
  -0.01422,
  0.009705,
  0.01984,
  0.02579,
  0.0326,
  -0.0213,
  0.03146,
  0.0342,
  -0.006115,
  0.00858,
  0.01842,
  0.003843,
  0.0321,
  0.001421,
  0.04742,
  -0.00964,
  0.02177,
  -0.009415,
  -0.00403,
  0.0458,
  0.02081,
  -0.0397,
  -0.01578,
  0.02249,
  0.004707,
  -0.004868,
  -0.00462,
  0.0331,
  0.01316,
  0.014725,
  0.02917,
  -0.006527,
  -0.03096,
  0.0004294,
  -0.0348,
  -0.003983,
  -0.003807,
  -0.01839,
  0.001378,
  -0.03665,
  0.03714,
  0.00702,
  -0.005966,
  -0.02501,
  0.004086,
  0.02997,
  0.0401,
  -0.02124,
  -0.02846,
  0.0005035,
  0.01953,
  0.002785,
  -0.01183,
  0.01544,
  -0.02696,
  0.01917,
  -0.0163,
  -0.00813,
  0.005184,
  0.04745,
  0.003193,
  -0.0433,
  0.001337,
  -0.058,
  -0.03748,
 

In [33]:
df['embbed_features'] = df['uid'].map(uploaded_embbeds)
df['embbed_features']

0       [-0.00212, 0.0001203, 0.003176, 0.00874, -0.02...
1       [-0.05307, 0.03354, 0.03824, -0.004944, -0.058...
2       [-0.04028, 0.02869, 0.03516, 0.01851, -0.05032...
3       [-0.014435, 0.04114, 0.01441, 0.001346, -0.066...
4       [-0.01271, 0.02371, -0.02356, -0.007164, -0.02...
                              ...                        
9995    [-0.010025, 0.05133, 0.02106, 0.01465, -0.0673...
9996    [-0.01572, 0.01168, 0.02174, 0.0198, -0.0306, ...
9997    [-0.04633, 0.05197, 0.04196, 0.01707, -0.0628,...
9998    [-0.0534, 0.03073, 0.04343, 0.01219, -0.05917,...
9999    [-0.01214, 0.03958, -0.00664, 0.00759, -0.0598...
Name: embbed_features, Length: 10000, dtype: object

 # sampling

In [34]:
df.columns

Index(['uid', 'city', 'description', 'homeType', 'latitude', 'longitude',
       'garageSpaces', 'hasSpa', 'yearBuilt', 'numOfPatioAndPorchFeatures',
       'lotSizeSqFt', 'avgSchoolRating', 'MedianStudentsPerTeacher',
       'numOfBathrooms', 'numOfBedrooms', 'priceRange', 'embbed_features'],
      dtype='object')

In [35]:
df_train = df[['priceRange', 'embbed_features']].copy()
df_train

Unnamed: 0,priceRange,embbed_features
0,1,"[-0.00212, 0.0001203, 0.003176, 0.00874, -0.02..."
1,3,"[-0.05307, 0.03354, 0.03824, -0.004944, -0.058..."
2,0,"[-0.04028, 0.02869, 0.03516, 0.01851, -0.05032..."
3,0,"[-0.014435, 0.04114, 0.01441, 0.001346, -0.066..."
4,1,"[-0.01271, 0.02371, -0.02356, -0.007164, -0.02..."
...,...,...
9995,1,"[-0.010025, 0.05133, 0.02106, 0.01465, -0.0673..."
9996,4,"[-0.01572, 0.01168, 0.02174, 0.0198, -0.0306, ..."
9997,3,"[-0.04633, 0.05197, 0.04196, 0.01707, -0.0628,..."
9998,4,"[-0.0534, 0.03073, 0.04343, 0.01219, -0.05917,..."


In [36]:
X = np.array(list(df_train['embbed_features']))
y = np.array(list(df_train['priceRange']))
print(f'X shape: {X.shape}, y shape: {y.shape}')

X shape: (10000, 768), y shape: (10000,)


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=2025)

 # modeling

In [38]:
rf = RandomForestClassifier(
    random_state=2025,
    n_jobs=-1,
    class_weight='balanced'
)

In [43]:
param_grid = {
    'max_features':['log2'],
    'max_depth': [5],
    'min_samples_leaf': [0.001],
    'min_samples_split': [15],
    'criterion':['gini'],
    'n_estimators':[50]
}

In [44]:
CV_rfc = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5
)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              n_jobs=-1, random_state=2025),
             param_grid={'criterion': ['gini'], 'max_depth': [5],
                         'max_features': ['log2'], 'min_samples_leaf': [0.001],
                         'min_samples_split': [15], 'n_estimators': [50]},
             scoring='accuracy')

In [46]:
print(f"Best params {CV_rfc.best_params_}")

Best params {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.001, 'min_samples_split': 15, 'n_estimators': 50}


In [47]:
print(f"Best score {CV_rfc.best_score_}")

Best score 0.3221428571428572


In [49]:
y_hat = CV_rfc.predict(X_test)
print(f'Accuracy for Random fores on test data is: {accuracy_score(y_test,y_hat)}')

Accuracy for Random fores on test data is: 0.29933333333333334


In [52]:
print(f"F1-score for Random fores on test data is: {f1_score(y_test,y_hat, average='micro')}")

F1-score for Random fores on test data is: 0.29933333333333334


In [54]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.20      0.45      0.28       362
           1       0.39      0.60      0.47       550
           2       0.28      0.21      0.24       642
           3       0.28      0.12      0.17       698
           4       0.34      0.25      0.29       748

    accuracy                           0.30      3000
   macro avg       0.30      0.33      0.29      3000
weighted avg       0.30      0.30      0.28      3000



 # saving

In [55]:
timestamp = int(datetime.utcnow().timestamp())
pathfile = f'../data/{timestamp}_rf_op768_description.pkl'
print(f'the model is saved in :{pathfile}')
pickle.dump(CV_rfc, open(pathfile, 'wb'))

the model is saved in :../data/1738236881_rf_op768_description.pkl


 # local validation

In [56]:
pathfile = '../data/1738236881_rf_op768_description.pkl'
with open(pathfile, 'rb') as file:
    uploaded_cv_rfc = pickle.load(file)

In [57]:
y_hat = uploaded_cv_rfc.predict(X_test)
print(f'Accuracy for Random fores on test data is: {accuracy_score(y_test,y_hat)}')

Accuracy for Random fores on test data is: 0.29933333333333334


In [58]:
print(f"F1-score for Random fores on test data is: {f1_score(y_test,y_hat, average='micro')}")

F1-score for Random fores on test data is: 0.29933333333333334


In [59]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       0.20      0.45      0.28       362
           1       0.39      0.60      0.47       550
           2       0.28      0.21      0.24       642
           3       0.28      0.12      0.17       698
           4       0.34      0.25      0.29       748

    accuracy                           0.30      3000
   macro avg       0.30      0.33      0.29      3000
weighted avg       0.30      0.30      0.28      3000

