<a href="https://colab.research.google.com/github/0xBADC0FFEE/colabs/blob/master/CIAN_Cleaner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1. Loading BeautifulSoup and test request

from bs4 import BeautifulSoup
import requests


def load_flat_dicts(url):
    """Scrape a CIAN search-results page into ``{flat_url: [image_urls]}``.

    Every ``div`` with ``data-name="TopOfferCard"`` is treated as one offer.
    For each offer the ``src`` of the first ``<img>`` in the card is collected,
    followed by the ``src`` of every ``<img data-name="GalleryImage">``.
    The offer is keyed by the first link in the card whose ``href`` contains
    ``www.cian.ru/rent/flat/``.

    Args:
        url: Address of the search-results page to download.

    Returns:
        dict mapping offer URL -> non-empty list of image URLs, with the
        card's first image at index 0. Cards without any usable image or
        without a matching offer link are skipped.
    """
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the notebook forever.
    cian_html = requests.get(url, timeout=30).text
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(cian_html, 'html.parser')

    flats_dict = {}
    for card in soup.find_all('div', {'data-name': 'TopOfferCard'}):
        first_img = card.find('img')
        candidates = [] if first_img is None else [first_img]
        candidates.extend(card.find_all('img', {'data-name': 'GalleryImage'}))

        # Drop tags without a `src`, and de-duplicate while keeping order:
        # the first image may itself be a gallery image, and counting it
        # twice would skew any per-flat average computed from this list.
        srcs = (img.get('src') for img in candidates)
        flat_imgs = list(dict.fromkeys(src for src in srcs if src))
        if not flat_imgs:
            continue

        for a in card.find_all('a'):
            # `.get` tolerates anchors that carry no href attribute at all.
            if 'www.cian.ru/rent/flat/' in a.get('href', ''):
                flats_dict[a['href']] = flat_imgs
                break

    return flats_dict

# Try the scraper on a sample search (query parameters are passed to CIAN as-is).
flats_dict = load_flat_dicts(
    'https://www.cian.ru/cat.php'
    '?currency=2&deal_type=rent&engine_version=2'
    '&maxprice=25000&minprice=15000&offer_type=flat'
    '&region=1&room1=1&room2=1&type=4'
)

In [None]:
%%time
#2. Building model and predict one image

from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import GlobalAveragePooling2D, BatchNormalization, Activation, Dense, Dropout
from tensorflow.keras.preprocessing import image
import cv2
import numpy as np
import efficientnet.tfkeras as efn
import io
from urllib.request import urlopen

#0.96
def build_model_efficientnet():
    """Assemble and compile a two-class classifier on a frozen EfficientNetB0.

    The backbone is created with ImageNet weights and without its top, then
    marked non-trainable. On top of its output go, in order: global average
    pooling, a 512-unit ReLU dense layer, batch normalisation, a further ReLU
    activation, dropout with rate 0.5, and a 2-unit softmax layer.

    Returns:
        The Keras ``Model``, compiled with the 'adam' optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    backbone = efn.EfficientNetB0(weights='imagenet', include_top=False)
    backbone.trainable = False

    # Layers are instantiated in the same order in which they are applied.
    head_layers = (
        GlobalAveragePooling2D(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.5),
        Dense(2, activation='softmax'),
    )

    tensor = backbone.output
    for layer in head_layers:
        tensor = layer(tensor)

    classifier = Model(inputs=backbone.input, outputs=tensor)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Load the saved classifier straight from disk. The network previously built
# with build_model_efficientnet() was overwritten by this assignment right
# away, so constructing it (and fetching ImageNet weights for it) was wasted
# work and has been dropped.
# NOTE(review): deserialising the EfficientNet layers relies on the
# `efficientnet.tfkeras` import at the top of this cell - confirm it stays.
model = load_model('cian_model_01_B0.h5')
print('Model Loaded')


def _fast_expand(img):
    """Turn an image into an array divided by 255 with a new leading axis.

    Args:
        img: Image accepted by ``image.img_to_array``.

    Returns:
        The converted array, scaled by 1/255, with an extra axis 0.
    """
    scaled = image.img_to_array(img) / 255.0
    return np.expand_dims(scaled, axis=0)


def pseudo_download_image(url):
    """Fetch an encoded image over HTTP and decode it in memory.

    Nothing is written to disk: the response body is read into a byte buffer
    and handed to ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.

    Args:
        url: Address of the image to fetch.

    Returns:
        Whatever ``cv2.imdecode`` returns for the downloaded bytes.
        NOTE(review): OpenCV is expected to return None when the bytes are
        not a decodable image - callers should check before using the result.
    """
    # The context manager closes the HTTP response even if reading fails;
    # previously the connection was left open until garbage collection.
    with urlopen(url) as resp:
        payload = resp.read()

    # Deliberately not named `image`, which would shadow the module-level
    # Keras `image` import used elsewhere in this cell.
    encoded = np.asarray(bytearray(payload), dtype="uint8")
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)



def predict_image(url, img_size=320):
    """Download one image and return the index of its highest-scoring class.

    The image is fetched with ``pseudo_download_image``, converted from BGR
    to RGB, resized to ``img_size`` x ``img_size``, scaled and batched by
    ``_fast_expand`` and passed to the module-level ``model``.

    Args:
        url: Address of the image to classify.
        img_size: Side length, in pixels, of the square the image is resized
            to before prediction. Defaults to 320, the value that used to be
            hard-coded.

    Returns:
        ``np.argmax`` of the model output for this single image.
        NOTE(review): a commented-out line in the original suggests the
        classes are ['bad', 'good'], i.e. 1 means "good" - confirm against
        the training setup.

    Raises:
        ValueError: If no image could be decoded from ``url``.
    """
    open_cv_image = pseudo_download_image(url)
    if open_cv_image is None:
        # Fail with a message that names the offending URL instead of an
        # opaque OpenCV assertion from cvtColor further down.
        raise ValueError(f'Could not decode an image from {url}')

    rgb_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb_image, (img_size, img_size))
    batch = _fast_expand(resized)
    class_scores = model.predict(batch, batch_size=1)

    return np.argmax(class_scores)


# Sanity-check the whole download -> preprocess -> predict path on one
# hard-coded listing photo.
predict_image('https://cdn-p.cian.site/images/8/766/147/kvartira-moskva-lomonosovskiy-prospekt-741667887-4.jpg')

In [None]:
#3. Predict for all flats from result page

from IPython.core.display import display, HTML


def show_results(scores, threshold=0.5):
    """Display the flats whose score reaches ``threshold`` as inline HTML.

    Each shown flat is rendered as its thumbnail (250 px wide) followed by a
    link to the offer that opens in a new tab and shows the URL and score.

    Args:
        scores: Mapping of flat URL -> dict with the keys ``'thumb'``
            (thumbnail image URL) and ``'score'`` (number compared against
            ``threshold``).
        threshold: Minimum score, inclusive, for a flat to be displayed.
    """
    # Imported locally so the function carries its own dependency.
    from html import escape

    out_html = []
    for f, scr in scores.items():
        img_thumb = scr['thumb']
        f_score = scr['score']
        if f_score >= threshold:
            # Both URLs come from a scraped web page: escape them so a stray
            # quote or angle bracket cannot break out of the attribute or
            # inject markup into the notebook output.
            safe_thumb = escape(str(img_thumb), quote=True)
            safe_link = escape(str(f), quote=True)
            out_html.append(
                f'<img src="{safe_thumb}" width=250 /><br>'
                f'<a href="{safe_link}" target="_blank">'
                f'{safe_link} //// Score: {f_score}</a>'
            )

    display(HTML('<br>'.join(out_html)))

    
###### MAIN PART    
# Search page whose offers get scored.
cian_url = (
    'https://www.cian.ru/cat.php'
    '?currency=2&deal_type=rent&engine_version=2'
    '&maxprice=55000&minprice=40000&offer_type=flat'
    '&region=1&room1=1&room2=1&room3=1&type=4'
)
result_scores = {}
flats_dict = load_flat_dicts(cian_url)

# A flat's score is the mean predicted class index over all of its images;
# the first image of the flat doubles as its thumbnail.
for f, urls in flats_dict.items():
    data = {'score': 0, 'thumb': urls[0]}
    predictions = [predict_image(url) for url in urls]
    final_score = sum(predictions) / len(urls)
    data['score'] = final_score
    result_scores[f] = data
    print(f'Score for {f}: {final_score}')


show_results(result_scores)
    