# Krisha.kz Scraper Runner

This notebook uses the scraper defined in `app/krisha_parser.py` to collect listings from krisha.kz and extract structured features, including YEAR, LONGITUDE, LATITUDE, TOTAL AREA, ROOMS, FLOOR, TOTAL_FLOORS, FURNITURE, CONDITION, CEILING, MATERIAL, CITY and more.

Notes:
- Keep request rates respectful (see `delay_range` below) and avoid excessive requests.
- Set `pages` and `max_listings` to small numbers while testing.
- The scraper relies on the current site structure and may need tweaks over time.

In [5]:
# Install a pandas version compatible with current NumPy (fixes binary mismatch errors).
# Best-effort cell: every failure is reported with a message instead of aborting the notebook.
# Prefer running it on a fresh kernel, since a module that is already imported
# is not replaced by reinstalling its package on disk.
import sys, os, subprocess
import importlib
print(sys.version)
try:
    # Recorded before installing so we can tell whether a stale pandas is already loaded.
    pandas_preloaded = "pandas" in sys.modules
    # Pandas 2.2.x supports NumPy 2.x; pick a stable patch
    pip_result = subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas==2.2.3"], check=False)
    if pip_result.returncode != 0:
        # check=False keeps the cell best-effort, but the pip failure must still be visible;
        # otherwise the import error below would be the only (misleading) symptom.
        print("pip install pandas==2.2.3 failed with exit code", pip_result.returncode)
    elif pandas_preloaded:
        print("pandas was already imported in this kernel; restart the kernel to load the installed version")
    # Files written by pip after interpreter start may be missed by the import finders' caches.
    importlib.invalidate_caches()
    import pandas as _pd
    print("pandas", _pd.__version__, "ready")
except Exception as e:
    # Broad on purpose: this is a diagnostic check, so any failure is reported rather than raised.
    print("Pandas install check failed:", e)

3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
Pandas install check failed: C extension: pandas.compat._constants not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.
Pandas install check failed: C extension: pandas.compat._constants not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.


In [6]:
# Put the local 'app' directory (relative to the current working directory)
# at the front of the module search path, then load the scraper class from it.
import sys, os
app_path = os.path.join(os.getcwd(), 'app')
if sys.path.count(app_path) == 0:
    # Prepend so this folder wins over any same-named module elsewhere on the path.
    sys.path[:0] = [app_path]
from krisha_parser import KrishaScraper
print('KrishaScraper imported from', app_path)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\00055794\AppData\Local\anaconda3__\Lib\site-packages\pandas\__init__.py", line 39, in <module>
    from pandas.compat import (
  File "c:\Users\00055794\AppData\Local\anaconda3__\Lib\site-packages\pandas\compat\__init__.py", line 17, in <module>
    from pandas.compat._constants import (
ImportError: cannot import name 'ISMUSL' from 'pandas.compat._constants' (c:\Users\00055794\AppData\Local\anaconda3__\Lib\site-packages\pandas\compat\_constants.py)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\00055794\AppData\Local\anaconda3__\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\00055794\AppData\Local\Temp\ipykernel_24172\3988191431.py", line 6, in <module>
    from krisha_parser import KrishaScraper
  File "c:\Users\00055794\Desktop\Gulnaz\PROJECTS\House_Prices\10

In [None]:
# Scraping parameters — everything a reader may want to tune lives in this cell.

# Search results page to start from; swap in your own filtered search URL.
search_url = 'https://krisha.kz/prodazha/kvartiry/almaty/'

# How many result pages to scan.
pages = 1

# Upper bound on the number of listings (None means scrape everything found).
max_listings = 20

# (min, max) seconds for the polite random pause between requests.
delay_range = (1.0, 2.5)

# Destination file; the extension (.csv or .parquet) selects the format.
output_path = 'krisha_listings.csv'

In [None]:
# Build the scraper with the configured delay, collect the listings into `df`,
# report how many rows came back, and preview the first ten.
scraper = KrishaScraper(delay_range=delay_range)
scrape_options = {'pages': pages, 'max_listings': max_listings}
df = scraper.scrape(search_url, **scrape_options)
print(f'Scraped {len(df)} rows')
df.head(10)

In [None]:
# Persist `df` to `output_path`: Parquet when the extension is .parquet
# (case-insensitive), CSV for anything else. Parent folders are created as needed.
from pathlib import Path
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
wants_parquet = out.suffix.lower() == '.parquet'
# Pick the writer method first, then call it once with the shared arguments.
writer = df.to_parquet if wants_parquet else df.to_csv
writer(out, index=False)
print(f'Saved to {out.resolve()}')

In [None]:
# Sanity check: list the scraped columns, report which requested features are absent,
# and preview the requested columns (or a plain head when some are missing).
requested = ['YEAR','LONGITUDE','LATITUDE','TOTAL AREA','ROOMS','FLOOR','TOTAL_FLOORS','FURNITURE','CONDITION','CEILING','MATERIAL','CITY']
missing = [c for c in requested if c not in df.columns]
print('Columns:', sorted(df.columns))
print('Missing requested features:', missing)
# An empty `missing` list means every requested column is present.
df.head(5) if missing else df[requested].head(10)

In [None]:
# Optional: run the CLI from inside the notebook (uncomment to use)
# Uses search_url, pages, max_listings and output_path from the configuration cell,
# and launches the script with the same interpreter as the kernel (sys.executable).
# NOTE(review): str(max_listings) becomes the literal "None" when max_listings is None —
# confirm how app/scrape_krisha.py parses --max-listings before relying on that.
# import sys, subprocess
# cmd = [sys.executable, "app/scrape_krisha.py", search_url, "--pages", str(pages), "--max-listings", str(max_listings), "--out", output_path]
# print("Running:", " ".join(cmd))
# subprocess.run(cmd, check=True)