# **Gentoo Distro File Web Scrapping ETL**

### ● Script to produce a list of every file under - https://gentoo.osuosl.org/distfiles/

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [30]:
base_url = "https://gentoo.osuosl.org/distfiles/"
output_filename = "./gentoo_distfiles_list.csv"

In [None]:
def get_gentoo_distfiles(url):

    """
    Scrapes and retrieves a list of all files under a given URL.
    """
    try:
        page = requests.get(url).text #Extract web page as text
        data = BeautifulSoup(page,'html.parser') #Parse the text into an HTML object
        
        files = [] # entries

        for row in data.find_all('tr')[1:]:  # Skip header row
            columns = row.find_all('td')
            if columns:
                name = columns[1].text.strip() # Get the name of the file or directory
                last_modified = columns[2].text.strip() # Get the last modified date
                size = columns[3].text.strip()  # Get the size

                # Skip unwanted
                if name.lower() == "parent directory" or name == "":
                    continue

                files.append({'name': name, 'last_modified': last_modified, 'size': size})
        
        return files
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []

# Scrape the main directory
files_list = get_gentoo_distfiles(base_url)

# Print the results
for file_info in files_list:
    display(f"Name: {file_info['name']}, Last Modified: {file_info['last_modified']}, Size: {file_info['size']}")

# Save results to a CSV file

df = pd.DataFrame(files_list)
df.to_csv(output_filename, index=False)


'Name: 0a/, Last Modified: 2025-02-23 15:53, Size: -'

'Name: 0b/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 0c/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 0d/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 0e/, Last Modified: 2025-02-24 08:52, Size: -'

'Name: 0f/, Last Modified: 2025-02-23 04:53, Size: -'

'Name: 00/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 01/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 02/, Last Modified: 2025-02-24 22:54, Size: -'

'Name: 03/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 04/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 05/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 06/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 07/, Last Modified: 2025-02-23 20:53, Size: -'

'Name: 08/, Last Modified: 2025-02-23 05:52, Size: -'

'Name: 09/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 1a/, Last Modified: 2025-02-24 12:53, Size: -'

'Name: 1b/, Last Modified: 2025-02-23 11:58, Size: -'

'Name: 1c/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 1d/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 1e/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 1f/, Last Modified: 2025-02-23 00:52, Size: -'

'Name: 2a/, Last Modified: 2025-02-24 01:53, Size: -'

'Name: 2b/, Last Modified: 2025-02-23 02:52, Size: -'

'Name: 2c/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: 2d/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 2e/, Last Modified: 2025-02-22 15:53, Size: -'

'Name: 2f/, Last Modified: 2025-02-22 22:55, Size: -'

'Name: 3a/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: 3b/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 3c/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 3d/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 3e/, Last Modified: 2025-02-23 09:54, Size: -'

'Name: 3f/, Last Modified: 2025-02-24 05:53, Size: -'

'Name: 4a/, Last Modified: 2025-02-24 00:53, Size: -'

'Name: 4b/, Last Modified: 2025-02-22 10:54, Size: -'

'Name: 4c/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 4d/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 4e/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: 4f/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 5a/, Last Modified: 2025-02-23 09:53, Size: -'

'Name: 5b/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 5c/, Last Modified: 2025-02-23 01:54, Size: -'

'Name: 5d/, Last Modified: 2025-02-23 14:53, Size: -'

'Name: 5e/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 5f/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 6a/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 6b/, Last Modified: 2025-02-24 22:54, Size: -'

'Name: 6c/, Last Modified: 2025-02-23 15:53, Size: -'

'Name: 6d/, Last Modified: 2025-02-22 08:52, Size: -'

'Name: 6e/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 6f/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 7a/, Last Modified: 2025-02-22 22:56, Size: -'

'Name: 7b/, Last Modified: 2025-02-21 19:54, Size: -'

'Name: 7c/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 7d/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 7e/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: 7f/, Last Modified: 2025-02-22 22:56, Size: -'

'Name: 8a/, Last Modified: 2025-02-24 12:53, Size: -'

'Name: 8b/, Last Modified: 2025-02-24 08:54, Size: -'

'Name: 8c/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: 8d/, Last Modified: 2025-02-22 22:57, Size: -'

'Name: 8e/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 8f/, Last Modified: 2025-02-24 13:52, Size: -'

'Name: 9a/, Last Modified: 2025-02-22 23:31, Size: -'

'Name: 9b/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 9c/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 9d/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 9e/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 9f/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 10/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 11/, Last Modified: 2025-02-24 10:53, Size: -'

'Name: 12/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 13/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 14/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 15/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 16/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 17/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 18/, Last Modified: 2025-02-24 13:52, Size: -'

'Name: 19/, Last Modified: 2025-02-23 05:52, Size: -'

'Name: 20/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 21/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 22/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 23/, Last Modified: 2025-02-24 12:53, Size: -'

'Name: 24/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: 25/, Last Modified: 2025-02-21 12:03, Size: -'

'Name: 26/, Last Modified: 2025-02-23 15:53, Size: -'

'Name: 27/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 28/, Last Modified: 2025-02-24 08:53, Size: -'

'Name: 29/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: 30/, Last Modified: 2025-02-23 09:53, Size: -'

'Name: 31/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 32/, Last Modified: 2025-02-22 03:52, Size: -'

'Name: 33/, Last Modified: 2025-02-22 15:53, Size: -'

'Name: 34/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 35/, Last Modified: 2025-02-22 22:57, Size: -'

'Name: 36/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 37/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: 38/, Last Modified: 2025-02-24 08:52, Size: -'

'Name: 39/, Last Modified: 2025-02-24 08:54, Size: -'

'Name: 40/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 41/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 42/, Last Modified: 2025-02-24 09:31, Size: -'

'Name: 43/, Last Modified: 2025-02-25 07:52, Size: -'

'Name: 44/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 45/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 46/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 47/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 48/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 49/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 50/, Last Modified: 2025-02-21 18:53, Size: -'

'Name: 51/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: 52/, Last Modified: 2025-02-25 01:52, Size: -'

'Name: 53/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 54/, Last Modified: 2025-02-22 22:58, Size: -'

'Name: 55/, Last Modified: 2025-02-23 20:52, Size: -'

'Name: 56/, Last Modified: 2025-02-25 01:52, Size: -'

'Name: 57/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 58/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 59/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 60/, Last Modified: 2025-02-25 07:53, Size: -'

'Name: 61/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: 62/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: 63/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 64/, Last Modified: 2025-02-23 01:54, Size: -'

'Name: 65/, Last Modified: 2025-02-23 17:53, Size: -'

'Name: 66/, Last Modified: 2025-02-22 10:54, Size: -'

'Name: 67/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 68/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: 69/, Last Modified: 2025-02-23 03:52, Size: -'

'Name: 70/, Last Modified: 2025-02-23 09:53, Size: -'

'Name: 71/, Last Modified: 2025-02-24 10:53, Size: -'

'Name: 72/, Last Modified: 2025-02-17 21:54, Size: -'

'Name: 73/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 74/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 75/, Last Modified: 2025-02-23 01:54, Size: -'

'Name: 76/, Last Modified: 2025-02-21 19:54, Size: -'

'Name: 77/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 78/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 79/, Last Modified: 2025-02-21 13:55, Size: -'

'Name: 80/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 81/, Last Modified: 2025-02-25 00:54, Size: -'

'Name: 82/, Last Modified: 2025-02-22 23:01, Size: -'

'Name: 83/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: 84/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 85/, Last Modified: 2025-02-25 01:52, Size: -'

'Name: 86/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: 87/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: 88/, Last Modified: 2025-02-23 14:53, Size: -'

'Name: 89/, Last Modified: 2025-02-25 07:53, Size: -'

'Name: 90/, Last Modified: 2025-02-25 04:53, Size: -'

'Name: 91/, Last Modified: 2025-02-21 23:52, Size: -'

'Name: 92/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 93/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: 94/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 95/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: 96/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: 97/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: 98/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: 99/, Last Modified: 2025-02-22 13:52, Size: -'

'Name: README, Last Modified: 2023-09-14 04:14, Size: 96'

'Name: a0/, Last Modified: 2025-02-24 04:54, Size: -'

'Name: a1/, Last Modified: 2025-02-22 22:52, Size: -'

'Name: a2/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: a3/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: a4/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: a5/, Last Modified: 2025-02-22 22:55, Size: -'

'Name: a6/, Last Modified: 2025-02-25 07:53, Size: -'

'Name: a7/, Last Modified: 2025-02-24 22:54, Size: -'

'Name: a8/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: a9/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: aa/, Last Modified: 2025-02-23 15:53, Size: -'

'Name: ab/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: ac/, Last Modified: 2025-02-22 23:53, Size: -'

'Name: ad/, Last Modified: 2025-02-21 17:53, Size: -'

'Name: ae/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: af/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: b0/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: b1/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: b2/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: b3/, Last Modified: 2025-02-24 19:54, Size: -'

'Name: b4/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: b5/, Last Modified: 2025-02-23 23:53, Size: -'

'Name: b6/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: b7/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: b8/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: b9/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: ba/, Last Modified: 2025-02-24 22:54, Size: -'

'Name: bb/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: bc/, Last Modified: 2025-02-22 07:52, Size: -'

'Name: bd/, Last Modified: 2025-02-23 15:53, Size: -'

'Name: be/, Last Modified: 2025-02-25 07:53, Size: -'

'Name: bf/, Last Modified: 2025-02-23 05:52, Size: -'

'Name: c0/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: c1/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: c2/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: c3/, Last Modified: 2025-02-22 22:57, Size: -'

'Name: c4/, Last Modified: 2025-02-23 23:53, Size: -'

'Name: c5/, Last Modified: 2025-02-23 03:52, Size: -'

'Name: c6/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: c7/, Last Modified: 2025-02-24 09:53, Size: -'

'Name: c8/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: c9/, Last Modified: 2025-02-22 22:53, Size: -'

'Name: ca/, Last Modified: 2025-02-24 12:53, Size: -'

'Name: cb/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: cc/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: cd/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: ce/, Last Modified: 2025-02-23 03:52, Size: -'

'Name: cf/, Last Modified: 2025-02-24 10:53, Size: -'

'Name: d0/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: d1/, Last Modified: 2025-02-23 22:53, Size: -'

'Name: d2/, Last Modified: 2025-02-24 05:55, Size: -'

'Name: d3/, Last Modified: 2025-02-22 23:31, Size: -'

'Name: d4/, Last Modified: 2025-02-22 23:00, Size: -'

'Name: d5/, Last Modified: 2025-02-25 04:52, Size: -'

'Name: d6/, Last Modified: 2025-02-24 19:52, Size: -'

'Name: d7/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: d8/, Last Modified: 2025-02-23 08:57, Size: -'

'Name: d9/, Last Modified: 2025-02-22 23:31, Size: -'

'Name: da/, Last Modified: 2025-02-22 22:53, Size: -'

'Name: db/, Last Modified: 2025-02-22 20:53, Size: -'

'Name: dc/, Last Modified: 2025-02-22 18:52, Size: -'

'Name: dd/, Last Modified: 2025-02-24 12:53, Size: -'

'Name: de/, Last Modified: 2025-02-23 03:52, Size: -'

'Name: df/, Last Modified: 2025-02-24 08:52, Size: -'

'Name: e0/, Last Modified: 2025-02-22 23:52, Size: -'

'Name: e1/, Last Modified: 2025-02-21 17:52, Size: -'

'Name: e2/, Last Modified: 2025-02-25 07:53, Size: -'

'Name: e3/, Last Modified: 2025-02-25 04:53, Size: -'

'Name: e4/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: e5/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: e6/, Last Modified: 2025-02-22 23:01, Size: -'

'Name: e7/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: e8/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: e9/, Last Modified: 2025-02-24 05:53, Size: -'

'Name: ea/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: eb/, Last Modified: 2025-02-22 02:52, Size: -'

'Name: ec/, Last Modified: 2025-02-22 16:54, Size: -'

'Name: ed/, Last Modified: 2025-02-23 19:52, Size: -'

'Name: ee/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: ef/, Last Modified: 2025-02-25 04:52, Size: -'

'Name: f0/, Last Modified: 2025-02-24 16:54, Size: -'

'Name: f1/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: f2/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: f3/, Last Modified: 2025-02-23 22:52, Size: -'

'Name: f4/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: f5/, Last Modified: 2025-02-25 08:52, Size: -'

'Name: f6/, Last Modified: 2025-02-22 23:53, Size: -'

'Name: f7/, Last Modified: 2025-02-24 08:52, Size: -'

'Name: f8/, Last Modified: 2025-02-24 00:53, Size: -'

'Name: f9/, Last Modified: 2025-02-23 21:54, Size: -'

'Name: fa/, Last Modified: 2025-02-22 22:59, Size: -'

'Name: fb/, Last Modified: 2025-02-22 13:54, Size: -'

'Name: fc/, Last Modified: 2025-02-24 04:52, Size: -'

'Name: fd/, Last Modified: 2025-02-23 14:53, Size: -'

'Name: fe/, Last Modified: 2025-02-24 14:53, Size: -'

'Name: ff/, Last Modified: 2025-02-22 08:52, Size: -'

'Name: fifo-cronolog-1.1.1.tar.bz2, Last Modified: 2010-10-11 20:45, Size: 3.1K'

'Name: fifo-cronolog-1.2.3.tar.gz, Last Modified: 2024-03-13 05:20, Size: 4.4K'

'Name: floppym-test.txt, Last Modified: 2024-03-13 19:23, Size: 15'

'Name: layout.conf, Last Modified: 2023-09-14 05:29, Size: 38'

'Name: safecat-1.13-clang-fixes.patch, Last Modified: 2024-11-30 17:04, Size: 35K'

'Name: smartctl_exporter-0.12.0-vendor.tar.xz, Last Modified: 2024-05-16 05:40, Size: 1.3M'

'Name: timestamp.dev-local, Last Modified: 2025-02-25 08:45, Size: 49'

'Name: timestamp.mirmon, Last Modified: 2025-02-25 08:54, Size: 11'

## SQL Question

### We have:

### ● a machine learning binary classifier which takes as input an image and outputs the image quality score (from 0 to 1, where scores closer to 0 represent low-quality images, and scores closer to 1 represent high-quality images).

### ● a SQL table containing 1M unlabeled images. We run each of these images through our machine learning model to get float scores from 0 to 1 for each image. We want to prepare a new training set with some of these unlabeled images. An example of unlabeled_image_predictions (1M rows) is shown below:


| image_id | score |
|---|---|
| 242 | 0.23 |
| 123 | 0.92 |
| 248 | 0.88 |
| ... | ... | 

### Our sampling strategy is to order the images in decreasing order of scores and sample every 3rd image starting with the first from the beginning until we get 10k positive samples. And we would like to do the same in the other direction, starting from the end to get 10k negative samples.

#### Task: 

Write a SQL query that performs this sampling and creates the expected output
ordered by image_id with integer columns image_id, weak_label.

Feel free to develop in DB-Fiddle [DB-Fiddle](https://www.db-fiddle.com/) with PostgresSQL v12 or your own sql sandbox

![image3](../Screenshot%202025-02-24%20225517.png)


### SOLUTION

Using `Sqlite3` to perform the sampling and create the desired output using `SQL`. Since we want to `sample in two directions`, the solution requires a `common table expression (CTE)` and `window functions`.

The query assumes there are at least `20,000 rows` in the `unlabeled_image_predictions table` to get `10,000 samples` in each direction. The modulo value in the WHERE clauses `WHERE rn_desc % 3 = 1 - selects 1 in 3 samples`

This type of sampling is useful for reducing a large dataset to a smaller, more manageable size while attempting to maintain some representation of the overall data distribution.  In this particular case, it selects approximately one-third of the images ordered by their score in descending order.

**Defining a threshold for what constitutes "close" to 1 and 0. I assume:**

- Scores greater than or equal to 0.75 will be considered high-quality.
- Scores less than or equal to 0.25 will be considered low-quality.

In [1]:
import sqlite3
import pandas as pd
from datetime import datetime 

In [5]:
db_name = 'ml.db'
table_name = 'unlabeled_image_predictions'
csv_path = './ml_data.csv'
db_log_file = './log.txt'

In [6]:
def log_progress(message): 
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second 
    now = datetime.now() # get current timestamp 
    timestamp = now.strftime(timestamp_format) 
    with open(db_log_file,"a") as f: 
        f.write(timestamp + ' : ' + message + '\n')

In [7]:
# function to run query against the ml.db
def run_query(query_statement, sql_connection):
    display("Query Statement", query_statement)
    query_output = pd.read_sql(query_statement, sql_connection)
    display("Query Output:", query_output)

In [8]:

log_progress('Preliminaries complete. Initiating ELT process')

# Read sample data
df = pd.read_csv(csv_path)

display("Data:", df)

# TESTING
# Display the DataFrame sorted by score in descending order
sorted_df = df.sort_values(by='score', ascending=False)

# Display the sorted DataFrame
display("Data sorted in descending order based on Score", sorted_df)

log_progress('Data extraction complete. Initiating Transformation process')

# Create a SQLite database
conn = sqlite3.connect(db_name)

log_progress('SQL Connection initiated.')

df.to_sql(table_name, conn, if_exists='replace', index=False)

log_progress('Data loaded to Database as table. Running the query')

# Execute the SQL query
query_statement = f"""WITH RankedImages AS (
    SELECT
        image_id,
        score,
        ROW_NUMBER() OVER (ORDER BY score DESC) AS rn_desc,
        ROW_NUMBER() OVER (ORDER BY score ASC) AS rn_asc
    FROM {table_name}
    WHERE score BETWEEN 0 AND 1
),
PositiveSamples AS (
    SELECT
        image_id,
        1 AS weak_label
    FROM RankedImages
    WHERE rn_desc % 3 = 1 AND score >= 0.75
    LIMIT 10000
),
NegativeSamples AS (
    SELECT
        image_id,
        0 AS weak_label
    FROM RankedImages
    WHERE rn_asc % 3 = 1 AND score <= 0.25
    LIMIT 10000
)
SELECT image_id, weak_label FROM PositiveSamples
UNION ALL
SELECT image_id, weak_label FROM NegativeSamples
ORDER BY image_id
"""
run_query(query_statement, conn)

log_progress('Process Complete.')

# Close the connection
conn.close()

'Data:'

Unnamed: 0,image_id,score
0,242,0.23
1,123,0.92
2,248,0.88
3,100,0.1
4,500,0.95
5,300,0.85
6,400,0.75
7,600,0.98
8,700,0.82
9,800,0.15


'Data sorted in descending order based on Score'

Unnamed: 0,image_id,score
12,1100,0.99
7,600,0.98
4,500,0.95
1,123,0.92
2,248,0.88
5,300,0.85
8,700,0.82
6,400,0.75
11,1000,0.35
10,900,0.25


'Query Statement'

'WITH RankedImages AS (\n    SELECT\n        image_id,\n        score,\n        ROW_NUMBER() OVER (ORDER BY score DESC) AS rn_desc,\n        ROW_NUMBER() OVER (ORDER BY score ASC) AS rn_asc\n    FROM unlabeled_image_predictions\n    WHERE score BETWEEN 0 AND 1\n),\nPositiveSamples AS (\n    SELECT\n        image_id,\n        1 AS weak_label\n    FROM RankedImages\n    WHERE rn_desc % 3 = 1 AND score >= 0.75\n    LIMIT 10000\n),\nNegativeSamples AS (\n    SELECT\n        image_id,\n        0 AS weak_label\n    FROM RankedImages\n    WHERE rn_asc % 3 = 1 AND score <= 0.25\n    LIMIT 10000\n)\nSELECT image_id, weak_label FROM PositiveSamples\nUNION ALL\nSELECT image_id, weak_label FROM NegativeSamples\nORDER BY image_id\n'

'Query Output:'

Unnamed: 0,image_id,weak_label
0,123,1
1,242,0
2,700,1
3,1100,1
4,1200,0
