In [2]:
# WARNING: permanently deletes the ./images/BMW folder and everything in it.
# The explicit "!" shell escape does not depend on IPython's automagic setting.
!rm -r ./images/BMW

In [1]:
import threading

class SharedListBrandDic:
    """Rows of one CSV file plus the shared lists that worker threads use.

    Attributes:
        csv_file: Path of the CSV file the rows are read from.
        allRows: Every row of the file, as produced by ``csv.DictReader``.
        data: Rows currently queued for processing (filled by ``select_data``).
        nb_rows: Number of rows held in ``allRows``.
        new_rows: Starts empty; workers append successfully processed rows.
        error_rows: Starts empty; workers append rows that failed.
        lock: Lock held by workers while they read/pop ``data``.
        lock_new_rows: Lock held by workers while they append to ``new_rows``.
        lock_error_rows: Lock held by workers while they append to
            ``error_rows``.
    """

    def __init__(self, csv_file):
        """Create the empty shared lists and locks, then load ``csv_file``."""
        self.csv_file = csv_file
        self.allRows = []
        self.data = []
        self.nb_rows = 0
        self.new_rows = []
        self.error_rows = []
        self.lock = threading.Lock()
        self.lock_new_rows = threading.Lock()
        self.lock_error_rows = threading.Lock()

        self.load_data()

    def load_data(self):
        """Append every row of ``self.csv_file`` to ``allRows`` and update ``nb_rows``.

        The file is read as UTF-8 and each row becomes a dict keyed by the
        header line.
        """
        # Imported here so the class works even when no other cell has
        # imported ``csv`` yet.
        import csv

        with open(self.csv_file, newline='', encoding='utf-8') as f:
            self.allRows.extend(csv.DictReader(f))
        self.nb_rows = len(self.allRows)

    def select_data(self, start, end):
        """Queue the rows ``allRows[start:end]`` at the end of ``data``.

        ``end`` may exceed the number of rows: slicing clamps it to the end
        of the list, so no explicit bound check is needed.
        """
        self.data = self.data + self.allRows[start:end]

In [2]:
# Scratch check: "+" on two lists returns a new list holding the elements of both, in order.
[1,3,"11"] + [1.2,"1"]

[1, 3, '11', 1.2, '1']

In [3]:
import threading
import os

class ProxyLoader:
    """Load authenticated proxies from a text file and hand them out by index.

    Every useful line of the file has the form ``host:port:user:password``.

    Attributes:
        file_path: Path of the proxy list file.
        proxies: List of dicts with the keys ``PROXY_HOST``, ``PROXY_PORT``,
            ``PROXY_USER`` and ``PROXY_PASS``.
        lock: ``threading.Lock`` that callers may hold around ``get_proxy``.
    """

    def __init__(self, file_path):
        """Remember ``file_path`` and load the proxies it contains.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
        """
        self.file_path = file_path
        self.proxies = []
        # Created before loading so the attribute exists as soon as possible.
        self.lock = threading.Lock()
        self.load_proxies()

    def load_proxies(self):
        """(Re)read ``self.file_path`` and replace ``self.proxies``.

        Blank lines are ignored; any other line that does not split into
        exactly four ``:``-separated parts is reported on stdout and skipped.

        Raises:
            FileNotFoundError: If ``self.file_path`` is not an existing file.
        """
        if not os.path.isfile(self.file_path):
            raise FileNotFoundError(f"Le fichier {self.file_path} n'existe pas.")

        loaded = []
        with open(self.file_path, 'r') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    # An empty line is not a malformed proxy entry.
                    continue
                parts = line.split(':')
                if len(parts) == 4:
                    loaded.append({
                        "PROXY_HOST": parts[0],
                        "PROXY_PORT": parts[1],
                        "PROXY_USER": parts[2],
                        "PROXY_PASS": parts[3]
                    })
                else:
                    print(f"Format de ligne incorrect: {line}")
        self.proxies = loaded

    def get_proxy(self, index):
        """Return the proxy at ``index``, wrapping around the list length.

        Raises:
            IndexError: If no proxy was loaded (instead of an unexplained
                ``ZeroDivisionError`` from the modulo below).
        """
        if not self.proxies:
            raise IndexError(f"Aucun proxy valide n'a été chargé depuis {self.file_path}.")
        # The modulo keeps the index within the bounds of the list.
        return self.proxies[index % len(self.proxies)]

In [4]:
import threading
import requests
import os

class WorkerThread(threading.Thread):
    """Thread that pops rows from a shared list and downloads each row's image.

    Each row is a dict. When its ``first_img_src`` entry is non-empty the
    image is downloaded (through a proxy when ``use_proxy`` is true), stored
    under ``./images/<brand>/`` and the saved path is written to the row's
    ``products_img`` entry. Rows that were handled without error are appended
    to ``shared_list.new_rows``; the others go to ``shared_list.error_rows``.
    """

    # Shared by every worker: serialises the writes of image files to disk.
    lock_save_image = threading.Lock()

    # Seconds to wait for a response before a download is treated as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self, thread_id, shared_list, proxyLoader):
        """Store the collaborators; no work starts until ``start()``/``run()``.

        Args:
            thread_id: Identifier of the worker; ``thread_id - 1`` is the
                index passed to ``proxyLoader.get_proxy``.
            shared_list: Object exposing ``data``, ``new_rows``,
                ``error_rows`` and the locks ``lock``, ``lock_new_rows``,
                ``lock_error_rows``.
            proxyLoader: Object exposing ``lock`` and ``get_proxy(index)``.
        """
        super().__init__()

        self.thread_id = thread_id
        self.shared_list = shared_list
        self.proxyLoader = proxyLoader

        self.row = None
        self.row_without_error = True

        self.use_proxy = True
        self.proxy = None
        self.PROXY_HOST = None # rotating proxy or host
        self.PROXY_PORT = None # port
        self.PROXY_USER = None # username
        self.PROXY_PASS = None # password

    def run(self):
        """Process rows until ``shared_list.data`` is empty."""
        while True:
            # Forget the previous row so the error handler below can never
            # re-queue a row that has already been handled.
            self.row = None
            try:
                # Access the shared list synchronously
                with self.shared_list.lock:
                    if not self.shared_list.data:
                        break  # The list is empty, the thread ends

                    self.row = self.shared_list.data.pop(0)

                with self.proxyLoader.lock:
                    self.proxy = self.proxyLoader.get_proxy(self.thread_id-1)

                self.init_proxy()
                self.start_scraping()

                if self.row_without_error:
                    with self.shared_list.lock_new_rows:
                        self.shared_list.new_rows.append(self.row)
                else:
                    with self.shared_list.lock_error_rows:
                        self.shared_list.error_rows.append(self.row)

            except Exception as e:
                print(e)
                # Only a row that was actually taken from the list is re-queued.
                if self.row is not None:
                    with self.shared_list.lock_error_rows:
                        self.shared_list.error_rows.append(self.row)

    def init_proxy(self):
        """Copy the fields of ``self.proxy`` into the ``PROXY_*`` attributes."""
        self.PROXY_HOST = self.proxy["PROXY_HOST"] # rotating proxy or host
        self.PROXY_PORT = self.proxy["PROXY_PORT"] # port
        self.PROXY_USER = self.proxy["PROXY_USER"] # username
        self.PROXY_PASS = self.proxy["PROXY_PASS"] # password

    def start_scraping(self):
        """Download the image of ``self.row`` and record the outcome.

        ``row_without_error`` is reset for every row, so a row without an
        image URL is never marked as failed because of the previous row.
        """
        self.row_without_error = True

        img_src = self.row.get('first_img_src')
        brand = self.row.get('brand_name')
        model = self.row.get('Model')
        part_name = self.row.get('products_name')
        code = self.row.get('manufacturer_part_number')

        if not img_src:
            # Nothing to download: the row is kept as it is.
            return

        # Download and store the image
        saved_img_path = self.save_img(img_src, brand, model, part_name, code)

        # Record the stored path in the 'products_img' entry
        if saved_img_path:
            self.row['products_img'] = saved_img_path
        else:
            self.row_without_error = False

    def save_img(self, img_src, brand, model, part_name, code):
        """Download ``img_src`` and store it under a formatted file name.

        Returns:
            The path of the written file, or ``None`` when anything failed
            (the error is printed). An HTTP error status counts as a failure
            so that an error page is never stored as an image.
        """
        try:
            # Create the brand folder if needed
            brand_folder = f'./images/{self.nom_valide(brand)}'
            os.makedirs(brand_folder, exist_ok=True)

            # Build the image file name
            new_img_src = f'{brand_folder}/{self.nom_valide(brand)}_{self.nom_valide(model)}_{self.nom_valide(part_name)}_{self.nom_valide(code)}.jpg'

            # Download the image
            if self.use_proxy:
                # Construct the proxy URL with authentication
                proxy = f"http://{self.PROXY_USER}:{self.PROXY_PASS}@{self.PROXY_HOST}:{self.PROXY_PORT}"

                # Set up the proxies dictionary
                proxies = {
                    "http": proxy,
                    "https": proxy,
                }
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'}
                response = requests.get(img_src, headers=headers, proxies=proxies, timeout=self.REQUEST_TIMEOUT)
            else:
                # A timeout keeps a stalled server from blocking the worker forever.
                response = requests.get(img_src, timeout=self.REQUEST_TIMEOUT)

            # Turn 4xx/5xx answers into an exception handled below.
            response.raise_for_status()
            img_data = response.content

            with WorkerThread.lock_save_image:
                with open(new_img_src, 'wb') as handler:
                    handler.write(img_data)

            # Return the path of the stored image
            return new_img_src
        except Exception as e:
            print(f"Erreur lors du téléchargement de l'image {img_src}: {e}")
            return None

    def nom_valide(self, chaine):
        """Clean ``chaine`` so it can be used inside a file name.

        Surrounding whitespace is removed, each run of whitespace becomes one
        ``_`` and every remaining character that is not a word character,
        ``.`` or ``-`` becomes ``_``. ``None`` is treated as an empty string.
        """
        # Imported here so the class works even when no other cell has
        # imported ``re`` yet.
        import re

        chaine = "" if chaine is None else str(chaine)
        chaine = chaine.strip()
        chaine = re.sub(r'\s+', '_', chaine)
        chaine = re.sub(r'[^\w.-]', '_', chaine)
        return chaine

In [None]:
import os
import csv
import re
import requests
from pathlib import Path
import time


data_folder='data'


file_path = r"./proxies/Webshare 100 proxies.txt"
proxy_loader = ProxyLoader(file_path)

# Number of worker threads started for every pass over a batch
num_threads = 100

# Number of CSV rows queued per batch
step = 1000


def run_worker_pass(shared_list, proxy_loader, num_threads):
    """Start ``num_threads`` workers on ``shared_list.data`` and wait for all of them."""
    threads = []
    for j in range(num_threads):
        thread = WorkerThread(j+1, shared_list, proxy_loader)
        thread.start()
        threads.append(thread)

    # Wait for every thread to finish
    for thread in threads:
        thread.join()


def print_progress(processed, shared_list):
    """Print the number of processed rows with the saved / error counters."""
    print("%"*100)
    print(f"processed rows {processed} | saved images : {len(shared_list.new_rows)} | error images : {len(shared_list.error_rows)}")
    print("%"*100)


# List the inputs up front: result files are written into the same folder, and
# files produced by an earlier run ("*_result.csv") must not be read as inputs.
csv_files = sorted(
    path for path in Path(data_folder).glob('*.csv')
    if not path.stem.endswith('_result')
)

for csv_file in csv_files:
    csv_filename = csv_file.stem  # File name without its extension

    # Shared rows, result lists and locks for this CSV file
    shared_list = SharedListBrandDic(csv_file)

    start = 0
    end = shared_list.nb_rows

    for i in range(start, end, step):

        shared_list.select_data(i,i+step)
        # The last batch may hold fewer than ``step`` rows.
        processed = min(i+step, shared_list.nb_rows)

        # First pass over the batch
        run_worker_pass(shared_list, proxy_loader, num_threads)
        print_progress(processed, shared_list)

        # Second pass: retry the rows that failed so far
        shared_list.data = shared_list.error_rows
        shared_list.error_rows = []

        run_worker_pass(shared_list, proxy_loader, num_threads)
        print_progress(processed, shared_list)

    if shared_list.error_rows:
        # These rows are not part of the result file; make that visible.
        print(f"{len(shared_list.error_rows)} ligne(s) en erreur non écrite(s) pour {csv_file}")

    # Write the processed rows to a new CSV file
    if shared_list.new_rows:
        # Union of the keys, in first-seen order: 'products_img' may only
        # appear on rows for which an image was saved.
        fieldnames = list(dict.fromkeys(key for row in shared_list.new_rows for key in row))
        with open(Path(data_folder) / f'{csv_filename}_result.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(shared_list.new_rows)
    else:
        print(f"Aucune ligne à écrire pour {csv_file}")

    print(f"Traitement terminé pour {csv_file}")

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
processed rows 1000 | saved images : 1000 | error images : 0
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
processed rows 1000 | saved images : 1000 | error images : 0
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
processed rows 2000 | saved images : 2000 | error images : 0
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
processed rows 2000 | saved images : 2000 | error images : 0
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [6]:
# Write the processed rows of the last CSV handled by the loop to
# "<stem>_result.csv" next to the input file (the suffix goes before the
# extension). Nothing is written when no row was saved.
if shared_list.new_rows:
    with open(csv_file.with_name(f'{csv_file.stem}_result.csv'), 'w', newline='', encoding='utf-8') as f:
        fieldnames = shared_list.new_rows[0].keys()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(shared_list.new_rows)
else:
    print(f"Aucune ligne à écrire pour {csv_file}")

In [7]:
# Inspect the first row stored in the result list (raises IndexError if the list is empty).
shared_list.new_rows[0]

{'trader_id': '17',
 'description': "2.0L L4 Turbocharged Fuel & Air ['AUDI', 'A3', '2015-2024', 'AUDI', 'Q3', '2020-2023', 'AUDI', 'S3', '2015-2024', 'AUDI', 'TT', '2016-2023', 'SEAT', 'LEON', '2017', 'VOLKSWAGEN', 'ARTEON', '2019-2023', 'VOLKSWAGEN', 'GOLF', '2015-2021', 'VOLKSWAGEN', 'GOLF R', '2022', 'VOLKSWAGEN', 'GTI', '2015-2022', 'VOLKSWAGEN', 'JETTA', '2019-2022', 'VOLKSWAGEN', 'TIGUAN', '2018-2024']",
 'manufacturer_part_number': 'LX3502',
 'brand_name': 'AUDI',
 'address': '[{"address":"Shuwaikh Industrial, Kuwait","location":"https://maps.app.goo.gl/N2tDDcLCBw2xHk2Q9"}]',
 'products_img': './images/AUDI/AUDI_A3_فلتر_الهواء_LX3502.jpg',
 'madeIn': '',
 'price': '12',
 'is_offer': '0',
 'offer_price': '0',
 'offer_start_date': '',
 'offer_end_date': '',
 'rating': '0',
 'review_count': '0',
 'type': 'Standard Replacement ',
 'is_best_seller': '0',
 'is_mobilawy': '0',
 'assembly_kit': '1',
 'product_line': '',
 'frontOrRear': '',
 'tyre_speed_rate': '',
 'maximum_tyre_load': 

In [1]:
import pandas as pd

# Load the ACURA CSV file to count how many rows have a value in 'first_img_src'.
df = pd.read_csv("./data/ACURA.csv")

# Method 1: use .notna() to flag the values that are not NaN, then sum the flags
count_non_empty_1 = df['first_img_src'].notna().sum()

print(f'Nombre de lignes non vides dans "first_img_src" with meth1: {count_non_empty_1}')

Nombre de lignes non vides dans "first_img_src" with meth1: 30174  with meth2: 31035


In [6]:
# Print the current working directory of the shell started by the notebook.
!pwd

/home/jovyan/work/get_images_from_colonne_first_img_src


In [11]:
# Give read, write and execute permission to every user on each entry directly under images/.
# NOTE(review): mode 777 is very permissive — confirm that a narrower mode is not sufficient.
!chmod 777 images/*

In [14]:
import pandas as pd

# Load the BMW result file, remove the 'first_img_src' column and order the
# rows by 'Year', from the highest value to the lowest.
df = (
    pd.read_csv("./data/BMW_result.csv")
    .drop(columns=['first_img_src'])
    .sort_values(by='Year', ascending=False)
)

# Store the reshaped table in a new CSV file, without the index column.
df.to_csv('./data/BMW_final_result.csv', index=False)

# Show the resulting DataFrame as the output of the cell.
df

Unnamed: 0,trader_id,description,manufacturer_part_number,brand_name,address,products_img,madeIn,price,is_offer,offer_price,...,products_name,warranty,disabled_at,updated_at,status,qtyInStock,deletedAt,Brand,Year,Model
0,63,3.0L L6 Turbocharged Body & Lamp Assembly ['BM...,BM1046159,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",,,23,0,0,...,حشوة الصدام,,,,Accepted,500,,BMW,2024,M440I
1443,63,"2.0L L4 Turbocharged Wheel ['BMW', '118I', '20...",610532,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_X2_دراسة_اللمبة_610532.jpg,,4,0,0,...,دراسة اللمبة,,,,Accepted,500,,BMW,2024,X2
1535,63,"3.0L L6 Turbocharged Suspension ['BMW', 'X3', ...",318349,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_X3_صدمة___دعامة_318349.jpg,,56,0,0,...,صدمة / دعامة,,,,Accepted,500,,BMW,2024,X3
1536,63,"3.0L L6 Turbocharged Interior ['BMW', 'X3', '2...",L1BM0920470,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_X3_حصيرة_الأرضية_L1BM0920470.jpg,,123,0,0,...,حصيرة الأرضية,,,,Accepted,500,,BMW,2024,X3
1537,63,"2.0L L4 Turbocharged Cooling System ['BMW', '2...",BM3014121,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_430I_خزان_سائل_التبريد_BM3014...,,51,0,0,...,خزان سائل التبريد,,,,Accepted,500,,BMW,2024,430I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130507,63,"2.5L L6 Body & Lamp Assembly ['BMW', '1 SERIES...",51247840617,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_Z4_أسطوانة_قفل_الصندوق_512478...,,20,0,0,...,أسطوانة قفل الصندوق,,,,Accepted,500,,BMW,2004,Z4
130506,63,"6.0L V12 Engine ['BMW', '550I', '2006-2010', '...",11427520214,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_760I_منظم_حرارة_الزيت_1142752...,,47,0,0,...,منظم حرارة الزيت,,,,Accepted,500,,BMW,2004,760I
130505,63,"2.2L L6 Body & Lamp Assembly ['BMW', '320I', '...",BM2510100,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_320I_باب_المصباح_الأمامي___ال...,,Out of Stock,0,0,...,باب المصباح الأمامي / الإطار,,,,Accepted,500,,BMW,2004,320I
130504,63,"2.2L L6 Body & Lamp Assembly ['BMW', '320I', '...",8D1946,BMW,"[{""address"":""Airport Rd, Al Shuwaikh 21540, Ku...",./images/BMW/BMW_320I_محرك_قفل_باب_الوقود_8D19...,,58,0,0,...,محرك قفل باب الوقود,,,,Accepted,500,,BMW,2004,320I


In [17]:
import os
import zipfile

def compress_folder_to_zip(folder_path, output_zip_path):
    """Compress every file found under ``folder_path`` into a ZIP archive.

    Files are stored with their path relative to ``folder_path``. When
    ``folder_path`` is not an existing directory a message is printed and no
    archive is created. The archive itself is skipped if it is located inside
    ``folder_path``.

    Args:
        folder_path: Directory whose files are added to the archive.
        output_zip_path: Path of the ZIP file to create (overwritten if present).
    """
    if not os.path.isdir(folder_path):
        print(f"Le dossier {folder_path} n'existe pas.")
        return

    # Resolved once so the archive can be recognised while walking the folder.
    archive_real_path = os.path.realpath(output_zip_path)

    # Create the ZIP file
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk the folder and add its files to the archive
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Path of the file on disk
                file_path = os.path.join(root, file)
                if os.path.realpath(file_path) == archive_real_path:
                    # Never add the archive that is being written to itself.
                    continue
                # Path relative to the base folder, used as the name inside the archive
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)

    print(f"Dossier compressé avec succès en {output_zip_path}")

# Example usage: compress the ./images/BMW folder into ./images/BMW.zip
folder_path = "./images/BMW"
output_zip_path = "./images/BMW.zip"

compress_folder_to_zip(folder_path, output_zip_path)


Dossier compressé avec succès en ./images/BMW.zip


In [16]:
import zipfile
import os

def decompress_zip(zip_path, extract_to):
    """Extract the ZIP archive ``zip_path`` into the directory ``extract_to``.

    A message is printed and nothing else happens when ``zip_path`` does not
    exist. Otherwise ``extract_to`` is created if it is missing, the whole
    archive is extracted into it and a confirmation message is printed.
    """
    if os.path.exists(zip_path):
        # Make sure the destination directory is there before extracting.
        destination_missing = not os.path.exists(extract_to)
        if destination_missing:
            os.makedirs(extract_to)

        # Read the archive and unpack all of its members.
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(extract_to)

        print(f"Fichier décompressé avec succès dans {extract_to}")
    else:
        # No archive at that path: report it and leave the file system untouched.
        print(f"Le fichier {zip_path} n'existe pas.")

# Example usage: extract ./images/AUDI.zip into the ./images/AUDIzip directory
zip_path = "./images/AUDI.zip"
extract_to = "./images/AUDIzip"

decompress_zip(zip_path, extract_to)


Fichier décompressé avec succès dans ./images/AUDIzip
