In [1]:
################################################ main #####################################################
"""
The project is divided into two parts:

First Part: Extract all car models (ListeBrandDic) for each brand and each year from a target website. During this phase, 
            the variable is_for_extract_models is set to True.

Second Part: Launch multiple threads to work in parallel on the models extracted in the first part (ListeBrandDic). 
             In this phase, threads process the models in SharedListBrandDic. Each thread takes a model from SharedListBrandDic 
             and extracts information about car parts from the target website. The variable is_for_extract_models is set to False 
             to indicate that this part of the script is in progress.

At the beginning, I initialize the variable is_for_extract_models to True to start the first part. After completing the first part, 
I change the value of is_for_extract_models to False and run the script to begin the second part.
"""
###########################################################################################################

import threading
import time
import os
from SharedListBrandDic import SharedListBrandDic
from WorkerThread import WorkerThread
from ProxyLoader import ProxyLoader


# Mode switch for the whole script: True runs the first part (model
# extraction), False runs the second part (multi-threaded part extraction).
# It is flipped by hand between the two runs.
is_for_extract_models = True

if is_for_extract_models:  # First Part
    list_of_targeted_brands = ["BMW","JAGUAR","JEEP","KIA","LAND ROVER","LEXUS","MAZDA","MERCEDES-BENZ",
                               "MG","MINI","MITSUBISHI","NISSAN","OPEL","RENAULT","SUZUKI","TOYOTA","VOLKSWAGEN",
                               "VOLVO","MERCURY"
                                ]
    for brand in list_of_targeted_brands:
        # One worker per brand; join() waits for it to finish before moving on.
        thread = WorkerThread(1, start_brand=brand, end_brand=brand, is_for_extract_models=is_for_extract_models)
        thread.start()
        thread.join()
        # NOTE(review): this break stops the loop after the first brand in the
        # list, so only that brand is extracted per run. Presumably the list is
        # edited by hand between runs - confirm before removing the break.
        break

else:  # Second Part
    file_path = r"./proxies/Webshare 100 proxies.txt"
    proxy_loader = ProxyLoader(file_path)
    # Shared work list that is handed to every worker thread below.
    shared_list = SharedListBrandDic()
    # Number of worker threads started for each slice of the shared list.
    num_threads = 100

    start = 0
    end = shared_list.nb_BrandDic
    # The whole list is processed as a single slice. max(1, ...) keeps range()
    # from raising ValueError (step of zero) when the shared list is empty; in
    # that case the loop body simply never runs.
    step = max(1, shared_list.nb_BrandDic)

    for i in range(start, end, step):

        shared_list.select_data(i, i + step)

        threads = []
        for j in range(num_threads):
            thread = WorkerThread(j+1, shared_list=shared_list, proxyLoader=proxy_loader)
            thread.start()
            threads.append(thread)
            # Stagger thread start-up by three seconds.
            time.sleep(3)

        # Wait for every worker of this slice to finish.
        for thread in threads:
            thread.join()

        print("%" * 100)
        print("Brand_Dic finished : ",i+step)
        print("%" * 100)

g2004 clicked
select_en clicked
304
4
131
in PartInfoPro : BMW
ACURA
ALFA ROMEO
ASTON MARTIN
AUDI
AVANTI
BAIC
BENTLEY
BMW
**2024
****118I
****128TI
****220I
****228I
****230I
****320I
****330E
****330I
****430I
****530E
****530I
****540I
****740I
****750E
****760I
****840I
****ALPINA B8
****ALPINA XB7
****I4
****I5
****I7
****IX
****IX1
****IX3
****M135I
****M2
****M235I
****M240I
****M3
****M340I
****M4
****M440I
****M8
****M850I
****X1
****X2
****X3
****X4
****X5
****X6
****X7
****XM
****Z4
**2023
****118I
****128TI
****220I
****228I
****230I
****320I
****330E
****330I
****430I
****440I
****530E
****530I
****540I
****740I
****760I
****840I
****ALPINA B8
****ALPINA XB7
****I4
****I7
****IX
****M135I
****M2
****M235I
****M240I
****M3
****M340I
****M4
****M440I
****M5
****M550I XDRIVE
****M8
****M850I
****X1
****X2
****X3
****X4
****X5
****X6
****X7
****XM
****Z4
**2022
****118I
****128TI
****220I
****228I
****230I
****320I
****330E
****330I
****430I
****440I
****530E
****530I
****540I


In [1]:
# WARNING: destructive shell command - recursively deletes every entry under
# results/ whose name starts with "C". Nothing asks for confirmation.
!rm -r results/C*

In [2]:
# WARNING: destructive shell command - recursively deletes everything inside
# images/. Nothing asks for confirmation.
!rm -r images/*

In [None]:
# The 18 brand names kept in this list, one row of six per line.
brands_18 = [
    'ACURA', 'AUDI', 'BMW', 'CADILLAC', 'CHANGAN', 'CHEVROLET',
    'CHIREY', 'CHRYSLER', 'DODGE', 'FIAT', 'FORD', 'GENESIS',
    'GMC', 'HONDA', 'HYUNDAI', 'INFINITI', 'ISUZU', 'JAC',
]

# Data Processing

In [None]:
import pandas as pd
import glob
import os
import re

# Folder that contains the CSV result files.
dossier_results = 'results'

# Glob pattern for the ACURA result files, then the list of matching paths.
motif_csv = os.path.join(dossier_results, 'ACURA_*.csv')
fichiers_csv = glob.glob(motif_csv)

# Sort-key helper: pull the numeric suffix out of a result file name
def extraire_numero(fichier, prefixe='ACURA'):
    """Return the number embedded in a '<prefixe>_<number>.csv' file path.

    Args:
        fichier: Path (or bare name) of the CSV file to inspect.
        prefixe: Brand prefix expected in front of the number. Defaults to
            'ACURA', which is the only prefix the original code handled.

    Returns:
        The number as an int when the pattern is found anywhere in `fichier`,
        otherwise float('inf') so that non-matching paths sort last.
    """
    # re.escape keeps prefixes such as 'MERCEDES-BENZ' from being read as regex syntax.
    match = re.search(rf'{re.escape(prefixe)}_(\d+)\.csv', fichier)
    return int(match.group(1)) if match else float('inf')

# Sort the CSV paths in place, using extraire_numero as the sort key
# (i.e. by the number extracted from each file name).
fichiers_csv.sort(key=extraire_numero)
# Bare last expression: the notebook displays the sorted list.
fichiers_csv

In [None]:
# Read every CSV file into its own DataFrame, in the order of fichiers_csv.
list_dataframes = [pd.read_csv(chemin_csv) for chemin_csv in fichiers_csv]

# Stack all frames into a single DataFrame with a fresh, continuous index.
df_combined = pd.concat(list_dataframes, ignore_index=True)

# Write the combined data into the results folder, without the index column.
chemin_sortie = os.path.join(dossier_results, 'first_ACURA_result.csv')
df_combined.to_csv(chemin_sortie, index=False)

print("Les fichiers ont été combinés avec succès et sauvegardés sous 'first_ACURA_result.csv'.")


## Data Transforming and Filtering

In [None]:
import pandas as pd
import math

# Path of the combined CSV file to load.
chemin_fichier = 'results/first_ACURA_result.csv'

# Read the CSV into a DataFrame; index_col=False tells pandas not to use the
# first column as the index.
df = pd.read_csv(chemin_fichier, index_col=False)

# Bare last expression: the notebook displays the DataFrame.
df

In [None]:
# Distinct values present in the 'Year' column.
set(df['Year'])

In [None]:
# Occurrence counts of the raw 'price' values; only the first 15 entries are shown.
df["price"].value_counts()[:15]

In [None]:
# Conversion rate from US Dollar to Kuwaiti Dinar (example: 1 USD = 0.31 KWD)
conversion_rate = 0.31

# Transformation function
def transform_price(value):
    """Turn a scraped price string into a whole-number converted amount.

    The first number found in the text is doubled, multiplied by
    `conversion_rate` and rounded UP to the next integer (math.ceil).

    Args:
        value: One cell of the 'price' column. Expected to be text such as
            "$12.50" or "Out of Stock"; missing cells (None / NaN) are accepted.

    Returns:
        An int when a number could be extracted. Otherwise `value` itself,
        unchanged (text without a number, None, or NaN).

    Raises:
        TypeError: If `value` is neither text nor a missing value. The original
            implementation also raised TypeError for such input.
    """
    # Missing cells come through as None or float NaN; pass them on untouched
    # instead of crashing while iterating over a non-string.
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return value
    if not isinstance(value, str):
        raise TypeError(f"transform_price expects text, got {type(value).__name__}")

    # Remove thousands separators so "1,234.56" is read as 1234.56, not as 1.
    normalized = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", value)

    # Take the first number in the text. No match means there is no price.
    match = re.search(r"[-+]?\d*\.\d+|\d+", normalized)
    if match is None:
        return value

    amount = float(match.group())
    # Multiply by 2
    amount *= 2
    # Convert to Kuwaiti Dinar
    amount *= conversion_rate
    # Round up to the next integer. round() first strips binary floating-point
    # noise so an exact whole amount is not pushed up by one.
    return math.ceil(round(amount, 9))

In [None]:
# Back up the raw scraped price in 'first_price', but only the first time this
# cell runs: a re-run must not overwrite the raw values with converted ones.
# NOTE(review): assumes the loaded CSV has no 'first_price' column of its own - confirm.
if 'first_price' not in df.columns:
    df['first_price'] = df['price']
# Always convert from the raw backup so re-running the cell gives the same result.
df['price'] = df['first_price'].apply(transform_price)

df

In [None]:
# List the column names of df.
print(df.columns) 

In [None]:
# Occurrence counts of the raw 'frontOrRear' values; only the first 50 entries are shown.
df["frontOrRear"].value_counts()[:50]

In [None]:
# Reduce a free-text position description to one canonical label
def clean_front_or_rear(value):
    """Return the first canonical position label found in `value`.

    `value` is converted with str() and searched case-insensitively for whole
    words. Candidates are tried in a fixed order - the two-word labels first,
    then 'Front' / 'Rear', then 'Left' / 'Right' - and the first hit wins.

    Returns:
        One of 'Front Right', 'Front Left', 'Rear Right', 'Rear Left', 'Front',
        'Rear', 'Left', 'Right', or None when nothing matches.
    """
    # (label, pattern) pairs; the order is the matching priority.
    ordered_patterns = (
        ('Front Right', r'\bFront Right\b'),
        ('Front Left', r'\bFront Left\b'),
        ('Rear Right', r'\bRear Right\b'),
        ('Rear Left', r'\bRear Left\b'),
        ('Front', r'\bFront\b'),
        ('Rear', r'\bRear\b'),
        ('Left', r'\bLeft\b'),
        ('Right', r'\bRight\b'),
    )
    text = str(value)
    hits = (
        label
        for label, pattern in ordered_patterns
        if re.search(pattern, text, re.IGNORECASE)
    )
    # The generator is lazy, so the search stops at the first matching label.
    return next(hits, None)

In [None]:
# Back up the raw text in 'first_frontOrRear', but only the first time this
# cell runs: a re-run must not replace the raw values with the cleaned labels.
# NOTE(review): assumes the loaded CSV has no 'first_frontOrRear' column of its own - confirm.
if 'first_frontOrRear' not in df.columns:
    df['first_frontOrRear'] = df['frontOrRear']
# Always clean from the raw backup so re-running the cell gives the same result.
df['frontOrRear'] = df['first_frontOrRear'].apply(clean_front_or_rear)

In [None]:
# Show the cleaned label next to the original text for the first 50 rows.
df[['frontOrRear','first_frontOrRear']][:50]

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

In [None]:
# Check whether a value is 'Out of Stock' or can be converted to an integer
def is_valid_price(value):
    """Tell whether a 'price' cell should be kept.

    Args:
        value: One cell of the 'price' column (any type).

    Returns:
        True when str(value) equals 'Out of Stock' or when int(value) succeeds;
        False otherwise (for example free text, None, NaN or infinity).
        Note that int() truncates, so a float such as 3.7 also counts as valid.
    """
    if str(value) == 'Out of Stock':
        return True
    try:
        # Only the success of the conversion matters; the result is not used.
        int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: text that
        # is not an integer, and NaN; OverflowError: infinity.
        return False
    return True

# Row mask: the result of is_valid_price for each 'price' cell.
price_ok = df['price'].apply(is_valid_price)
# Keep only the rows selected by the mask.
df_filtered = df[price_ok]

df_filtered.shape

In [None]:
# Preview the first rows of the filtered data.
df_filtered.head()

In [None]:
# Number of remaining rows per value of 'Year'.
df_filtered['Year'].value_counts()

In [None]:
# (rows, columns) of the filtered data.
df_filtered.shape

In [None]:
# Columns removed from the filtered data before it is written out.
columns_to_drop = ['first_img_src', 'first_price', 'first_frontOrRear']
df_filtered = df_filtered.drop(columns_to_drop, axis=1)

In [None]:
# (rows, columns) of the filtered data at this point in the notebook.
df_filtered.shape

In [None]:
# Destination of the transformed data.
chemin_export = "./results/new_results/ACURA_with_transformed_data.csv"
# to_csv does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(chemin_export), exist_ok=True)
# Write the filtered data without the index column.
df_filtered.to_csv(chemin_export, index=False)

In [None]:
import pandas as pd
# Reload the exported file from disk. Note that this rebinds df to the
# transformed data, replacing the frame used in the cells above.
df = pd.read_csv("./results/new_results/ACURA_with_transformed_data.csv")

In [None]:
# Total number of entries in the 'manufacturer_part_number' column.
len(df["manufacturer_part_number"])

In [None]:
# Number of distinct entries in the 'manufacturer_part_number' column.
len(set(df["manufacturer_part_number"]))