In [1]:
from bs4 import BeautifulSoup             #Beautiful Soup is a Python library for pulling data out of HTML
                                         #It provides Pythonic idioms for iterating, searching, and modifying the parse tree.
import glob
import re
import os
import pandas as pd

In [2]:
# Report the current working directory, i.e. the directory that relative
# paths used in this notebook are resolved against.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: C:\Users\Alaa_Abdallah\OneDrive\Desktop\Play_Ground2


In [3]:
# Load the raw HTML of one sample listing into memory (single file only).
with open('data/500118.txt', mode='r', encoding='utf-8') as sample_file:
    html_content = sample_file.read()

In [4]:
# Build a parse tree from the sample page using Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [5]:
# Preview the raw HTML of the sample page.
# Only the first PREVIEW_LINES lines are printed: dumping the whole page floods
# the notebook output and bloats the saved notebook file.
PREVIEW_LINES = 20
with open('data/500118.txt', 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file):
        if line_number >= PREVIEW_LINES:
            break
        # Lines already end with a newline, so suppress print's own.
        print(line, end='')

<!DOCTYPE html>
<html lang="ar">

<head>
  <title>كيا اوبتيما موديل سنة 2014 | شو بدك من فلسطين؟</title>
<meta name="description" content="السيارة بحالة ممتازة، جميع الاضافات وفتحة سقف بانوراما لون أسود. السعر 100 ألف شيكل، يشمل تأمين شامل 9 شهور وترخيص 9 شهور.">
<link rel="canonical" href="https://shobiddak.com/cars?q[car_model_id_eq]=20&amp;q[car_sub_model_id_eq]=72">
<meta property="og:site_name" content="شو بدك من فلسطين؟">
<meta property="og:title" content="كيا اوبتيما موديل سنة 2014">
<meta property="og:image" content="https://shobiddak.com/system/cars_logos/kia.jpg">
<meta property="og:url" content="https://shobiddak.com/cars/500118">
<meta property="og:type" content="website">

  

  <link rel="shobiddak شوبدك من فلسطين؟" type="image/x-icon" href="/favicon.ico" >
  <link rel="apple-touch-icon" href="/apple-touch-icon.png" />
  <link rel="stylesheet" type="text/css" href="https://fonts.googleapis.com/earlyaccess/droidarabickufi" />
  <link rel="stylesheet" href="https://cdnjs.cl

In [6]:
# Convert Arabic-Indic digits (٠-٩) to ASCII digits (0-9)
def arabic_to_english(text):
    """Return ``text`` with every Arabic-Indic digit replaced by its ASCII digit.

    Args:
        text: Value to convert. Only ``str`` values are processed.

    Returns:
        The converted string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # The ten Arabic-Indic digits occupy consecutive code points starting at '٠',
    # so the translation table can be derived from the digit value itself.
    digit_table = {ord('٠') + value: str(value) for value in range(10)}
    return text.translate(digit_table)

In [7]:
# Accumulators: one independent list per output column, filled one value per file.

# Listing identity and headline fields
file_names, car_names, model_years, prices = [], [], [], []

# Vehicle description fields
car_colors, fuel_types, car_origins, driving_licenses = [], [], [], []
lime_types, glass_types, motor_powers = [], [], []
car_speedometers, passenger_numbers = [], []

# Sale-related fields
payment_methods, shown_statuses, previous_owners, extras = [], [], [], []

In [8]:
# Directory (relative to the current working directory) holding the saved pages
folder_path = 'data'

In [9]:
# Collect every .txt file directly inside folder_path, in sorted path order
txt_pattern = os.path.join(folder_path, '*.txt')
file_paths = glob.glob(txt_pattern)
file_paths.sort()

In [10]:
# Parse every saved listing page and append one value per output column.

# Reset the accumulators first so that re-running this cell does not append a
# second copy of every row.
for column_values in (
    file_names, car_names, model_years, prices, car_colors, fuel_types,
    car_origins, driving_licenses, lime_types, glass_types, motor_powers,
    car_speedometers, passenger_numbers, payment_methods, shown_statuses,
    previous_owners, extras,
):
    column_values.clear()

# Loop through all files
for file_path in file_paths:
    # Only reading needs the file handle; parsing happens after it is closed.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')

    # Car name: text of the first <h3>. A page without one yields an empty
    # name instead of raising AttributeError.
    name_element = soup.h3
    car_name = name_element.text.strip() if name_element is not None else ''

    # Model year: first standalone 4-digit number in the first <h5>;
    # ' ' when the tag or the number is missing.
    year_element = soup.h5
    year_text = year_element.text if year_element is not None else ''
    numeric_year = re.search(r'\b\d{4}\b', year_text)
    model_year = numeric_year.group() if numeric_year else ' '

    # Price: leading child of <h5 class="post-price">, without the currency
    # word 'شيكل'. ' ' when the element is missing or empty.
    price = ' '
    price_element = soup.find('h5', class_='post-price')
    if price_element is not None and price_element.contents:
        leading_node = price_element.contents[0]
        # The first child can be a nested tag rather than a text node; only
        # text nodes are str instances, so fall back to the tag's text.
        if isinstance(leading_node, str):
            leading_text = leading_node
        else:
            leading_text = leading_node.get_text()
        price = leading_text.replace('شيكل', '').strip()

    # Label -> value pairs taken from the two-column rows of the 'list_ads' table
    data = {}
    data_table = soup.find('table', class_='list_ads')
    if data_table:
        rows = data_table.find_all('tr')
        for row in rows[1:]:  # Skip the first row (section_title)
            columns = row.find_all('td')
            if len(columns) == 2:
                data_value = columns[0].text.strip()
                info = columns[1].text.strip()
                # Remove '+1' from Passenger Number
                if data_value == 'عدد الركاب':
                    info = info.replace('+1', '')
                data[data_value] = info

    # Append exactly one value to every column list so they stay aligned
    file_names.append(os.path.basename(file_path))
    car_names.append(car_name)
    model_years.append(model_year)
    prices.append(price)

    car_colors.append(data.get('لون السيارة', ''))
    fuel_types.append(data.get('نوع الوقود', ''))
    car_origins.append(data.get('أصل السيارة', ''))
    driving_licenses.append(data.get('رخصة السيارة', ''))
    lime_types.append(data.get('نوع الجير', ''))
    glass_types.append(data.get('الزجاج', ''))
    motor_powers.append(data.get('قوة الماتور', ''))

    # Convert Arabic numerals to English in the speedometer, passenger-number
    # and previous-owner values
    car_speedometer = arabic_to_english(data.get('عداد السيارة', ''))
    passenger_number = arabic_to_english(data.get('عدد الركاب', ''))
    previous_owner = arabic_to_english(data.get('أصحاب سابقون', ''))

    car_speedometers.append(car_speedometer)
    passenger_numbers.append(passenger_number)
    previous_owners.append(previous_owner)

    # Drop the word 'فقط' from the payment method and listing status
    payment_methods.append(data.get('وسيلة الدفع', '').replace('فقط', ''))
    shown_statuses.append(data.get('معروضة', '').replace('فقط', ''))

    # Save 'فتحة سقف' in the "Extras" column if found
    extras.append('فتحة سقف' if 'فتحة سقف' in data.get('إضافات', '') else '')

In [11]:
# Pair each column label with its list of values; the order of the pairs
# determines the column order of the resulting DataFrame.
labelled_columns = [
    ('File Name', file_names),
    ('Car Name', car_names),
    ('Model Year', model_years),
    ('Price', prices),
    ('Car Color', car_colors),
    ('Fuel Type', fuel_types),
    ('Car Origin', car_origins),
    ('Driving License', driving_licenses),
    ('Lime Type', lime_types),
    ('Glass Type', glass_types),
    ('Motor Power', motor_powers),
    ('Car Speedometer', car_speedometers),
    ('Passenger Number', passenger_numbers),
    ('Payment Method', payment_methods),
    ('Shown Status', shown_statuses),
    ('Previous Owners', previous_owners),
    ('Extras', extras),
]
df = pd.DataFrame(dict(labelled_columns))

In [12]:
# Display the DataFrame (a bare last expression is rendered as the cell's output)
df

Unnamed: 0,File Name,Car Name,Model Year,Price,Car Color,Fuel Type,Car Origin,Driving License,Lime Type,Glass Type,Motor Power,Car Speedometer,Passenger Number,Payment Method,Shown Status,Previous Owners,Extras
0,500118.txt,كيا اوبتيما,2014,100000,أبيض عاجي,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,2000,75000,4,نقدا,للبيع,يد اولى,فتحة سقف
1,500125.txt,معرض السيارات,,,,,,,,,,,,,,,
2,500132.txt,معرض السيارات,,,,,,,,,,,,,,,
3,500163.txt,كيا سورينتو,2007,60000,سكني,ديزل,خصوصي,فلسطينية,نصف اوتوماتيك,الكتروني,2500,130000,7,إمكانية التقسيط,للبيع أو التبديل,2,فتحة سقف
4,500226.txt,هونداي افانتي,2006,43500,سكني,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1600,,,نقدا,للبيع,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7772,777932.txt,فورد ترانزيت,2002,48000,أبيض,ديزل,عمومي,فلسطينية,عادي,الكتروني,2400,00000,7,نقدا,للبيع,5,
7773,777937.txt,بيجو بارتنر,2018,87000,فضي,ديزل,خصوصي,فلسطينية,عادي,الكتروني,1600,50000,4,نقدا,للبيع,يد صفر,
7774,777960.txt,كيا سورينتو,2017,126000,بترولي,ديزل,خصوصي,فلسطينية,اوتوماتيك,الكتروني,2200,100,6,نقدا,للبيع,ثانيه,فتحة سقف
7775,777963.txt,معرض السيارات,,,,,,,,,,,,,,,


In [13]:
# Write the table to an Excel workbook in the working directory, without the row index
output_path = 'Data_file.xlsx'
df.to_excel(output_path, index=False)