## Create dataset with web scraping 

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import re

* Certain topics in the forum have a very large number of comments, and the dataset should not be dominated or overwhelmed by comments from a single source. `max_comment_pages` caps the number of comment pages read per topic for this reason.
*  The website has two distinct structures: one listing reviews for all brands in general, and another listing reviews for a specific brand. Train and test data are collected from the all-brands structure; analysis data is collected from the brand-specific structure. The same comment can appear in both structures, so a time limit is used to keep the two sets apart: analysis data consists of comments made in the last 6 months, while training data consists of comments made more than 6 months ago. `dataset_start_page` refers to the page containing comments made about 6 months ago.

In [19]:
# Forum index listing topics for all brands; also the page extract_brands reads brand links from.
base_url = "https://mini.donanimhaber.com/markalar--f338"
# First index page requested when building the train/test dataset.
dataset_start_page = 140
# Last index page requested by generate_page_url (inclusive).
total_pages = 2700
# Progress counter printed and incremented by extract_links / extract_links_to_time.
page_number = 1
# CSV file name passed to create_dataset (save_to_csv places list data under "Dataset/").
file = "dataset.csv"
# create_dataset stops after an index page once comment_number exceeds this.
max_comments = 300000   
# Upper bound on comment pages generated per topic by generate_comment_page_url.
max_comment_pages = 200  
# Running count of kept comments (incremented by the extract_comments* functions).
# NOTE(review): the non-zero start presumably resumes an earlier run — confirm.
comment_number = 34395

Request headers (a browser-like User-Agent and the preferred languages) sent with every request

In [3]:
# Browser-like request headers sent with every page fetch.
# The User-Agent is built with implicit string concatenation: a backslash
# continuation *inside* the quotes would embed the next line's indentation
# (a run of spaces) in the header value.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.212 Safari/537.36'
    ),
    'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7'
}

Scrape the data

In [4]:
def get_data(url, timeout=30):
    """Fetch ``url`` with the shared HEADERS and return the response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole scrape indefinitely.

    Returns
    -------
    str or None
        The response text on HTTP 200; ``None`` (after printing the reason)
        on any other status code or on a ``requests`` exception.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print("An error occurred:", e)
        return None

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        return None
    return response.text

Parse the data

In [5]:
def html_code(url, parser='html.parser'):
    """Download ``url`` and parse it into a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Page to download via ``get_data``.
    parser : str, optional
        Name of the BeautifulSoup parser to use.

    Returns
    -------
    BeautifulSoup
        The parsed page. When the download failed (``get_data`` returned
        ``None``) an empty document is returned, so callers' ``find_all``
        lookups simply yield nothing instead of raising.
    """
    htmldata = get_data(url)
    if htmldata is None:
        # get_data has already printed the failure; parse an empty page.
        htmldata = ""
    # Honour the caller's parser choice instead of a hard-coded one.
    return BeautifulSoup(htmldata, parser)

Extract the title links from current page
* The site's homepage contains links with comments. First, the links are parsed from the homepage.

In [6]:
 def extract_links(current_page_url):
     """Collect the topic hrefs found on one forum index page.

     Every ``<dl>`` element on the page is inspected and the ``href`` of its
     first ``<a>`` is kept when present. Also prints a progress line and
     increments the global ``page_number`` counter.

     Parameters
     ----------
     current_page_url : str
         URL of the index page to read.

     Returns
     -------
     list of str
         The hrefs as they appear in the page.
     """
     global page_number
     links = []
     soup = html_code(current_page_url)
     for topic in soup.find_all(name="dl"):
         anchor = topic.find('a')
         # A <dl> without any <a> yields None; skip it instead of failing on .get().
         link = anchor.get("href") if anchor is not None else None
         if link:
             links.append(link)
     print(f"page{page_number} completed")
     page_number += 1
     return links

The parsed links are further processed to access the comment pages.

In [7]:
def clean_links(links):
    """Turn site-relative hrefs into absolute URLs.

    Only hrefs that start with ``/`` are kept; anything else is dropped.
    The href already carries its leading slash, so it is appended directly
    to the site root (adding another ``/`` would produce ``//`` in the path).

    Parameters
    ----------
    links : iterable of str
        Hrefs as extracted from a page.

    Returns
    -------
    list of str
        Absolute URLs on mini.donanimhaber.com.
    """
    site_root = 'https://mini.donanimhaber.com'
    return [f'{site_root}{link}' for link in links if link.startswith('/')]

The website has a multi-page structure. Generate the URL of every index page from that structure.

In [8]:
def generate_page_url(base_url, dataset_start_page, total_pages):
    """Build the URLs of the index pages to visit.

    Returns one ``{base_url}?sayfa={n}`` string for every ``n`` from
    ``dataset_start_page`` through ``total_pages`` (both inclusive), in
    ascending order.
    """
    return [
        f'{base_url}?sayfa={page}'
        for page in range(dataset_start_page, total_pages + 1)
    ]

Comment pages also have a multi-page structure.

In [9]:
def generate_comment_page_url(current_comment_page_url):
    """List the URLs of a topic's comment pages.

    The page count is read from the ``value`` of the last ``<option>``
    element on the topic page (1 when there is none) and is capped at the
    module-level ``max_comment_pages``. Page ``i`` is addressed as
    ``{current_comment_page_url}-{i}``.
    """
    soup = html_code(current_comment_page_url)
    options = soup.find_all("option")

    last_page = int(options[-1]["value"]) if options else 1
    # Never walk more pages of a single topic than the configured cap.
    last_page = min(last_page, max_comment_pages)

    return [f'{current_comment_page_url}-{n}' for n in range(1, last_page + 1)]

Extract the reviews to train and test model

In [10]:
 def extract_comments(link):
     """Return the usable comment texts found on one comment page.

     Each element with class ``comcom`` contributes the text of its first
     ``<td>``. Empty texts and texts containing a question mark are skipped.
     The global ``comment_number`` is incremented once per kept comment.

     Parameters
     ----------
     link : str
         Absolute URL of the comment page.

     Returns
     -------
     list of str
     """
     global comment_number
     soup = html_code(link)
     comments = []
     for comment_element in soup.find_all(class_="comcom"):
         cell = comment_element.find("td")
         if cell is None:
             # No <td> inside this element: nothing to read.
             continue
         comment = cell.getText()
         if comment and not has_question_mark(comment):
             comments.append(comment)
             comment_number += 1
     return comments

Extract the reviews and their dates for analysis

In [11]:
def extract_comments_to_time(link):
    """Return recent comments on a page together with their date strings.

    Comments (first ``<td>`` of each ``comcom`` element) are paired by
    position with the texts of the ``date`` elements. A comment is kept when
    it is non-empty, contains no question mark and ``is_time_passed`` reports
    its date as still inside the time window. The global ``comment_number``
    is incremented once per kept comment.

    Parameters
    ----------
    link : str
        Absolute URL of the comment page.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(comments, times)`` of equal length; newlines are removed from the
        date strings.
    """
    global comment_number
    soup = html_code(link)
    comments = []
    times = []
    comment_elements = soup.find_all(class_="comcom")
    dates = [date.get_text() for date in soup.find_all(class_="date")]
    # zip pairs by position and stops at the shorter list, so a page with
    # fewer dates than comments no longer raises IndexError.
    for comment_element, date_text in zip(comment_elements, dates):
        cell = comment_element.find("td")
        if cell is None:
            continue
        comment = cell.getText()
        if not comment or has_question_mark(comment):
            continue
        if is_time_passed(date_text):
            continue
        comments.append(comment)
        times.append(date_text.replace("\n", ""))
        comment_number += 1
    return comments, times

We won't add reviews that contain questions to the dataset, because they most likely don't express a sentiment.

In [12]:
def has_question_mark(text):
    """Tell whether ``text`` contains at least one ``?`` character."""
    question_mark = "?"
    return question_mark in text

In [13]:
def clean_comments(comments):
    """Return a new list with every ``\\r\\n`` sequence removed from each string."""
    cleaned_comments = []
    for text in comments:
        cleaned_comments.append(text.replace('\r\n', ''))
    return cleaned_comments

The analysis dataset and the training dataset are saved in different formats, since the sentiment analysis will be done over time and therefore needs the comment dates.

In [14]:
def save_to_csv(data, file):
    """Append scraped rows to a CSV file, creating it on first use.

    Parameters
    ----------
    data : list or tuple
        * ``list`` of review strings -> one ``review`` column, written under
          the ``Dataset`` folder.
        * ``tuple`` of ``(review, date)`` pairs -> ``review`` and ``date``
          columns, written under the ``Brands`` folder.
    file : str
        File name inside the chosen folder.

    Raises
    ------
    ValueError
        If ``data`` is neither a list nor a tuple.
    """
    if isinstance(data, list):
        df = pd.DataFrame({'review': data})
        folder = "Dataset"
    elif isinstance(data, tuple):
        df = pd.DataFrame(data, columns=['review', 'date'])
        folder = "Brands"
    else:
        raise ValueError("Invalid data format")

    filepath = os.path.join(folder, file)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # Write the header only when the file is being created; repeating it on
    # every append would scatter header rows through the data.
    write_header = not os.path.isfile(filepath)
    df.to_csv(filepath, mode='a', header=write_header, index=False, encoding='utf-8')
    

In [15]:
# NOTE(review): in the visible notebook this is the only import of Path, which
# save_to_csv (defined in an earlier cell) uses when it is called.
from pathlib import Path 

# Scratch check of the "create the parent folder if missing" idiom.
filepath = Path('dataset/out.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
# df.to_csv(filepath)  


# Create Datasets

Collect reviews for ML model training and testing. Only the comment text is collected: dates are not needed for this dataset.
There are no labels in the reviews on the site.

In [16]:
def create_dataset(base_url, dataset_start_page, total_pages ,file):
    """Scrape comments from the all-brands index into the training CSV.

    Walks the index pages ``dataset_start_page``..``total_pages``, follows
    every topic link, reads each of the topic's comment pages and appends the
    cleaned comments to ``file`` via ``save_to_csv``. After each index page
    the running global ``comment_number`` is printed, and the walk stops once
    it exceeds the global ``max_comments``.
    """
    for listing_url in generate_page_url(base_url, dataset_start_page, total_pages):
        topic_urls = clean_links(extract_links(listing_url))
        for topic_url in topic_urls:
            # NOTE(review): clean_comments is applied to URLs here, as in the
            # original; it only strips "\r\n" sequences.
            thread_pages = clean_comments(generate_comment_page_url(topic_url))
            for thread_page in thread_pages:
                page_comments = clean_comments(extract_comments(thread_page))
                save_to_csv(page_comments, file)
        print(comment_number)
        if comment_number > max_comments:
            print("Dataset completed!")
            break

Collect reviews of car brands and their dates

In [17]:
def scrape_analyse_data(base_url):
    """Scrape recent, dated comments per brand into ``<brand>.csv`` files.

    For every brand returned by ``extract_brands``, the brand's index pages
    are read in order until one yields no links from ``extract_links_to_time``.
    For each topic, the comment pages are visited from last to first, and the
    walk over that topic stops at the first page that yields no comments.
    Each page's ``(comment, date)`` pairs are appended via ``save_to_csv``.
    """
    brands = extract_brands(base_url)

    for brand, brand_url in brands.items():
        for listing_url in generate_page_url_to_brand(brand_url):
            recent_links = extract_links_to_time(listing_url)
            if not recent_links:
                # No qualifying topics on this index page: done with the brand.
                break

            for topic_url in clean_links(recent_links):
                # NOTE(review): clean_comments is applied to URLs here, as in
                # the original; it only strips "\r\n" sequences.
                thread_pages = clean_comments(generate_comment_page_url(topic_url))
                for thread_page in reversed(thread_pages):
                    comments, dates = extract_comments_to_time(thread_page)
                    if not comments:
                        break
                    rows = tuple(zip(clean_comments(comments), dates))
                    save_to_csv(rows, f'{brand}.csv')
        print(brand,"dataset completed")

In [18]:
# Reset the global progress counter used in the "page{N} completed" messages.
page_number = 1

In [29]:
# Build the per-brand analysis CSVs (review + date) from the brand sub-forums.
scrape_analyse_data(base_url)

page2 completed
page3 completed
page4 completed
page5 completed
page6 completed
page7 completed
page8 completed
Renault dataset completed
page9 completed
page10 completed
page11 completed
page12 completed
page13 completed
Opel dataset completed
page14 completed
page15 completed
page16 completed
page17 completed
page18 completed
Fiat dataset completed
page19 completed
page20 completed
page21 completed
page22 completed
Honda dataset completed
page23 completed
page24 completed
page25 completed
page26 completed
page27 completed
Toyota dataset completed
page28 completed
page29 completed
page30 completed
page31 completed
page32 completed
page33 completed
Citroen dataset completed
page34 completed
page35 completed
page36 completed
page37 completed
page38 completed
page39 completed
page40 completed
page41 completed
Vw dataset completed
page42 completed
page43 completed
page44 completed
page45 completed
page46 completed
page47 completed
Peugeot dataset completed
page48 completed
page49 complete

In [None]:
# Build the train/test CSV from the all-brands index, using the configuration constants.
create_dataset(base_url, dataset_start_page, total_pages ,file)

53 pages + 87

Extract brand names and brand-specific forum links from the site

In [21]:
def extract_brands(base_url):
    """Map brand names to their absolute forum URLs.

    Reads ``base_url`` and, for every ``<span class="subcat4">`` that contains
    an ``<a>``, stores the link text as key and the link's ``href`` prefixed
    with the site root as value.

    Returns
    -------
    dict of str to str
    """
    soup = html_code(base_url)
    brands = {}
    for span in soup.find_all('span', class_='subcat4'):
        anchor = span.find('a')
        if not anchor:
            continue
        name = anchor.get_text()
        href = anchor['href']
        brands[name] = f'https://mini.donanimhaber.com{href}'
    return brands

In [22]:
# Fetch the brand -> forum URL mapping once for inspection (same address as base_url).
brands =extract_brands("https://mini.donanimhaber.com/markalar--f338")

In [23]:
# Show every brand name followed by its forum URL, one per line.
for brand in brands:
    print(brand, brands[brand], sep="\n")

Renault
https://mini.donanimhaber.com/renault--f339
Opel
https://mini.donanimhaber.com/opel--f340
Fiat
https://mini.donanimhaber.com/fiat--f341
Honda
https://mini.donanimhaber.com/honda--f342
Toyota
https://mini.donanimhaber.com/toyota--f343
Citroen
https://mini.donanimhaber.com/citroen--f344
Vw
https://mini.donanimhaber.com/vw--f345
Peugeot
https://mini.donanimhaber.com/peugeot--f346
Hyundai
https://mini.donanimhaber.com/hyundai--f347
Ford
https://mini.donanimhaber.com/ford--f348
Bmw
https://mini.donanimhaber.com/bmw--f438
Mercedes
https://mini.donanimhaber.com/mercedes--f439
Audi
https://mini.donanimhaber.com/audi--f440
Mazda
https://mini.donanimhaber.com/mazda--f441
Kia
https://mini.donanimhaber.com/kia--f442
Volvo
https://mini.donanimhaber.com/volvo--f496
Mitsubishi
https://mini.donanimhaber.com/mitsubishi--f537
Saab
https://mini.donanimhaber.com/saab--f576
Dacia
https://mini.donanimhaber.com/dacia--f577
Skoda
https://mini.donanimhaber.com/skoda--f578
Nissan
https://mini.donanimhab

In [24]:
def generate_page_url_to_brand(brand_link):
    """List the index-page URLs of one brand forum.

    The page count is taken from the ``value`` of the last ``<option>``
    element on the brand page (1 when there is none); page ``i`` is addressed
    as ``{brand_link}?sayfa={i}``.
    """
    soup = html_code(brand_link)
    options = soup.find_all("option")

    last_page = int(options[-1]["value"]) if options else 1

    return [f'{brand_link}?sayfa={n}' for n in range(1, last_page + 1)]
    

In [25]:
# Spot-check the generated page URLs on one brand forum.
generate_page_url_to_brand('https://mini.donanimhaber.com/alfa-romeo--f582')

['https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=1',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=2',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=3',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=4',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=5',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=6',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=7',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=8',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=9',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=10',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=11',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=12',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=13',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=14',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=15',
 'https://mini.donanimhaber.com/alfa-romeo--f582?sayfa=16',
 'https://mini.donanimhaber.com/alfa-romeo--f582?

Comments from the last 11 months are used for analysis.

In [26]:
def extract_links_to_time(current_page_url):
    """Collect hrefs of topics on an index page that are still recent.

    Topic anchors (``<a style="line-height: 16px;">``) are paired by position
    with the time spans (``<span style="margin-bottom: 5px;">``). The text of
    the span's ``<a>`` is split at the first comma and the part after it is
    passed to ``is_time_passed``; the topic's href is kept when that returns
    ``False``. Entries whose time text has no comma are skipped. Also prints
    a progress line and increments the global ``page_number``.

    Parameters
    ----------
    current_page_url : str
        URL of the brand index page.

    Returns
    -------
    list of str
        Hrefs as they appear in the page.
    """
    global page_number
    links = []
    soup = html_code(current_page_url)
    topics = soup.find_all('a', style="line-height: 16px;")
    times = soup.find_all('span', style='margin-bottom: 5px;')
    # Pair by position with zip: list.index() is an equality search, so it is
    # quadratic and would resolve two identical anchors to the same slot.
    for topic, time_span in zip(topics, times):
        time_anchor = time_span.find("a")
        if time_anchor is None:
            continue
        time_text = time_anchor.getText()
        if "," not in time_text:
            continue
        link = topic.get("href")
        # Skip anchors without an href so later string handling never sees None.
        if link and not is_time_passed(time_text.split(",")[1]):
            links.append(link)
    print(f"page{page_number} completed")
    page_number += 1
    return links

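Check whether a relative timestamp is older than the analysis window. Note: the text above mentions 6 months, but the code below uses an 11-month cutoff.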

In [27]:
def is_time_passed(time, month_limit=11):
    """Decide whether a relative Turkish timestamp is too old.

    Only the first two whitespace-separated words of ``time`` are examined:
    a count and a unit, e.g. ``"3 ay ..."``.

    Parameters
    ----------
    time : str
        Relative timestamp text.
    month_limit : int, optional
        Number of months ("ay") at or above which the timestamp counts as
        passed. Defaults to 11.

    Returns
    -------
    bool
        ``True`` when the unit is "yıl" (year), or when the unit is "ay"
        (month) and the count is at least ``month_limit``. ``False``
        otherwise, including for "geçen ..." (last ...) with a non-year unit,
        for texts with fewer than two words, and for a month count that is
        not a number.
    """
    parts = time.split()
    if len(parts) < 2:
        return False

    num, unit = parts[0], parts[1]
    if unit == "yıl":
        return True
    if num == "geçen":
        return False
    if unit == "ay":
        try:
            return int(num) >= month_limit
        except ValueError:
            # A non-numeric count cannot be compared; treat it as recent
            # instead of aborting the scrape.
            return False
    return False

In [28]:
# Spot-check on the first Renault index page; the call also prints a progress
# line and increments the global page_number counter.
extract_links_to_time("https://mini.donanimhaber.com/renault--f339?sayfa=1")

page1 completed


['/u-y-a-r-i--128253373',
 '/renault-kardian-suv-un-tasarimina-iliskin-yeni-gorseller-paylasildi--157005303',
 '/2020-renault-clio-5-ana-konu--136922052',
 '/captur-2-2020--143329326',
 '/2016-renault-talisman-ana-konu--107705078',
 '/renault-larin-teyp-kodu-nasil-girilir--5913162',
 '/motor-takozu-degisti-rolantide-sarsinti-artti-bir-bilen--18387808',
 '/yeni-renault-kadjar-ana-konu--102295469',
 '/yeni-megane-e-tech-yuzde-100-elektrikli-van-da-duzenlenen-etkinlikle-tanitildi--156893544',
 '/yeni-renault-clio-turkiye-de-iste-fiyati-ve-ozellikleri--156892736',
 '/elektrikli-renault-zoe-ana-konu--155319687',
 '/laguna-ii-grubu-gelin-konusalim-tartisalim-paylasalim--35550700',
 '/renault-austral-e-tech-full-hybrid-turkiye-de-iste-fiyati-ve-ozellikleri--156559932',
 '/0-9-tce-symbol-benzinle-calismiyor--156267020',
 '/renault-latitude-kulubu--48906332',
 '/renault-fluence-1-5dci-ecu-arizasi--156828671']