# ChatGPT'nin optimize ettigi path ve file yazan kod

Scrape yaptiktan sonra output'larin oldugu path'i en son output_folder seklindeydi, bunu ortak sabit bir yere almak gerek.

In [47]:
import re
import os

def extract_and_write_data(directory_path, output_file):
    pattern = r"\.\.\.Telekom.*?\s{2}"
    total_files = 0
    processed_files = 0
    unprocessed_files = []

    def process_file(filename):
        nonlocal processed_files
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source:
                content = source.read()
                match = re.search(pattern, content, re.DOTALL)
                if match:
                    processed_files += 1
                    matched_text = match.group(0)
                    cleaned_text = matched_text.strip(".").strip()
                    sections = re.split(r'\.\.\.\.+', cleaned_text)
                    sections = [section.strip() for section in sections if section.strip()]
                    if sections and sections[-1].endswith("..."):
                        sections[-1] = sections[-1].rstrip(".")
                    return filename, sections
                else:
                    return filename, None
        except Exception as e:
            return filename, str(e)

    with open(output_file, "w", encoding="utf-8") as output:
        for filename in os.listdir(directory_path):
            if filename.endswith(".txt"):
                total_files += 1
                result = process_file(filename)
                if isinstance(result[1], str):
                    unprocessed_files.append(f"{result[0]}: Error - {result[1]}")
                elif result[1] is None:
                    unprocessed_files.append(f"{result[0]}: No matching content found.")
                else:
                    output.write(f"File: {result[0]}\n")
                    for section in result[1]:
                        output.write(f"{section}\n")
                    output.write("\n" + "="*40 + "\n")

        output.write(f"\nTotal number of .txt files in the folder: {total_files}\n")
        output.write(f"Number of .txt files processed and written to data.txt: {processed_files}\n")
        if unprocessed_files:
            output.write("\nUnprocessed files:\n")
            for entry in unprocessed_files:
                output.write(f"{entry}\n")

    print("Process completed. The extracted sections from all text files have been written to data.txt.")
    print(f"Total number of .txt files in the folder: {total_files}")
    print(f"Number of .txt files processed and written to data.txt: {processed_files}")
    print(f"Unprocessed files and reasons have been written to data.txt.")

# Kullanım
directory_path = "/Users/taha/Desktop/scrapeV2/output_folder"
output_file = "data.txt"
extract_and_write_data(directory_path, output_file)


Process completed. The extracted sections from all text files have been written to data.txt.
Total number of .txt files in the folder: 2595
Number of .txt files processed and written to data.txt: 1676
Unprocessed files and reasons have been written to data.txt.


# data.txt ve data.csv - navigation soru ve cevap - ayiklanmadi ama

In [20]:
import re
import os
import csv

def extract_and_write_data(directory_path, output_file_txt, output_file_csv):
    # Regex pattern: Start with "...Telekom" and end with two spaces
    pattern = r"\.\.\.Telekom.*?\s{2}"
    
    # Initialize counters and lists
    total_files = 0
    processed_files = 0
    unprocessed_files = []
    processed_files_data = []

    def process_file(filename):
        nonlocal processed_files
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source:
                content = source.read()
                match = re.search(pattern, content, re.DOTALL)
                if match:
                    processed_files += 1
                    matched_text = match.group(0)
                    start_index = match.end()

                    # Clean and format the matched text
                    # Remove leading and trailing dots and split by dots
                    cleaned_text = re.sub(r'\.\.\.+', '\n', matched_text.strip(".").strip())
                    
                    # Extract text after the pattern
                    post_pattern_text = content[start_index:]
                    
                    # Regex pattern to find paragraphs separated by multiple newlines
                    paragraph_pattern = r'([^\n]+(?:\n[^\n]+)*)(?:\n{2,})'
                    
                    paragraphs = re.findall(paragraph_pattern, post_pattern_text)
                    
                    if len(paragraphs) >= 2:
                        # Extract the first and second paragraphs
                        question = paragraphs[0].strip()
                        answer = paragraphs[1].strip()
                        
                        # Check for excessive newlines after the second paragraph
                        if len(paragraphs) > 2:
                            # Find the number of newlines after the second paragraph
                            remaining_text = post_pattern_text[post_pattern_text.find(paragraphs[1]) + len(paragraphs[1]):]
                            newlines_after_second = re.match(r'\n{5,}', remaining_text)
                            if newlines_after_second:
                                processed_files_data.append((filename, cleaned_text, question, answer))
                        else:
                            processed_files_data.append((filename, cleaned_text, question, answer))
                else:
                    unprocessed_files.append(f"{filename}: No matching content found.")
        except Exception as e:
            unprocessed_files.append(f"{filename}: Error - {str(e)}")

    # Process all files
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            total_files += 1
            process_file(filename)

    # Sort processed and unprocessed files alphabetically
    processed_files_data.sort(key=lambda x: x[0])
    unprocessed_files.sort()

    # Write to both TXT and CSV files
    with open(output_file_txt, "w", encoding="utf-8") as output_txt, \
         open(output_file_csv, "w", newline='', encoding="utf-8") as output_csv:
        
        csv_writer = csv.writer(output_csv)
        csv_writer.writerow(["File", "Navigation", "Question", "Answer"])
        
        # Write processed files to TXT and CSV
        for filename, navigation, question, answer in processed_files_data:
            # Write to TXT file
            output_txt.write(f"File: {filename}\n")
            output_txt.write(f"Navigation:\n{navigation}\n\n")
            output_txt.write(f"Question:\n{question}\n\n")
            output_txt.write(f"Answer:\n{answer}\n")
            output_txt.write("\n" + "="*40 + "\n")
            
            # Write to CSV file
            csv_writer.writerow([filename, navigation.replace('\n', ' '), question.replace('\n', ' '), answer.replace('\n', ' ')])

        # Write summary information to TXT file
        output_txt.write(f"\nTotal number of .txt files in the folder: {total_files}\n")
        output_txt.write(f"Number of .txt files processed and written to data.txt: {processed_files}\n")
        
        # Write unprocessed files in alphabetical order
        if unprocessed_files:
            output_txt.write("\nUnprocessed files:\n")
            for entry in unprocessed_files:
                output_txt.write(f"{entry}\n")

    print("Process completed. The extracted sections from all text files have been written to data.txt and data.csv.")
    print(f"Total number of .txt files in the folder: {total_files}")
    print(f"Number of .txt files processed and written to data.txt: {processed_files}")
    print(f"Unprocessed files and reasons have been written to data.txt.")

# Kullanım
directory_path = "/Users/taha/Desktop/scrape_telekom_website/output_folder"
output_file_txt = "data.txt"
output_file_csv = "data.csv"
extract_and_write_data(directory_path, output_file_txt, output_file_csv)


Process completed. The extracted sections from all text files have been written to data.txt and data.csv.
Total number of .txt files in the folder: 2595
Number of .txt files processed and written to data.txt: 1676
Unprocessed files and reasons have been written to data.txt.


# 3 farkli dosyaya aliyorum, qa.txt...

In [28]:
import re
import os
import csv

def extract_and_write_data(directory_path, output_file_txt, output_file_csv):
    # Regex pattern: Start with "...Telekom" and end with two spaces
    pattern = r"\.\.\.Telekom.*?\s{2}"
    
    # Initialize counters and lists
    total_files = 0
    processed_files = 0
    unprocessed_files = []
    processed_files_data = []

    def process_file(filename):
        nonlocal processed_files
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as source:
                content = source.read()
                match = re.search(pattern, content, re.DOTALL)
                if match:
                    processed_files += 1
                    matched_text = match.group(0)
                    start_index = match.end()

                    # Clean and format the matched text
                    # Remove leading and trailing dots and split by dots
                    cleaned_text = re.sub(r'\.\.\.+', '\n', matched_text.strip(".").strip())
                    
                    # Extract text after the pattern
                    post_pattern_text = content[start_index:]
                    
                    # Regex pattern to find paragraphs separated by multiple newlines
                    paragraph_pattern = r'([^\n]+(?:\n[^\n]+)*)(?:\n{2,})'
                    
                    paragraphs = re.findall(paragraph_pattern, post_pattern_text)
                    
                    if len(paragraphs) >= 2:
                        # Extract the first and second paragraphs
                        question = paragraphs[0].strip()
                        answer = paragraphs[1].strip()
                        
                        # Check for excessive newlines after the second paragraph
                        if len(paragraphs) > 2:
                            # Find the number of newlines after the second paragraph
                            remaining_text = post_pattern_text[post_pattern_text.find(paragraphs[1]) + len(paragraphs[1]):]
                            newlines_after_second = re.match(r'\n{5,}', remaining_text)
                            if newlines_after_second:
                                processed_files_data.append((filename, cleaned_text, question, answer))
                        else:
                            processed_files_data.append((filename, cleaned_text, question, answer))
                else:
                    unprocessed_files.append(f"{filename}: No matching content found.")
        except Exception as e:
            unprocessed_files.append(f"{filename}: Error - {str(e)}")

    # Process all files
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            total_files += 1
            process_file(filename)

    # Sort processed and unprocessed files alphabetically
    processed_files_data.sort(key=lambda x: x[0])
    unprocessed_files.sort()

    # Write to both TXT and CSV files
    with open(output_file_txt, "w", encoding="utf-8") as output_txt, \
         open(output_file_csv, "w", newline='', encoding="utf-8") as output_csv:
        
        csv_writer = csv.writer(output_csv)
        csv_writer.writerow(["File", "Navigation", "Question", "Answer"])
        
        # Write processed files to TXT and CSV
        for filename, navigation, question, answer in processed_files_data:
            # Write to TXT file
            output_txt.write(f"File: {filename}\n")
            output_txt.write(f"Navigation:\n{navigation}\n\n")
            output_txt.write(f"Question: {question}\n\n")
            output_txt.write(f"Answer: {answer}\n")
            output_txt.write("\n" + "="*40 + "\n")
            
            # Write to CSV file
            csv_writer.writerow([filename, navigation.replace('\n', ' '), question.replace('\n', ' '), answer.replace('\n', ' ')])

        # Write summary information to TXT file
        output_txt.write(f"\nTotal number of .txt files in the folder: {total_files}\n")
        output_txt.write(f"Number of .txt files processed and written to data.txt: {processed_files}\n")
        
        # Write unprocessed files in alphabetical order
        if unprocessed_files:
            output_txt.write("\nUnprocessed files:\n")
            for entry in unprocessed_files:
                output_txt.write(f"{entry}\n")

    print("Process completed. The extracted sections from all text files have been written to data.txt and data.csv.")
    print(f"Total number of .txt files in the folder: {total_files}")
    print(f"Number of .txt files processed and written to data.txt: {processed_files}")
    print(f"Unprocessed files and reasons have been written to data.txt.")

# Kullanım
directory_path = "/Users/taha/Desktop/scrape_telekom_website/output_folder"
output_file_txt = "data.txt"
output_file_csv = "data.csv"
extract_and_write_data(directory_path, output_file_txt, output_file_csv)


Process completed. The extracted sections from all text files have been written to data.txt and data.csv.
Total number of .txt files in the folder: 2595
Number of .txt files processed and written to data.txt: 1676
Unprocessed files and reasons have been written to data.txt.


# 3 farkli txt dosyasina al

In [30]:
import re

def filter_data_files(data_txt_file, qa_file, uncategorised_file, unidentified_file):
    # Regular expressions for finding sections
    file_pattern = re.compile(r"File:\s*(.*?)\n")
    navigation_pattern = re.compile(r"Navigation:\n(.*?)\n\n", re.DOTALL)
    question_pattern = re.compile(r"Question:\s*(.*?)\n\n", re.DOTALL)
    answer_pattern = re.compile(r"Answer:\s*(.*?)\n", re.DOTALL)
    
    # Lists to store results
    qa_data = []
    uncategorised_data = []
    unidentified_data = []

    # Read and process the data.txt file
    with open(data_txt_file, "r", encoding="utf-8") as file:
        content = file.read()
        
        # Split content into blocks based on separator lines
        blocks = content.split("\n" + "="*40 + "\n")

        for block in blocks:
            # Match different sections in the block
            file_match = file_pattern.search(block)
            navigation_match = navigation_pattern.search(block)
            question_match = question_pattern.search(block)
            answer_match = answer_pattern.search(block)
            
            if file_match and navigation_match and question_match and answer_match:
                filename = file_match.group(1).strip()
                navigation = navigation_match.group(1).strip()
                question = question_match.group(1).strip()
                answer = answer_match.group(1).strip()
                
                # Check if the question ends with a question mark
                if question.endswith('?'):
                    qa_data.append(f"File: {filename}\nNavigation:\n{navigation}\n\nQuestion: {question}\n\nAnswer: {answer}\n\n")
                else:
                    uncategorised_data.append(f"File: {filename}\nNavigation:\n{navigation}\n\nQuestion: {question}\n\nAnswer: {answer}\n\n")
            else:
                # If any part of the block is missing, consider it unidentified
                unidentified_data.append(block.strip())
    
    # Write filtered data to respective files
    with open(qa_file, "w", encoding="utf-8") as file:
        file.writelines(qa_data)

    with open(uncategorised_file, "w", encoding="utf-8") as file:
        file.writelines(uncategorised_data)
    
    with open(unidentified_file, "w", encoding="utf-8") as file:
        file.writelines(unidentified_data)

    # Print the number of blocks processed as questions
    print(f"Number of blocks processed as questions: {len(qa_data)}")

    print("Filtering completed. Data has been written to qa.txt, uncategorised.txt, and unidentified.txt.")

# Kullanım
data_txt_file = "data.txt"
qa_file = "qa.txt"
uncategorised_file = "uncategorised.txt"
unidentified_file = "unidentified.txt"
filter_data_files(data_txt_file, qa_file, uncategorised_file, unidentified_file)


Number of blocks processed as questions: 1036
Filtering completed. Data has been written to qa.txt, uncategorised.txt, and unidentified.txt.
