<h1 style="text-align:center;"><b>Data Extraction</b></h1>

![Data Extraction](data_extraction.png) 


In [1]:
from pathlib import Path
import pandas as pd
def read_text_file(file_path: Path) -> str:
    """
    Load a text file as UTF-8 and hand back its content.

    Any exception raised while reading is reported on stdout and an empty
    string is returned instead, so callers never see the error.
    """
    content = ""
    try:
        content = file_path.read_text(encoding="utf-8")
    except Exception as err:
        # Best-effort read: report the problem and fall through to "".
        print(f"Error reading file {file_path}: {err}")
    return content

def read_directory_structure(base_dir: str, verbose: bool = True) -> dict:
    """
    Read every file located directly inside each subdirectory of ``base_dir``.

    Subdirectories, and the files inside them, are visited in sorted path
    order. ``Path.iterdir`` yields entries in arbitrary order, so sorting is
    what keeps the result reproducible between runs and machines.

    Args:
        base_dir: Directory whose immediate subdirectories are scanned.
            Files sitting directly in ``base_dir`` are ignored, and nested
            directories inside a subdirectory are not descended into.
        verbose: When True (the default), print one progress line per
            folder and per file. Pass False to keep the output quiet.

    Returns:
        A dictionary mapping each subdirectory name to the list of values
        returned by ``read_text_file`` for its files, in sorted file order.
        A subdirectory without files maps to an empty list.

    Raises:
        OSError: If ``base_dir`` cannot be listed (for example, it does not
            exist or is not a directory).
    """
    data = {}
    base_path = Path(base_dir)

    for subfolder in sorted(base_path.iterdir()):
        if not subfolder.is_dir():
            continue
        if verbose:
            print(f"Reading folder: {subfolder.name}")

        contents = []
        for file in sorted(subfolder.iterdir()):
            if not file.is_file():
                continue
            if verbose:
                print(f"Reading file: {file.name}")
            contents.append(read_text_file(file))

        data[subfolder.name] = contents

    return data


In [2]:
# Read the "documents" folder into a dict keyed by sub-folder name,
# each key holding the list of file texts found in that sub-folder.
directory_data = read_directory_structure("documents")


Reading folder: business
Reading file: business_1.txt
Reading file: business_10.txt
Reading file: business_100.txt
Reading file: business_11.txt
Reading file: business_12.txt
Reading file: business_13.txt
Reading file: business_14.txt
Reading file: business_15.txt
Reading file: business_16.txt
Reading file: business_17.txt
Reading file: business_18.txt
Reading file: business_19.txt
Reading file: business_2.txt
Reading file: business_20.txt
Reading file: business_21.txt
Reading file: business_22.txt
Reading file: business_23.txt
Reading file: business_24.txt
Reading file: business_25.txt
Reading file: business_26.txt
Reading file: business_27.txt
Reading file: business_28.txt
Reading file: business_29.txt
Reading file: business_3.txt
Reading file: business_30.txt
Reading file: business_31.txt
Reading file: business_32.txt
Reading file: business_33.txt
Reading file: business_34.txt
Reading file: business_35.txt
Reading file: business_36.txt
Reading file: business_37.txt
Reading file: bus

In [3]:


def convert_dict_to_dataframe(data_dict: dict) -> pd.DataFrame:
    """
    Flatten a ``{class_name: [text1, text2, ...]}`` dictionary into a
    pandas DataFrame with one row per text.

    Args:
        data_dict: Mapping from a class name to an iterable of texts that
            belong to that class.

    Returns:
        A DataFrame with the columns ``'class_name'`` and ``'text'``. Rows
        follow the dictionary's iteration order, then the order of each
        list. Empty input yields an empty DataFrame that still has both
        columns.
    """
    records = [
        {"class_name": class_name, "text": text}
        for class_name, texts in data_dict.items()
        for text in texts
    ]

    # Naming the columns explicitly keeps the schema stable when there are
    # no records; otherwise the frame would have no columns at all and
    # column access such as ``df.class_name`` would fail.
    return pd.DataFrame(records, columns=["class_name", "text"])


In [4]:
# Flatten the {sub-folder name: [texts]} dict into one row per document.
data= convert_dict_to_dataframe(directory_data)

In [5]:
# Number of documents per class, to see how the classes are balanced.
data.class_name.value_counts()

class_name
business         100
entertainment    100
food             100
graphics         100
historical       100
medical          100
politics         100
space            100
sport            100
technologie      100
Name: count, dtype: int64

In [6]:
# Save the labelled texts to data.csv; index=False leaves the row index out of the file.
data.to_csv("data.csv",index=False)