In [None]:
from blob import blob_analyse_text
from transformer import Transformer
from lexicon import lexicon_analyse_text
import os
from PyPDF2 import PdfReader
import numpy as np
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

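## Validate the PDFs

Try to open every PDF in the dataset folder and delete any file that cannot be read, so that later processing does not fail on a corrupt document.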

In [None]:
import os
from PyPDF2 import PdfReader

def read_pdfs_in_folder_and_delete_faulty(folder_path):
    """Remove every ``.pdf`` directly inside ``folder_path`` that cannot be opened.

    Each candidate is opened in binary mode and handed to ``PdfReader``. When
    that raises, the failure is printed and the file is deleted from disk.
    Errors raised while deleting are printed too and never propagate.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
    """
    for entry in os.listdir(folder_path):
        # Only names ending in ".pdf" (case-sensitive) are considered.
        if not entry.endswith(".pdf"):
            continue

        try:
            pdf_path = os.path.join(folder_path, entry)
            with open(pdf_path, 'rb') as handle:
                PdfReader(handle)
            # The reader could be built: keep the file and move on.
            continue
        except Exception as read_error:
            print(f"Failed to read {entry}: {read_error}")

        # Reached only when opening/parsing failed; the handle is closed by now.
        try:
            os.remove(pdf_path)
            print(f"Deleted faulty file: {entry}")
        except Exception as remove_error:
            print(f"Failed to delete faulty file {entry}: {remove_error}")

# Example usage:
# WARNING: this call is destructive -- every ".pdf" in the folder that PdfReader
# fails to open is removed from disk.
# NOTE(review): this validates "./dataset2/ugly2", while the processing cell
# further down reads "./dataset2/ugly" -- confirm which folder is intended.
folder_path = "./dataset2/ugly2"
read_pdfs_in_folder_and_delete_faulty(folder_path)

In [None]:
class Processor():
    """Turn text documents into smoothed sentiment time series.

    A sentiment back-end is selected by name at construction time. Each
    document is read as plain text, passed to the back-end, and the returned
    series is smoothed with a moving average.

    Attributes set by ``__init__``:
        sentiment_processor: Callable taking the raw text and returning a
            numeric series (presumably one score per text unit -- the
            analysers live in other modules; verify there).
        multiprocess: Whether ``process`` fans the files out to a process pool.
    """

    def __init__(self, sentiment_processor) -> None:
        """Select the sentiment back-end.

        Args:
            sentiment_processor: One of ``"lexicon"``, ``"transformer"`` or
                ``"textBlob"``.

        Raises:
            ValueError: If the name is not one of the supported back-ends.
                (``ValueError`` is an ``Exception`` subclass, so callers that
                caught the previous generic ``Exception`` keep working.)
        """
        if sentiment_processor == "lexicon":
            self.sentiment_processor = lexicon_analyse_text
            self.multiprocess = True
        elif sentiment_processor == "transformer":
            transformer = Transformer(model="distilbert")
            self.sentiment_processor = transformer.analyse_text
            # The transformer back-end is run sequentially in this process.
            self.multiprocess = False
        elif sentiment_processor == "textBlob":
            self.sentiment_processor = blob_analyse_text
            self.multiprocess = True
        else:
            raise ValueError(
                f"Unknown sentiment processor {sentiment_processor!r}; "
                "choose one of 'lexicon', 'transformer' or 'textBlob'"
            )

    def read_file_text(self, file_path):
        """Return the text content of a ``.pdf`` or ``.txt`` file.

        Args:
            file_path: Path to the document; the extension (case-insensitive)
                decides how it is read.

        Returns:
            The text of the file. For PDFs the per-page texts are joined with
            a single space.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == ".pdf":
            with open(file_path, "rb") as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                # Defensive: a falsy page result (e.g. None) is treated as an
                # empty string so that join() cannot raise on it.
                text = " ".join(
                    (page.extract_text() or "") for page in pdf_reader.pages
                )
        elif file_extension == ".txt":
            # NOTE(review): no explicit encoding is passed, so the platform
            # default applies -- confirm the corpus encoding.
            with open(file_path, "r") as txt_file:
                text = txt_file.read()
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")

        return text

    def moving_average(self, timeseries, window_len):
        """Smooth ``timeseries`` with a sliding mean of ``window_len`` samples.

        The window is clamped to ``[1, len(timeseries)]``: a window of 0 used
        to raise (mismatched slices / division by zero) and a window longer
        than the series used to produce an empty result. For windows already
        inside that range the output is unchanged.

        Args:
            timeseries: Sequence of numbers (flattened if multi-dimensional).
            window_len: Requested window size in samples.

        Returns:
            1-D float array of length ``len(timeseries) - window + 1``, or an
            empty array when ``timeseries`` is empty.
        """
        series = np.asarray(timeseries, dtype=float).ravel()
        if series.size == 0:
            return series

        window_len = max(1, min(int(window_len), series.size))
        # Prefix sums with a leading 0 make every window sum a single subtraction.
        cumsum = np.cumsum(np.insert(series, 0, 0.0))
        return (cumsum[window_len:] - cumsum[:-window_len]) / float(window_len)

    def process_file(self, file_path, avg_percent=0.01):
        """Read one document and return its smoothed sentiment series.

        Args:
            file_path: Path to a ``.pdf`` or ``.txt`` file.
            avg_percent: Fraction of the raw text length (in characters) used
                as the moving-average window.

        Returns:
            The moving-averaged series as a NumPy array.
        """
        raw_text = self.read_file_text(file_path)
        # NOTE(review): the window is derived from the character count of the
        # text, not from the length of the series returned by the analyser --
        # confirm this is intended. moving_average() clamps it to a valid range.
        avg_window_len = int(len(raw_text) * avg_percent)

        timeseries = self.sentiment_processor(raw_text)
        return self.moving_average(timeseries, avg_window_len)

    def process(self, folder_path):
        """Process every ``.pdf``/``.txt`` file directly inside ``folder_path``.

        Files are handled in sorted path order so that the position of each
        series in the result is reproducible between runs (``os.listdir``
        gives no ordering guarantee).

        Args:
            folder_path: Directory to scan (not recursive).

        Returns:
            List with one smoothed series per file, in sorted path order.
        """
        files = sorted(
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith((".pdf", ".txt"))
        )

        print(files[:10])

        if self.multiprocess:
            # NOTE(review): the pool pickles the bound method and needs
            # __main__ to be importable by the workers; verify this works on
            # your platform when the class is defined in a notebook cell.
            with ProcessPoolExecutor() as executor:
                # Materialise results while the pool is still alive.
                return list(executor.map(self.process_file, files))

        return [self.process_file(file) for file in files]

In [None]:
# Input folders for the two document classes.
good_path = "./dataset2/good"
ugly_path = "./dataset2/ugly"
# NOTE(review): min_lenght (sic), average_len and window_len are not read by
# any code visible here; the smoothing window is computed inside
# Processor.process_file from avg_percent -- confirm whether they are needed.
min_lenght=40000
average_len = 1000  # The window length for moving average
window_len=20

# "textBlob" selects blob_analyse_text and enables multiprocessing in Processor.
processor = Processor("textBlob")

#good_timeseries = processor.process(good_path )
# One smoothed sentiment series per .pdf/.txt file found in ugly_path.
ugly_timeseries = processor.process(ugly_path )

In [None]:
import numpy as np
from scipy.signal import resample
import matplotlib.pyplot as plt
def resample_lists_to_arrays(list_of_lists, target_length):
    """Resample every series to exactly ``target_length`` samples.

    Args:
        list_of_lists: Iterable of numeric sequences, possibly of different
            lengths.
        target_length: Number of samples each output array must have.

    Returns:
        A list of NumPy arrays, one per input sequence and in the same order,
        each produced by ``scipy.signal.resample``.
    """
    return [
        resample(np.array(series), target_length)
        for series in list_of_lists
    ]


def plot_first_n_examples(timeseries_data, n=2):
    """Plot the first ``n`` series, each in its own figure.

    At most ``len(timeseries_data)`` figures are drawn, so asking for more
    examples than are available no longer raises ``IndexError``.

    Args:
        timeseries_data: Indexable collection of 1-D numeric series.
        n: Maximum number of series to plot, starting from index 0.
    """
    n_plots = min(n, len(timeseries_data))
    for i in range(n_plots):
        series = timeseries_data[i]
        plt.figure()
        # The x axis is simply the sample index of the series.
        plt.plot(np.arange(len(series)), series, label=f'emotional arcs novel={i}')
        plt.title(f"Example {i + 1}")
        plt.xlabel("Time")
        plt.ylabel("Emotional Score")
        plt.show()

# Bring every series to a common length so they can be stacked into one 2-D array.
target_length = 5000
#good_resampled = resample_lists_to_arrays(good_timeseries, target_length)
ugly_resampled = resample_lists_to_arrays(ugly_timeseries, target_length)
# Visual sanity check of the first two resampled series (n defaults to 2).
plot_first_n_examples(ugly_resampled)

In [None]:
# NOTE(review): the variable is named `good` but it holds the stacked *ugly*
# series (see the output file name) -- presumably left over from toggling
# between the two classes; confirm before reusing it downstream.
good= np.stack( ugly_resampled, axis=0 )
#ugly=np.stack( ugly_resampled, axis=0)

# Persist the stacked array (one row per document) in NumPy .npy format.
with open('./dataset/dataset_ugly_2.npy', 'wb') as f:
    np.save(f, good)
   # np.save(f, ugly)

In [None]:
# Load a previously saved array for comparison.
# NOTE(review): this reads dataset_ugly_1.npy, whereas the saving cell writes
# dataset_ugly_2.npy -- confirm the comparison target is intentional.
with open('./dataset/dataset_ugly_1.npy', 'rb') as f:
    good2 = np.load(f)
   # ugly1 = np.load(f)

In [None]:
# Check whether the freshly built array matches the one loaded from disk.
# np.array_equal compares shapes as well as values, so arrays with a different
# number of rows yield False instead of being broadcast or raising.
l,_ = good.shape  # number of rows (documents); also asserts the array is 2-D
print(np.array_equal(good, good2))
      

In [None]:
# Shape of the stacked array: (number of series, samples per series).
print(good.shape)