In [1]:
import gzip
import io
import os
import re
import math
import time
import concurrent.futures

import sys
import import_ipynb

from typing import TextIO, BinaryIO
from typing import List,Iterator

sys.path.append('../')

from pre_processing.TextProcessor import TextProcessor

importing Jupyter notebook from C:\Users\Davide\IR\Progetto\pre_processing\..\pre_processing\TextProcessor.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\pre_processing\..\utilities\General_Utilities.ipynb


In [2]:
"""
    This class is used to avoid to load the entire data collection in memory and process a row at a time.
    In particular it is designed to load just a partial portion of the entire collection, then uncompress it
    and applying the pre-processing tecniques.
    It also gives the possibility to run a test-collection, all in complete transparency of the program who uses it.

"""

class Collection_Reader:
    """Iterator that yields the pre-processed documents of a collection, one at a time.

    In normal mode the collection is a gzip-compressed text file read in chunks of at
    most ``max_nr_of_documents_in_memory`` lines; each chunk is pre-processed (optionally
    by several worker processes) and its documents are then handed out one by one.
    In test mode the documents come from an in-memory list instead of a file.
    """
    
    __collection_file_name:str
         
    __escape_first_row_description:bool
    __use_steamming_and_remove_stop_words:bool    
        
    __num_parallel_processes:int
    __max_nr_of_documents_in_memory:int
  
    # Quoted so that defining the class does not require TextProcessor to be resolvable yet.
    __text_processor:"TextProcessor"
    # Opened with gzip.open(..., 'rt'), hence a text stream.
    __collection_file: TextIO
        
    __documents:Iterator[str]
        
    __test_mode:bool
    
    def __init__(self,path_collection_file_name:str,
                 max_nr_of_documents_in_memory:int,
                 num_parallel_processes:int,
                 use_steamming_and_remove_stop_words:bool,
                 escape_first_row_description:bool,
                 collection_test:List[str]=None):    
        """ Constructor methods for initialization:

            Args:
                path_collection_file_name: the file location of the (gzip-compressed) collection to be read.
                max_nr_of_documents_in_memory: maximum number of documents loaded in main memory at once (>=1).
                num_parallel_processes: number of parallel processes used for the pre-processing stage (>=1).
                use_steamming_and_remove_stop_words: forwarded to TextProcessor.
                escape_first_row_description: if True the first line of the file is read immediately,
                    the text matched by the pattern ".*?0\\t" is stripped from it, "0 " is prepended and the
                    result is pre-processed, printed and queued as the first document
                    (presumably this removes the archive header preceding document 0 - confirm).
                collection_test: (optional) if non-empty, these documents are used instead of the file
                    indicated in path_collection_file_name and a single process is used.

            Raises:
                ValueError: in file mode, if max_nr_of_documents_in_memory or num_parallel_processes is <= 0.
        """
        self.__text_processor = TextProcessor(use_steamming_and_remove_stop_words)
        
        if (collection_test):
            self.__test_mode=True
            self.__num_parallel_processes=1
            
            # The test collection is small by design: pre-process it eagerly.
            self.__documents=iter([self.__text_processor.process_text(doc) for doc in collection_test])
            
        else:
            self.__test_mode=False
            
            if (max_nr_of_documents_in_memory<=0):
                raise ValueError("Please enter a nr of documents >=1")
            
            if (num_parallel_processes<=0):
                raise ValueError("Please enter a nr of parallel processes >=1")
            
            self.__collection_file_name = path_collection_file_name
            self.__max_nr_of_documents_in_memory = max_nr_of_documents_in_memory
            self.__num_parallel_processes=num_parallel_processes
            self.__use_steamming_and_remove_stop_words=use_steamming_and_remove_stop_words
            self.__escape_first_row_description=escape_first_row_description

            self.__collection_file= gzip.open(self.__collection_file_name, 'rt', encoding='utf-8')

            try:
                if (escape_first_row_description):
                    line = self.__collection_file.readline()
                    result_string = self.__text_processor.process_text("0 "+re.sub(r".*?0\t", "", line))
                    print(result_string)
                    self.__documents=iter([result_string])
                else:
                    self.__documents=iter([])
            except Exception:
                # Do not leak the open file handle if the first row cannot be handled.
                self.__collection_file.close()
                raise
                
                
        print ("Collection_Reader Costructor")
        
        print ("Using: ")
        if (collection_test):
            print("Testing Mode : True")
            print("No. of documents in the test collection: "+str(len(collection_test)))
        else:
            print("Testing Mode: False ")
            print("Max Document in memory: "+str(self.__max_nr_of_documents_in_memory))
            print("No. of parallel processes: "+str(self.__num_parallel_processes))
            print("Use Stemming and stop word removal: "+str(self.__use_steamming_and_remove_stop_words))
        
        if (self.__num_parallel_processes>=2):
            print("No. of parallel processes>=2 be sure to executing  this program outside a jupyter notebook.")
        else:
            print("No. of parallel processes=1, you can execute it also inside a jupyter notebook.")
        print("\n")
        
            
            
    def __close_collection_file(self):
        """Close the collection file; calling it on an already closed file is a no-op."""
        if (not self.__collection_file.closed):
            self.__collection_file.close()
      
    def __iter__(self):
        """The reader is its own iterator."""
        return self
    
    
    def __next__(self):
        """Return the next pre-processed document.

        Raises:
            StopIteration: when the test collection, or the file and the last chunk
                read from it, are exhausted.
        """
        if (self.__test_mode):
            return next(self.__documents)
        
        current_document=next(self.__documents,None)
        # Keep loading chunks until one yields a document: a chunk can legitimately be
        # empty after pre-processing, which must not end the iteration prematurely.
        while (current_document is None):
            if (self.__collection_file.closed):
                raise StopIteration()
            current_document=self.__read_part_of_collection()
        return current_document
            
            
    def __read_raw_lines(self):
        """Read up to max_nr_of_documents_in_memory lines; close the file when its end is reached."""
        lines=[]
        # A line is read only while there is room for it, so no document is ever
        # consumed from the file and then thrown away.
        while (len(lines)<self.__max_nr_of_documents_in_memory):
            line = self.__collection_file.readline()
            if (line==""):
                self.__close_collection_file()
                print("End reading of entire collection!") 
                break
            lines.append(line)
        return lines
            
            
    def __read_part_of_collection(self):
        """Load and pre-process the next chunk of the collection.

        The chunk becomes the new source of documents.

        Returns:
            The first pre-processed document of the chunk, or None when the chunk
            produced no document (nothing left to read, or every line was discarded
            by the pre-processing).

        Raises:
            Any exception raised by a worker process (or by the process pool itself)
            is propagated instead of being printed and ignored, so that a failure
            cannot silently truncate the collection.
        """
        if (self.__collection_file.closed):
            return None
        
        start_time_loop = time.time()
        
        lines=self.__read_raw_lines()
        nr_doc=len(lines)
                
        if (nr_doc!=0):
            # Split the chunk in consecutive buffers, one per worker; the (possibly
            # shorter) last buffer is kept, so the tail of the file is not lost.
            max_doc_per_buffer=math.ceil(self.__max_nr_of_documents_in_memory/self.__num_parallel_processes)
            array_buffers=[lines[start:start+max_doc_per_buffer] for start in range(0,nr_doc,max_doc_per_buffer)]
            
            #If execute NOT inside a jupyter notebook
            if (self.__num_parallel_processes>=2):
                with concurrent.futures.ProcessPoolExecutor(max_workers=self.__num_parallel_processes) as executor:
                    futures = [executor.submit(execute_CPU_BOUND_preprocessing, i,buffer,self.__use_steamming_and_remove_stop_words) for i,buffer in enumerate(array_buffers)]
                    
                    # Results are collected in submission order to preserve document order.
                    return_list=[]
                    for future in futures:
                        return_list.extend(future.result())
                        
                self.__documents=iter(return_list)
            else:
                #If execute inside a jupyter notebook, costrained to one single process.
                # With one process max_doc_per_buffer equals the chunk size: there is exactly one buffer.
                pre_processed_docs=execute_CPU_BOUND_preprocessing(0,array_buffers[0],self.__use_steamming_and_remove_stop_words)
                self.__documents=iter(pre_processed_docs)
         
        end_part_loop = time.time()
        
        print("Doc_processed: "+str(nr_doc)+" time:"+str(end_part_loop-start_time_loop))
        
        return next(self.__documents,None)

In [3]:
def execute_CPU_BOUND_preprocessing(index,buffer,use_steamming_and_remove_stop_words):
    """Pre-process every line of ``buffer`` with a freshly built TextProcessor.

    Args:
        index: identifier of the buffer, used only in the progress messages.
        buffer: iterable of raw text lines.
        use_steamming_and_remove_stop_words: forwarded to the TextProcessor constructor.

    Returns:
        The pre-processed lines, in input order, skipping those that are empty
        (or whitespace only) after pre-processing. If an exception is raised while
        processing, it is printed and the lines collected so far are returned.
    """
    try:
        print(f"START execute_CPU_BOUND_preprocessing {index}")
        array_return = []
        processor = TextProcessor(use_steamming_and_remove_stop_words)

        for raw_line in buffer:
            cleaned = processor.process_text(raw_line)
            if cleaned.strip() == "":
                # Nothing left of this line after pre-processing: drop it.
                continue
            array_return.append(cleaned)

        print(f"indice {index} lunghezza array (indice){index} {len(array_return)}")

    except Exception as error:
        # Best effort: report the failure and fall through with what was collected.
        print("entro dentro il catch")
        print(error)

    finally:
        print(f"END execute_CPU_BOUND_preprocessing {index}")

    return array_return

In [7]:
#collection=Collection_Reader("C:/Users/Davide/IR/collection.tar.gz",10,1,False,True)

0 the presence of communication amid scientific minds was equally important to the success of the manhattan project as scientific intellect was the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant hundreds of thousands of innocent lives obliterated


In [8]:
# arrayDef=[]
# for doc in collection:
#     arrayDef.append(doc)
#     print(doc)

0 the presence of communication amid scientific minds was equally important to the success of the manhattan project as scientific intellect was the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant hundreds of thousands of innocent lives obliterated
entro dentro la read_collection del disco
creo altro buffer10
Executing inside a jupyter notebook
START execute_CPU_BOUND_preprocessing 0
indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.001995086669921875
1 the manhattan project and its atomic bomb helped bring an end to world war ii its legacy of peaceful uses of atomic energy continues to have an impact on history and science
2 essay on the manhattan project the manhattan project the manhattan project was to see if making an atomic bomb possible the success of this project would forever change the world forever making it known that something this powerful can be

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.005984067916870117
2311 in a report published in the new england journal of medicine researchers found that the brains of autistic children showed differences in certain regions that normally develop in the second trimester of pregnancy
2312 after that your appetite should return and will probably grow although food is looking much more appetizing be aware of how much you re eating you only need about an extra 300 to 500 calories a day during the second trimester and you should be gaining about 1 2 to 1 pound a week
2313 pregnancy is typically broken into three periods or trimesters each of about three months each trimester is defined as 14 weeks for a total duration of 42 weeks although the average duration of pregnancy is 40 weeks
2314 the second trimester is for many women the easiest three months of pregnancy take the time now while you re feeling better and your energy is up to st

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004988431930541992
4401 your screen will flash for a sec and that s the sign that the screenshot has been taken method 2 using keyboard if you are using a keyboard then you can simple press the windows key printscreen button combination to take screenshots your screen will flash for a sec which means that the screenshot has been taken method 3 using the snipping tool f you are using a keyboard then you can simple press the windows key printscreen button combination to take screenshots your screen will flash for a sec which means that the screenshot has been taken method 3 using the snipping tool
4402 how to take a screenshot there are a few ways to do this you can hold down the windows button on the surface pro 3 s bezel and the volume down button together until you see the screen blink to indicate that the shot was captured if you have a keyboard attached then hold down fn win space b

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.005984306335449219
6734 1 roast beef for 5 minutes per pound 2 then turn off oven 3 do not open oven for 2 hours for example a 4 lb roast would take 20 minutes to cook and then sit in the turned off oven for 2 hours 5 well the 5lb 2oz eye of round roast went in the oven for 30 minutes at 500 degrees after the two hour wait the oven temp and the internal temp of the meat were both about 153 a little more done than i would have liked but still very tasty and juicy
6735 place two teaspoons of oil olive or salad into the pot and add the roast heat it to a medium high temperature 4 add the other ingredients to the pot once the meat has browned add 2 1 2 cups 567ml of water two beef bouillon cubes and a bay leaf to the dutch oven until it boils when the pot reaches a boil cover the dutch oven with a lid and simmer the eye round roast for 50 minutes over a low flame lace two teaspoons of oil 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004986763000488281
34046 if loving you is wrong tv show cast if loving you is wrong to debut this fall premiering tuesday september 9 the new own drama series from tyler perry takes viewers into the lives of a group of husbands wives and friends that live and love in the same middle class neighborhood read more f loving you is wrong tv show summary pictures news cast forum if loving you is wrong tv show formerly known as single mom s club is a drama series on own from prolific writer director and producer tyler perry
34047 exclusive own oprah winfrey network has assembled the principal cast of the latest tyler perry drama series the upcoming if loving you is wrong formerly single mom s club f loving you is wrong produced by own and tyler perry studios is perry s second drama series for own under his production deal with the network
34048 by now viewers who have seen the first two episo

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.00498509407043457
36389 edward everett the featured speaker at the dedication ceremony of the national cemetery of gettysburg later wrote to lincoln i wish that i could flatter myself that i had come as near to the central idea of the occasion in two hours as you did in two minutes
36390 lincoln s gettysburg address on this day in 1863 president abraham lincoln delivers what will become one of the most famous speeches in american history at the dedication of the military cemetery at gettysburg pennsylvania
36391 gettysburg address lincoln s preparation though lincoln was extremely frustrated with meade and the army of the potomac for failing to pursue lee s forces in their retreat he was cautiously optimistic as the year 1863 drew to a close
36392 lincoln s gettysburg address on this day in 1863 president abraham lincoln delivers what will become one of the most famous speeches in amer

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004986763000488281
38622 many species of birds are economically important domesticated and undomesticated birds poultry and game are important sources of eggs meat and feathers songbirds parrots and other species are popular as pets guano bird excrement is harvested for use as a fertilizer birds prominently figure throughout human culture about 120 130 species have become extinct due to human activity since the 17th century and hundreds more before then
38623 the partially digested food then passes into the gizzard a specialized muscular portion of the stomach a bird will use its gizzard in the manner that other animals use their teeth to grind and crush hard nuts seeds grain and other foods birds do this because they don t have teeth
38624 birds live worldwide and range in size from the 5 cm 2 in bee hummingbird to the 2 75 m 9 ft ostrich they rank as the class of tetrapods with the m

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004988193511962891
40987 the first base 10 system one of the first base 10 systems was used in egypt back in 3100 bc this lays the foundation for our current number system today because we use in everyday life a base 10 number system
40988 confidence votes 1 2k the indians invented the modern number system it is often called arabic numerals because it came to europe through the arabs but arabs themselves call it as hindsaa meaning given by hindus or indians the persians copied the indian number system and then passed it on to the arabs
40989 zero was invented independently by the babylonians mayans and indians although some researchers say the indian number system was influenced by the babylonians the babylonians got their number system from the sumerians the first people in the world to develop a counting system
40990 another simple number system was the ordinal counting system it is 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004986763000488281
68817 traditionally a wiener schnitzel is a breaded veal cutlet served with a lemon wedge and a green salad my viennese mother taught me a much tastier and juicier version of this classic austrian recipe a lightly pounded slice of pork loin coated with seasoned flour egg wash and bread crumbs this post was mentioned on twitter by chef keem chef keem chef keem said wiener schnitzel jaeger schnitzel the difference between the two and a tasty breading recipe to boot
68818 wiener schnitzel vi n n ts l german for viennese schnitzel is a very thin breaded and pan fried cutlet made from veal it is one of the best known specialities of viennese cuisine the wiener schnitzel is the national dish of austria he designation wiener schnitzel first appeared in the end of the 19th century with the first known mention in a cookbook from 1831 in the popular southern german cookbook by

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.00498509407043457
71127 the structure and functions of the federal reserve system the federal reserve system is the central bank of the united states it was founded by congress in 1913 to provide the nation with a safer more flexible and more stable monetary and financial system over the years its role in banking and the economy has expanded test your knowledge about the federal reserve through these quizzes
71128 the board s most important responsibility is participating in the federal open market committee fomc which conducts our nation s monetary policy the seven governors comprise the voting majority of the fomc with the other five votes coming from reserve bank presidents
71129 the fed is an independent agency which means it can make decisions on its own without needing approval from any other branch of government however it is subject to questions from congress over its actions t

creo altro buffer10
Executing inside a jupyter notebook
START execute_CPU_BOUND_preprocessing 0
indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004986286163330078
73514 double entry a system of accounting in which every transaction is recorded twice as a debit and as a credit earnings per share a company s net profit after taxes for an accounting period divided by the average number of shares of stock outstanding during the period
73515 goodwill in accounting the difference between what a company pays when it buys the assets of another company and the book value of those assets sometimes real goodwill is
73516 best answer the sperm can actually survive for up to 7 days inside the woman s reproductive tract but this is very very rare more common is about three days the length of their life inside the female body really depends on the environment or the cervical mucus being made and how strong the sperms are
73517 the right environmen

indice 0 lunghezza array (indice)0 10
END execute_CPU_BOUND_preprocessing 0
10
Processati: 10 tempo:0.004985809326171875
75593 how much does a smog check cost in california i have a 2003 honda civic coupe and am needing to pay registration renewal it says smog certification required as well but i feel like i just got a smog check how much is the smog check on top of the 99 registration renewal
75594 the smog check program is an important part of the state s efforts to improve the air we breathe smog check inspections are designed to identify vehicles with excess emissions so they can be properly repaired the program has greatly reduced air pollution created by millions of cars in california
75595 47 as noted in this chapter the stress response has immediate intermediate and prolonged effects to reinforce yourunderstanding of each type re ect on how your body reacts to stress through these three processes 1
75596 this results in an increase in heart rate blood pressure and breathing rat

KeyboardInterrupt: 

In [66]:
# import time
# start_time = time.time()
# array_orig=[]
# array=[]
# temp=[]
# try:
#     tp=text_proc.TextProcessor(False)
#     with gzip.open("C:/Users/Davide/IR/collection.tar.gz", 'rt', encoding='utf-8') as gzipped_file:
#         # Read the first three lines
#         i=0
#         start_time_loop = time.time()
        
#         #str_buffer = StringIO()
#         str_array=[]
        
#         while(True):
#             #start_time = time.time()
            
#             line = gzipped_file.readline()
            
#             if (line==""):
#                 print("Fine lettura")
#                 break
            
#             array_orig.append(line)
            
            
#             #str_buffer.write(line+"symbfinestringasymb ")
#             str_array.append(line)
#             if (i>=2224 and i<2270):
#                 print(line+" symbfinestringasymb ")
            
            
#             #array.append(tp.process_text(line))

#             i+=1
#             if (i%5000==0):
                
#                 #result_string = ''.join(str_buffer.getvalue())
#                 #array.append(tp.process_text(result_string))
                
#                 for stringa in str_array:
                
#                     pre_processed=tp.process_text(stringa)
#                     if (pre_processed.strip()!=""):
#                         array.append(pre_processed)
#                         #array.extend([doc.strip()+"\n" for doc in pre_processed.split("symbfinestringasymb") if doc.strip()!=""])
                
#                 with open("bbb.txt", 'w',encoding='utf-8') as file:
                    
#                     for elem in array:
                        
#                         file.write(elem+"\n")
                
#                 break
                
#                 end_part_loop = time.time()
                
#                 str_buffer.close()
#                 str_buffer= StringIO()
                
#                 print("Processati: "+str(i)+" tempo:"+str(end_part_loop-start_time_loop))
#                 start_time_loop = time.time()
            
#             #end_time = time.time()
#             #print("tempo doc_"+str(i)+" "+str(end_time)+": "+str(start_time))
#             #print("tempo doc_"+str(i)+" "+str(end_time-start_time))
            
# except FileNotFoundError:
#     print(f"File not found: {file_path}")
    
# end_time = time.time()
# print("tempo necessario: "+str(end_time-start_time))


2224	14.6% of the population in Onslow County, NC (170,347 people) live below the poverty line, a number that is approximately the same as the national average of 14.7%. The largest demographic living in poverty is Female 25-34, followed by Female 18-24 and then Male < 5.
 symbfinestringasymb 
2225	Wage by Gender for Common Jobs. 1  Showing data for North Carolina. 2  Showing data for North Carolina. 3  The closest comparable wage GINI for Onslow County, NC is from North Carolina. 4  Use the dropdown to filter by race/ethnicity. 5  For anonymity, the ACS 1-year estimate groups occupations by course parent groupings.
 symbfinestringasymb 
2226	Population of Onslow County. Last official estimated population of Onslow County was 179,563 (year 2014) [1]. This was 0.056% of total US population and 1.8% of total North Carolina state population. Area of Onslow County is 905.9 miÂ² (=2346.3 kmÂ²), in this year population density was 198.21 p/miÂ². If population growth rate would be same as in 

TypeError: 'str' object is not callable

In [40]:
# indice=2224
# print (array_orig[indice])
# print (array[indice])

2224	14.6% of the population in Onslow County, NC (170,347 people) live below the poverty line, a number that is approximately the same as the national average of 14.7%. The largest demographic living in poverty is Female 25-34, followed by Female 18-24 and then Male < 5.

2224 14 6 of the population in onslow county nc 170 347 people live below the poverty line a number that is approximately the same as the national average of 14 7 the largest demographic living in poverty is female 25 34 followed by female 18 24 and then male 35 of the panda s bamboo habitat in the next 80 years and thus the panda population is projected to decline fan et al 2012 songer et al 2012 tuanmu et al 2013 li r et al 2015
