   # Mini Projet Indexation Web 

## Import modules

In [138]:
from collections import Counter
import numpy as np 
import re #For regular expressions
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import os 
import time
from collections import defaultdict
import pickle 
import gzip
import operator 
from multiprocessing import Pool
import glob
from functools import reduce
import warnings
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix


# Global variables

In [4]:
nltk.download("stopwords")
nltk.download("wordnet")
lem = WordNetLemmatizer()
stops = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Loading and cleaning data (texts)

## First, get the list of paths to all data files and then read them


In [8]:
list_rep=glob.glob("data/**/*.txt",recursive=True)

def readText(path):
'''
Takes a path and returns the corresponding file as python string
'''
    with open(path, "r") as f:
        return f.read()

### Paralellized loading

In [11]:

start = time.time()
with Pool(8) as pool:
    list_text = pool.map(readText, list_rep)
end = time.time()
print(end - start)

0.14348483085632324


### Loading with for loop

In [12]:
#Loading data in a list
start=time.time()
list_text = []
for path in list_rep:
    with open(path, 'r') as f:
        text = f.read()
        list_text.append(text)
end=time.time()
end-start

0.058347463607788086

# Pre-processing
1. Tokenization
    - Lower text case
    - Remove stopwords
    -Apply lemmatization 
2. Building the vocabulary: words and frequencies

## Tokenization

In [13]:
def cleanText(text, stopwords=stops, lem = lem):
    
    """Take a text file and put it in lower case
    then remove stopwords,
    and apply stemming to each word 
    """
    text = re.findall("[a-zA-z0-9]+", text.lower())
    return [lem.lemmatize(elt) for elt in text if elt not in stops]

In [15]:
start = time.time()
with Pool() as pool:
    list_processed_texts = pool.map(cleanText, list_text)
end = time.time()
end-start

3.764699935913086

In [16]:
start = time.time()
list_processed_texts = list(map(cleanText, list_text))
end = time.time()
end-start
####Slower than map with pool

3.967571258544922

## Building the Vocabulary

In [18]:
def vocabOneFile(termlist):
    """
        This function takes a list of words and return a dictionary
        with frequency for each word
    """
    return Counter(termlist)

def vocabGlobal(termlists):
    """
    This function is the generalization of vocabOneFile which takes a list of words and return a dictionary
        with frequency for each word. So it returns global vocabulary(words,frequencies) of a list of documents.
    """
    myCounter = Counter()
    for text in termlists:
        myCounter += Counter(text)
    return dict(myCounter)

In [20]:
start = time.time()
with Pool(8) as pool:
    vocabulary = pool.map(vocabOneFile, list_processed_texts)
vocabulary = dict(reduce(lambda x,y:x+y, vocabulary))
end = time.time()
end-start

21.24228000640869

In [23]:
start = time.time()
global_vocabulary = vocabGlobal(list_processed_texts)
end = time.time()
end-start
####faster than map with pool

2.1786749362945557

# Building the index

In [25]:

def index_one_file(termlist):
    """
    This function take a list of terms and a dictionnary containing word as key and their positions as value.
    input : doc = [word1, word2, ...]
    output = {word1: [pos1, pos2], word2: [pos2, pos434], ...}
    """
    fileIndex = defaultdict(list)
    for index, word in enumerate(termlist):
        fileIndex[word].append(index)
    return dict(fileIndex)

In [112]:
def comptime(func,args,n): #args is the list of arguments for func in the order
    t=0
    s=time.time()
    for i in range(n): 
        tmp=func(*args)
    e=time.time()
    return (e-s)/n # Faster than the first implementation.

In [42]:
#With map
s=time.time()
with Pool(8) as pool:
    dic1=pool.map(index_one_file,list_processed_texts)
dic1={k:dic1[k] for k in range(len(dic1))}
e=time.time()
e-s

0.8066937923431396

In [43]:
#### Index with dictionary comprehension : two times Faster than mapReduce
e=time.time()
index={k:index_one_file(list_processed_texts[k]) for k in range(len(list_processed_texts))}
s=time.time()
s-e

0.4753296375274658

In [39]:
dic1[0]

{'0003': [217],
 '0840': [165],
 '10': [4],
 '110': [40],
 '13': [65],
 '14': [9],
 '15': [66, 102],
 '150': [90],
 '160': [91],
 '1995': [47],
 '2': [103, 176, 183, 215],
 '2423': [216],
 '26': [162, 168],
 '3': [128],
 '38': [127],
 '4': [114],
 '42': [214],
 '5': [5, 10, 110],
 '6': [113],
 '7': [41, 50],
 '80': [184],
 '87': [177],
 '9': [111],
 '939': [163],
 '945': [169],
 '95': [49],
 'accounting': [126],
 'added': [118, 211],
 'allowing': [141],
 'also': [147],
 'analyst': [22, 61, 132],
 'august': [12, 107, 117],
 'average': [63],
 'bank': [180],
 'basket': [175],
 'bear': [191],
 'benefit': [206],
 'better': [198],
 'billion': [6, 11, 42, 51, 67, 92],
 'buoyed': [13],
 'bureau': [27],
 'central': [179],
 'certainly': [209],
 'competitive': [143],
 'continued': [125],
 'crown': [7, 43, 52, 68, 93, 146, 160, 205],
 'csu': [28, 97],
 'currency': [174],
 'czech': [1, 25],
 'data': [29],
 'dealer': [203],
 'deficit': [3, 48, 58, 94, 196],
 'deviation': [172],
 'digested': [158],
 

In [139]:
index[0]

{'0003': [217],
 '0840': [165],
 '10': [4],
 '110': [40],
 '13': [65],
 '14': [9],
 '15': [66, 102],
 '150': [90],
 '160': [91],
 '1995': [47],
 '2': [103, 176, 183, 215],
 '2423': [216],
 '26': [162, 168],
 '3': [128],
 '38': [127],
 '4': [114],
 '42': [214],
 '5': [5, 10, 110],
 '6': [113],
 '7': [41, 50],
 '80': [184],
 '87': [177],
 '9': [111],
 '939': [163],
 '945': [169],
 '95': [49],
 'accounting': [126],
 'added': [118, 211],
 'allowing': [141],
 'also': [147],
 'analyst': [22, 61, 132],
 'august': [12, 107, 117],
 'average': [63],
 'bank': [180],
 'basket': [175],
 'bear': [191],
 'benefit': [206],
 'better': [198],
 'billion': [6, 11, 42, 51, 67, 92],
 'buoyed': [13],
 'bureau': [27],
 'central': [179],
 'certainly': [209],
 'competitive': [143],
 'continued': [125],
 'crown': [7, 43, 52, 68, 93, 146, 160, 205],
 'csu': [28, 97],
 'currency': [174],
 'czech': [1, 25],
 'data': [29],
 'dealer': [203],
 'deficit': [3, 48, 58, 94, 196],
 'deviation': [172],
 'digested': [158],
 

# Building the Inverted Index

## First implementation
Create inverted index from list of list_processed_texts

In [54]:
def create_inv_index(data):
    index = defaultdict(list)
    res={}
    
    for i, words in enumerate(data):
        for word in words:
            index[word].append(i)
    index_new = {}
    for word in index.keys():
        index_new[word] = dict( Counter( index[word] ) )
        for doc_num in index_new[word].keys():
            pos = np.array((np.where(np.array(data[doc_num])==word))).tolist()
            index_new[word][doc_num] = {"occurencies":index_new[word][doc_num], "positions":pos[0], "doc_size":len(data[doc_num])}
        res[word] = index_new[word]
            
    return res

In [55]:

start = time.time()
test_Index = create_inv_index(list_processed_texts)
end = time.time()
end-start

17.024723291397095

## Second implementation
Create inverted index from the normal index.

In [185]:
def inverted_index(index):
    """
    This function takes a normal index and return the inverted index. More details below:
    input = {doc_id: {word: [pos1, pos2, ...], ... }}
    output = {word1: {doc_id:{doc_size,[pos1, pos2],freq},....}, ..., ...}
    """
    inv_index = defaultdict(dict)
    for doc_id in index.keys():
        doc_size=sum([len(list_pos) for list_pos in index[doc_id].values()]) #computes doc_size from list of positions in indexe
        for word in index[doc_id].keys():

            inv_index[word][doc_id]={'doc_size':doc_size,
                              'positions':index[doc_id][word],
                              'occurencies':len(index[doc_id][word])}
    return dict(inv_index)

In [186]:
t=0
s=time.time()
for i in range(10): 
    inv_index=inverted_index(index)
e=time.time()
(e-s)/10 # Faster than the first implementation.

0.8610539674758911

In [187]:
comptime(inverted_index,index,100)

TypeError: inverted_index() takes 1 positional argument but 2500 were given

In [106]:
# inv_index["china"]
# *[1,2]

# Serialize inverted index with *pickle* and *gzip*

In [58]:
#store the object
with gzip.open("Index_Articles.pklz", "wb") as fout:
    pickle.dump(inv_index,fout)
    

#restore the object
with gzip.open("Index_Articles.pklz", "rb") as fin:
    indexe_pkl= pickle.load(fin)

In [None]:
indexe_pkl["china"]

# Create Search functions

## Single-word Queries - Function *sing_woq(word,index)*
In which documents does a given word occur?
To do that, we check if the word is in inverted index keys and return the corresponding documents otherwise the function returns None 

In [59]:
def sing_woq(query, index):
    """
    This function takes a query, tokenizes it and then checks if the word is in the
    inverted index keys and return the corresponding documents;
    otherwise the function returns None 
    """
    query = cleanText(query)
    if len(query)==1 and query[0] in index.keys(): ### ensure the query is a single word and exists in the vocab
        return list(index[query[0]].keys())
    elif len(query)>1 : ##The user provided more than one word
        warnings.warn("Please provide a single word!")
        return None
    else : return None

In [64]:
query1 = sing_woq("tournament", indexe_pkl)
query1

[5, 16, 17, 22, 25, 26, 29, 37, 42, 846, 924, 1005, 1020, 1930, 1945]

In [65]:
sing_woq("the", indexe_pkl)

## Free-text Queries
* Which documents contain at least one word from a given list of words?

We split the query in a list of words and then apply the previous function to each word.
The returned object is the union of results.

In [66]:
def free_tq(query, index):
    
    """
        This function takes free text as query and return a list of all docs containing at least one term
        from the query
    """
    
    list_words=query.split()
    res = [id_doc for word in list_words if sing_woq(word, index) for id_doc in sing_woq(word, index) ]
    if len(res)>0:
        return sorted(list(set(res)))
    else:
        print("No match found!!!")
        return None
    
    

In [68]:
free_tq("The british tournament ", inv_index)[:10]

[1, 5, 16, 17, 22, 25, 26, 29, 37, 42]

In [108]:
comptime(free_tq,["the british",inv_index],10)

0.0001119852066040039

## Phrase Queries
* Which documents contain a given phrase in the same order?


1. We're looking for the texts belonging to the intersection of sing_woq for every word in the query text.
2. Then we check whether they are in correct order or not.

In [69]:
def phrase_query(query, index):
    """
        This function takes a phrase and return documents containing all the phrase's terms in the same order.
        1. We're looking for the texts belonging to the intersection of sing_woq for every word in the query text.
        2. Then we check whether they are in correct order or not.
    """
    list_words = cleanText(query) #cleaning query
    res = set()
    final_result = []
    
    #### Retrieve the list of docs containing all the terms in the query
    l=[set(sing_woq(word,index)) for word in list_words] #we use set because list doesn't recognize intersection as method
    ids=list(set.intersection(*l)) #gets id of documents containing all words in the query
    
    
    #Check whether terms are in correct order
    if len(ids) > 0: # If there is at least one document containing all words in the query
        
        # getting positions of words in documents
        for id_doc in ids: 
            posMinusOne = []
            list_pos=[index[word][id_doc]["positions"] for word in list_words] 
            for i, lpos in enumerate(list_pos) : #Check whether terms are in correct order
                posMinusOne.append( set([x-i for x in lpos]) )
                
            if (len(set.intersection(*posMinusOne)) > 0):
                final_result.append(id_doc)
                
    return final_result


In [None]:
list_text[533]

In [72]:
query3 = phrase_query("who will head the enlarged group.\nAdvance Bank shareholders would be offered a combination of A$2.10 in cash, a 20 cent special cash dividend and new St George shares up to a value of A$5.00", inv_index)
query3

[533]

# Ordering Queries  according to different metrics

## Ordering single word queries by term frequency in the documents

In [98]:
def sing_woq_TF(query,ind):
    """
        This fucntion takes a single word as query and return all the documents containing the query 
        ordered by the term's frequency in theses docs
    """
    
    query = cleanText(query) ##cleaning the query
    if len(query)==1 and query[0] in ind.keys():
        word=query[0]
        d={doc:ind[word][doc]["occurencies"] for doc in ind[word].keys()} # A dict containing the document id as key and the frequency of the word in the document
        
        sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True) #Sorted list of tuples (doc_ids,frequency) in descending order
        return sorted_d
    
    elif len(query)>1: ##The user provided more than one word
        warnings.warn("Please provide a single word !!!")
        return None
    
    else:
        print("Word not found !!!")
        return None

In [122]:
comptime(sing_woq_TF,["china",inv_index],100)

0.00023764848709106445

## Ordering single word queries by weighted term frequency (normalized by the doc size)

In [None]:
def sing_woq_TF_weighted(ind,query):
    """
        This fucntion takes a single word as query and return all the documents containing the query 
        ordered by the term's normalized frequency in theses docs
    """
    query = cleanText(query)
    if len(query)==1 and query[0] in ind.keys():
        word=query[0]
        d={doc:ind[word][doc]["occurencies"]/ind[word][doc]["doc_size"] for doc in ind[word].keys()} # A dict containing the document id as key and the frequency of the word in the document
        import operator 
        sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True) #Sorted list of tuples (doc_ids,frequency) in descending order
        return sorted_d
    
    elif len(query)>1: ##The user provided more than one word
        warnings.warn("Please provide a single word !!!")
        return None

    else:
        print("Word not found !!!")
        return None



In [133]:
sing_woq_TF_weighted(inv_index,"french")[:10]

[(1025, 0.036312849162011177),
 (1002, 0.03571428571428571),
 (1030, 0.033112582781456956),
 (1040, 0.028735632183908046),
 (1022, 0.02702702702702703),
 (1044, 0.023872679045092837),
 (1434, 0.023255813953488372),
 (1033, 0.022813688212927757),
 (1014, 0.022653721682847898),
 (1429, 0.022284122562674095)]

In [134]:
((list_processed_texts[1025].count("french")/len(list_processed_texts[1025])),
(list_processed_texts[1002].count("french")/len(list_processed_texts[1002])),
((list_processed_texts[1030].count("french")/len(list_processed_texts[1030]))))

(0.036312849162011177, 0.03571428571428571, 0.033112582781456956)

## Ordered queries weighted by tf_idf 

In [136]:
def sing_wq_tf_idf(query, index):
    """
        This fucntion takes a single word as query and return all the documents containing the query 
        ordered by the term's frequency normalized by term_freq*inverse_doc_freq  in theses docs
    """
    query = cleanText(query)
    
    if len(query)==1 and len(list(index[query[0]].keys())): ##Check if the query is contained in the index
        tmp = list(index[query[0]].items())
        N = len(tmp) ### Number of documents in which the query appears
        D = len(index) ##size of our index
        tmp = sorted( list(map(lambda x:(x[0], 
                                         (x[1]["occurencies"]/x[1]["doc_size"])*np.log(D/(1+N)),
                                        ), tmp)),
                     key=lambda a:a[1], reverse=True)
        return list(tmp)
    
    elif len(query)>1: ##The user provided more than one word
        warnings.warn("Please provide a single word !!!")
        return None

    else:
        print("Word not found !!!")
        return None

In [137]:
sing_wq_tf_idf("economy", inv_index)[:10]

[(427, 0.092865052631146416),
 (756, 0.071893983647055223),
 (1785, 0.070586820308017856),
 (1153, 0.069274797625712689),
 (1166, 0.067911518080600267),
 (421, 0.067193223177824696),
 (404, 0.065309301032652034),
 (657, 0.065157064899708794),
 (401, 0.063993545883642561),
 (158, 0.063818221100399711)]

# Vector Space Model - VSM

In [188]:
corpus=sorted(inv_index.keys()) #get words from texts in alphabetic order 
len(sorted(inv_index.keys()))>len(global_vocabulary)

False

In [189]:
corpus2=sorted(global_vocabulary.keys())

In [190]:
[word for word in corpus if word not in corpus2 ]

[]

In [191]:
def pre_sparse(doc_id=None,text=None):
    """
    This function take either a id of document or query(text) and return its a list of 3 elements [data, row, col] 
    needed for creating its sparse representation:
    data: sparse matric none null elements ie frequencies
    col: corresponding column for element in data
    row: corresponding row for element in data
    
    """
    data = []; row = []; col = []
    
    if doc_id and not text:
        for word in index[doc_id].keys():
            j=corpus.index(word)
            data.append(len(index[doc_id][word]))
            row.append(doc_id)
            col.append(j)
        return [data, row, col]
    if text and not doc_id and len(cleanText(text))>0:
        text=cleanText(text)
        for word in text:
            if word in corpus:
                j=corpus.index(word)
                data.append(text.count(word))
                col.append(j)
                row.append(0)   
        return [data, row, col]
    if (doc_id and text):
        warnings.warn("This function requires one argument")
        return None
    if not doc_id and not text:
        warnings.warn("This function requires one argument")
        return None

In [192]:
def pre_sparse_doc(doc_id):
    """
    A version of pre_sparse() for document. Needed in map and reduce
    """
    data = []; row = []; col = []
    for word in index[doc_id].keys():
        j=corpus.index(word)
        data.append(len(index[doc_id][word]))
        row.append(doc_id)
        col.append(j)
    return [data, row, col]



def pre_sparse_text(text):
    """
    A version of pre_sparse() text. Needed in map and reduce
    """
    data = []; row = []; col = []
    
    if text and len(cleanText(text))>0:
        text=cleanText(text)
        for word in text:
            if word in corpus:
                j=corpus.index(word)
                data.append(text.count(word))
                col.append(j)
                row.append(0)   
        return [data, row, col]
    else:
        warnings.warn("Review your text")
        return None


In [193]:
# csr_matrix(([1,2],([0,0],[2,3]))).todense()
pre_sparse_doc(doc_id=245)
pre_sparse_text("I'm british")

[[1], [0], [4345]]

In [196]:
def sparse(text=None,nrow=len(list_rep),ncol=len(corpus)): 
    
    """
    This function returns the sparse reprenstation of the its argument text. If text not provided it returns the 
    sparse matrix all documents. 
    """
    
    #if thext is empty the function return sparse matrix for all documents and the corpus
    #nrow=0 if sparsing vector else number of documents
    if text:
        pre_sps=pre_sparse_text(text=text)
        res=csr_matrix((pre_sps[0], (pre_sps[1], pre_sps[2])),shape=(1,ncol))
        return res, normalize(res,norm="l1",axis=1) #normalize is from sklearn
    
    else:
        with Pool(8) as pool:
             data_and_pos= pool.map(pre_sparse_doc,list(range(nrow)))
        add = lambda x,y:x+y
        data= reduce(add,[item[0] for item in data_and_pos],[])
        rows= reduce(add,[item[1] for item in data_and_pos],[])
        cols= reduce(add,[item[2] for item in data_and_pos],[])
        res= csr_matrix((data, (rows, cols)),shape=(nrow,ncol))
        return res, normalize(res,norm="l1",axis=1) #normalize is from sklearn

In [195]:
s1  =time.time()
spm,spm_normalized=sparse()
e1 = time.time()
e1-s1
# list(zip(list(range(21)),repeat(None)))

62.6634566783905

In [197]:
spm.shape

(2500, 26058)

In [198]:
inv_index["american"]
dict(list(inv_index["american"].items())[0:10])

{16: {'doc_size': 282, 'occurencies': 1, 'positions': [180]},
 17: {'doc_size': 222, 'occurencies': 2, 'positions': [125, 170]},
 38: {'doc_size': 174, 'occurencies': 1, 'positions': [126]},
 42: {'doc_size': 292, 'occurencies': 1, 'positions': [194]},
 83: {'doc_size': 357, 'occurencies': 1, 'positions': [345]},
 98: {'doc_size': 364, 'occurencies': 1, 'positions': [354]},
 106: {'doc_size': 247, 'occurencies': 1, 'positions': [27]},
 110: {'doc_size': 242, 'occurencies': 1, 'positions': [121]},
 144: {'doc_size': 215, 'occurencies': 1, 'positions': [124]},
 146: {'doc_size': 213, 'occurencies': 1, 'positions': [122]}}

In [199]:
list_text[55][25:200]

'eutsche Morgan Grenfell of any managers found to bear responsibility for failing to spot irregular dealings by former fund manager Peter Young is expected next week, banking s'

In [200]:
spm_normalized[17,corpus.index("american")],spm[17,corpus.index("american")]

(0.0090090090090090089, 2)

In [201]:
spmvec=sparse(text=list_text[55][25:200])[1]
spmvec.todense()[0,[corpus.index(word) for word in list_text[55][25:200].split() if word in corpus]]

matrix([[ 0.04761905,  0.04761905,  0.04761905,  0.04761905,  0.04761905,
          0.04761905,  0.04761905,  0.04761905,  0.19047619,  0.04761905,
          0.04761905,  0.04761905]])

In [202]:
spm.todense()[140,corpus.index("french")]

5

In [209]:
spm_normalized.todense()[0,:].sum()

1.0

# Serialize the sparse matrices

In [210]:
#store the object
with gzip.open("spm.pklz", "wb") as fout:
    pickle.dump(spm,fout)
with gzip.open("spm_normalized.pklz", "wb") as fout:
    pickle.dump(spm_normalized,fout)
    

#restore the object
with gzip.open("spm_normalized.pklz", "rb") as fin:
    spm_norm_pkl= pickle.load(fin)

In [211]:
spm_norm_pkl.todense()[0,:].sum()

1.0

# Dot product ordering documents

In [223]:
spmvec=sparse(text=list_text[55][25:200])[1]
dotp=spm_normalized.dot(spmvec.transpose()).toarray()
d={k:dotp[k] for k in range(dotp.shape[0])}
sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True)
sorted_d[:10]

[(436, array([ 0.00835623])),
 (55, array([ 0.00824774])),
 (113, array([ 0.00771159])),
 (444, array([ 0.00613497])),
 (268, array([ 0.0058072])),
 (431, array([ 0.00568586])),
 (446, array([ 0.00551572])),
 (432, array([ 0.00532387])),
 (768, array([ 0.0052381])),
 (770, array([ 0.0052381]))]

In [490]:
list_text[55][25:200]
# lem.lemmatize("Management")

'eutsche Morgan Grenfell of any managers found to bear responsibility for failing to spot irregular dealings by former fund manager Peter Young is expected next week, banking s'

In [486]:
list_text[436]

'Hong Kong funds are expected to erect "Chinese walls" between asset management and traders after revelations of unsanctioned trades at the colony\'s biggest fund manager, Jardine Fleming Investment Management.\nCentral dealing, a system that prevents fund managers from executing their own trades, is gaining favour in Asia after last week\'s shocking disclosure of late allocation of trades by one of Hong Kong\'s most prominent fund managers, Colin Armstrong.\n"There is an awareness in the pension (fund) community that this is an issue in Hong Kong," said Gregory Neumann, executive director at Scudder Stevens &amp; Clark Asia Ltd. "I think this could turn out to be a positive for the investment management industry -- no-one will get hired going forward without central dealing."\nRegulators unveiled severe punishment last week for Armstrong\'s actions, which involved delaying the allocation of his trades until the price had changed. Some of the deals involved his own personal trading acc

In [231]:
def free_tqV2(text,spm=spm_normalized,n=10):
    """
    This function takes a text and return the n most correlated documents with it. 
    output: [(doc1, corr1),(doc2,corr2),()...]
    """
    spmvec=sparse(text)[1]
    
    dotp=spm_normalized.dot(spmvec.transpose()).toarray()
    d={k:dotp[k] for k in range(dotp.shape[0])}
    sorted_d=sorted(d.items(), key=operator.itemgetter(1),reverse=True)[:n]
    return [(item[0],item[1][0]) for item in sorted_d] # just to simplify the display 
    

In [232]:
free_tqV2(list_text[55][25:200])

[(436, 0.008356227106227106),
 (55, 0.008247743541861188),
 (113, 0.0077115866589550783),
 (444, 0.0061349693251533744),
 (268, 0.0058072009291521487),
 (431, 0.0056858564321250887),
 (446, 0.0055157198014340872),
 (432, 0.0053238686779059439),
 (768, 0.005238095238095237),
 (770, 0.005238095238095237)]