In [14]:
!pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.2-cp36-cp36m-win_amd64.whl
Collecting pybind11>=2.2
  Downloading pybind11-2.9.0-py2.py3-none-any.whl (210 kB)
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.0


In [2]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AB078357\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AB078357\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import re
import fasttext
import numpy as np

In [4]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

In [5]:
def tokenize_and_remove_punctuation(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    return tokens

In [6]:
def remove_stopwords(word_tokens):
    filtered_tokens = [word for word in word_tokens if word not in stopwords.words('english')]
    return filtered_tokens

In [7]:
def normalise_text(text):
    sent = text.lower()
    
    re.sub('\n+', ' ', sent)
    re.sub(' +', ' ', sent)
    tokens = tokenize_and_remove_punctuation(sent)
    lemmatized_tokens = lemmatize_sentence(tokens)
    orig = lemmatized_tokens
    filtered_tokens = remove_stopwords(tokens)
    if len(filtered_tokens) == 0:
        # if stop word removal removes everything, don't do it
        filtered_tokens = orig
    return filtered_tokens

In [8]:
import re
def camel_case_split(tokens):
    camel_tokens = []
    for token in tokens:
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', token)        
        token_parts = [m.group(0) for m in matches]
        camel_tokens.append(token_parts)
    return camel_tokens


In [9]:
def train_fasttext(txt = 'all_text_for_ft_train.csv'):
    ft_model = fasttext.train_unsupervised(txt, model='skipgram', lr=0.05, dim=100, ws=5, epoch=5)
    ft_model.save_model("fasttext.model")
    return ft_model

In [10]:
def normalise_title(title):
    title = title.strip(".csv")
    title.replace("_", " ")
    tokens = tokenize_and_remove_punctuation(title)
    
    tokens = camel_case_split(tokens)
    
    if len(tokens)> 1:
        
        tokens = [item for sublist in tokens for item in sublist]
    else:
        tokens = tokens[0]
    return tokens


['Act', '1957_10052012']


In [11]:
def get_title_vector(title_tokens, ft_model):
    vec = np.mean([ft_model[word] for word in title_tokens], axis=0)
    return vec


In [12]:
high_weight_words = {'report':2, 'tp':3, 'target':8, 'price': 4, 'reco':5 , 'recommendation':8 , 'sell':9, 'buy':9, 'hold':9, 'increase':9, 'reduce':9, 'rating':9, 'review':2, 'fiscal':2, 'stock':3, 'share':3}

def get_text_vector(text_tokens, ft_model):
    sum = 0
    '''
    if is_page_text:
        tax = 0.5
    else:
        tax = 1'''
    for token in text_tokens:
        try:
            ft_vec = ft_model[token]
        except:
            ft_vec = [0] * 100
        if token.lower() in high_weight_words.keys():
            #tax = 1 # if any of these words is there, then no tax
            vec = ft_vec * high_weight_words.get(token.lower())
        else:
            vec = ft_vec 
        sum = sum + vec
    if len(text_tokens)>0:
        mean = sum/len(text_tokens)
    else:
        mean = [0] * 100
            
    return mean

In [13]:
def get_doc_vector(text_vec, title_vec):
    doc_vec = np.concatenate((text_vec, title_vec), axis = None)
    return doc_vec

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def cluster_docs(doc_vecs):
    true_k = 2
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster:", i)
        for ind in order_centroids[i, :20]:
             print(terms[ind])

In [15]:
import csv
def write_all_txt_file(pdf_texts):
    all_text_list = [item for sublist in pdf_texts for item in sublist]            
    file = open('all_text_for_ft_train.csv', 'w+', newline ='\n', encoding = "utf-8")
    with file:
        write = csv.writer(file)
        write.writerows(map(lambda x: [x], all_text_list))
write_all_txt_file([["page1a", "page1a"], ["page2a", "page2b"]])

In [22]:
from os import listdir
from os.path import join
def load_pdf_texts():
    textpath = "needl\\pagewise_folder"
    files = [f for f in listdir(textpath) if f.endswith(".csv")]
    pdf_texts = []
    
    for file in files:
        df = pd.read_csv(join(textpath, file))
        # buy sell target is always in the first page! But we can train Fasttext on more text, so here we can take full text
        pdf_text = df['page_text'].to_list() # list of pages            
        pdf_text = [(re.sub(r'\n+ +', '', str(page))) for page in pdf_text]
        pdf_text = [(re.sub(r'\n|-+|_+', '', page)).strip() for page in pdf_text]
        pdf_text = [(re.sub(r'\t* +', ' ', page)).strip() for page in pdf_text] #list of pages: text of each page in a pdf, comma seperated
        pdf_texts.append(pdf_text)
        
    return files, pdf_texts
#load_pdf_texts()

In [23]:
import pandas as pd
pdf_titles, pdf_texts = load_pdf_texts()
print(len(pdf_texts))
write_all_txt_file(pdf_texts)


590


In [24]:
ft_model = train_fasttext()

In [25]:
index = 0
doc_vecs = []
all_text_tokens = []
all_title_tokens = []
for filename in pdf_titles:
    title_tokens = normalise_title(filename)
    all_title_tokens.append(title_tokens)
    text_tokens = normalise_text(" ".join(pdf_texts[index][:min(len(pdf_texts[index])-1, 2)])) #list of pages in one file--> take first 2 pages as all reports have it in first
    all_text_tokens.append(text_tokens)
    title_vec = get_title_vector(title_tokens, ft_model)
    text_vec = get_text_vector(text_tokens, ft_model)
    doc_vec = get_doc_vector(text_vec, title_vec)
    doc_vecs.append(doc_vec)
    index = index + 1

In [26]:
X = [np.array(doc) for doc in doc_vecs]

In [27]:
np.shape(X)

(590, 200)

In [28]:
X[:2]

[array([ 0.1921955 , -0.12435074, -0.21393298, -0.6243164 ,  0.00340513,
        -0.31052667,  0.18717957,  0.1272794 ,  0.49035332,  0.31263176,
         0.01808154,  0.150024  ,  0.25035203,  0.25768116, -0.18416847,
        -0.05810773,  0.2820143 , -0.24629155,  0.19242777,  0.24783288,
         0.12158884, -0.05855266, -0.062657  ,  0.11838558, -0.419708  ,
        -0.05644286, -0.14761865,  0.4465786 , -0.27247304,  0.10655572,
        -0.24593326,  0.11961447,  0.23427354, -0.09474812,  0.3012488 ,
        -0.11113767, -0.26062927, -0.5127111 , -0.26735786, -0.3927003 ,
        -0.21719971,  0.09702367,  0.02819763, -0.02862808, -0.01238478,
        -0.19762309, -0.02476684, -0.02394044, -0.100161  , -0.26864403,
        -0.15595084, -0.13729201, -0.2451535 , -0.32944354,  0.20419429,
         0.08596294,  0.13392596, -0.27922717, -0.5627646 , -0.24780582,
         0.35519147,  0.11456683,  0.05126396,  0.05663646, -0.6238701 ,
         0.2333658 ,  0.35012138, -0.00503631,  0.0

In [29]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=10)


In [30]:
model.fit(X)

KMeans(n_clusters=2)

In [31]:
# since the vectors are not just of words, we cant print words in each cluster to identify what each cluster signifies
# so we can take 10 manually chosen reports and see in which cluster they fall and use that as reports cluster-- take frequency, one in which max reports fall
manual_reports = ['[Kotak] Godrej Properties, January 29, 2019.pdf', '160125_Huchems_Final (2).pdf', '20141013_IndusInd-Bank-Limited_204_QuarterUpdate.pdf', 'Adani_Power_Q1FY18_results.pdf', 'BP Equities- National Buildings Construction Corporation Ltd NBCC- Short note_11th April, 2014.pdf', 'Exide_4QFY16 Results Review Final.pdf', 'HDFC Securities 16-May-16.pdf', 'IDirect_Greenply_IC.pdf', 'IIFL-+Titan-Gold+standard-ADD.pdf', 'Mahindra and Mahindra Financial Services-result update-Jul-13-EDEL.pdf']


In [32]:
def check(files):
    index = 0
    input_vectors = []
    for file in files:
        if file.endswith(".pdf"):
            file = file.strip(".pdf")+".csv"
        index = pdf_titles.index(file)
        doc_vec = doc_vecs[index]
        input_vectors.append(doc_vec)
    tags = model.predict(input_vectors)
    #all known reports are clustered in 0
    unique, counts = np.unique(tags, return_counts=True)
    print(dict(zip(unique, counts)))
    return tags


In [39]:
check(manual_reports) # Pdfs we surely know are reports by manual checking are falling 0 cluster, so report = cluster 0

{0: 10}


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [40]:
tags = check(pdf_titles)

{0: 571, 1: 19}


In [41]:
import collections

def search_recomendation(tokens):
    tokens = [token.lower() for token in tokens]
    index_dict = {'buy':-1, 'sell':-1, 'hold':-1, 'increase':-1, 'decrease':-1}
    try:
        index_dict['buy'] = tokens.index("buy")
    except:
        pass
    try:
        index_dict['sell'] = tokens.index("sell")
    except:
        pass
    try:
        index_dict['hold'] = tokens.index("hold")
    except:
        pass
    try:
        index_dict['increase'] = tokens.index("increase")
    except:
        pass
    try:
        index_dict['decrease'] = tokens.index("decrease")
    except:
        pass
    
    if not (all(value == -1 for value in index_dict.values())):
        index_dict = {key:val for key, val in index_dict.items() if val != -1} 
        recomendation = min(index_dict, key=index_dict.get)
        return recomendation
    else:   
        return "NA"
        

In [153]:
def search_targetprice(full_text):
    
    matches = re.findall(r"(?i)(target price|TP).*\d+", full_text[0])
    # in between target price and the value, there may be words like "TP amounts to INR 123", or "TP: INR123"
    # but this .* takes everything upto the last digit in text
    if matches != []:
        phrase = matches[0]
        values = re.findall("...\d+", phrase) # in case of "TP '234" there 3 characters to catch the currency -- INR, W, RS etc
        
        if len(values) > 0:
            value = re.sub(': ', '', values[0]).strip()
            ret = value
        
        else:
            try:
                digit_matches = re.findall('\d+', phrase)[0].strip()
                ret = digit_matches[0]
            except:
                ret = "NA"
            
    else:
        ret = "NA"
    return ret

In [156]:
def search_details(filename, index):
    report_info = {}
    full_text = pdf_texts[index]
    text_tokens = all_text_tokens[index]
    title_tokens = all_title_tokens[index]
    recomendation = search_recomendation(text_tokens)
    target = search_targetprice(full_text)
    return recomendation, target

In [162]:
index = 0
score = 0
reco = ['buy', 'sell', 'hold', 'increase', 'decrease']
report_mined = []
recommendation = ''
c = 1
for filename in pdf_titles:
    searched = 0
    #smallest subset
    for token in title_tokens:
        if token in reco:
            recommendation, target = search_details(filename, index)
            searched = 1
            break
    if searched == 0:
        for token in text_tokens:
            if token in high_weight_words.keys():
                recommendation, target = search_details(filename, index)
                
                searched = 1
                break
        if searched == 0:
            if tags[index] == 0: # in reports cluster
                searched == 1
                recommendation, target = search_details(filename, index)
    if searched == 1 and recommendation != "NA":
        report_mined.append({"file": filename.strip(".csv"), "reco": recommendation, "target price": target})
        all_text_tokens[index]
        
    index = index + 1


In [160]:
import json

with open('report_extracted.json', 'w') as file:
    json.dump(report_mined, file, indent = 4 )