In [1]:
# !pip install nltk
# !pip install textblob

## 1) Loading The Text Data 

In [2]:
import pickle
import os
import re
import nltk
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
# nltk.download('averaged_perceptron_tagger')
import numpy as np
import time
import torch

In [3]:
data_variants = [
    "Sentences_50Agree",
    "Sentences_66Agree",
    "Sentences_75Agree",
    "Sentences_AllAgree",
]


In [4]:
folder_path = "/home/charanubuntu/NLP/FinancialPhraseBank-v1.0/FinancialPhraseBank-v1.0/"
files = os.listdir(folder_path)

for i in files:
    print(i)

Sentences_75Agree.txt
Sentences_66Agree.txt
Sentences_AllAgree.txt
README.txt
Sentences_50Agree.txt
License.txt


In [5]:
chosen_variant = data_variants[2] + ".txt"

sentences = []

label_map = {"positive" : 1 ,"neutral" : 0 , "negative" : -1}

labels = []

if chosen_variant in files :
    
    file_path = os.path.join(folder_path,chosen_variant)
    
    with open(file_path,encoding="iso-8859-1") as file :
        
        for id_,line in enumerate(file):
            x = re.split("@",line)
            
            curr_label = re.split(r"\n",x[1])
            
            sentences.append(x[0])
            
            labels.append(label_map[curr_label[0]])
            
                
else :
    print("No file")

In [6]:
# Collecting the sentences and corresponding labels 

sent_arr = np.array(sentences)

label_arr = np.array(labels)

In [None]:
test_sent = sent_arr[]

## 2) Text Pre Processing Steps

In [7]:

# Remove Punctuations

def remove_punctuation(sentences):
    
    for i in range(len(sentences)):
        
       sentences[i] =  re.sub("\W"," ",sentences[i])

    return sentences

# Remove the stopwords

def remove_stopwords(sentences):
    
    stop_words = stopwords.words('english')
    
    
    for i in range(len(sentences)) :
        
        filtered_words = [' ' + word for word in sentences[i].split() if word.lower() not in stop_words]
        
        sentences[i] = ''.join(filtered_words)
    
    return sentences


# Correcting the sentences 

def spell_check(sentences):
    
     for i in range(len(sentences)):
        
       sentences[i] = str(TextBlob(sentences[i]).correct())

     return sentences
    
        
# Tokenization 

def tokeniser(sentence):
    
    words = re.split("\s+",sentence)
    
    return words


# Convert Capital to Small Letters


def lemmatize_sent(sentence):
    
    lemmatizer = WordNetLemmatizer()
    
    words = tokeniser(sentence)
    
    lem_tokens = []
    
    for word in words :
        
        word_pos = pos_tag([word])[0][1]
        
        pos_tag_map = {
            'N': 'n',  # Noun
            'V': 'v',  # Verb
            'J': 'a',  # Adjective
            'R': 'r'   # Adverb
        }
        
        
        pos = word_pos[0].upper()
        
        if pos in pos_tag_map:
            lem = lemmatizer.lemmatize(word,pos = pos_tag_map[pos])
        else :
            lem = lemmatizer.lemmatize(word,pos = 'n')
        
        lem_tokens.append(lem.lower())
    
    return lem_tokens
     

def lower_case(tokens):
    
    for i in range(len(tokens)) :
         tokens[i] = tokens[i].lower()
    
    return tokens


# Lemmatizing the tokens -> eg : running ,runs -> run, it is a process of grouping together different inflected forms of the same word.


In [8]:
sent_without_punct = remove_punctuation(sent_arr) # Removed Punctuations 


In [9]:
# spell_check_sentences = spell_check(sent_without_punct) # Correcting the spell mistakes
   

In [10]:
# Processed the sentences without any stop words

# sentences_without_stp_wrds = remove_stopwords(spell_check_sentences)


In [11]:
# Lemmatization

lem_tokens_sent = []

for line in sent_without_punct :
        
   lem_tokens_sent.append(lemmatize_sent(line))


In [12]:
new_tok_sent = [sent[:-1] for sent in lem_tokens_sent]

# Lower Casing the tokens 
tok_sent = [[word.lower() for word in sent] for sent in new_tok_sent]

In [13]:
tok_sent[0]

['according',
 'to',
 'gran',
 'the',
 'company',
 'have',
 'no',
 'plan',
 'to',
 'move',
 'all',
 'production',
 'to',
 'russia',
 'although',
 'that',
 'be',
 'where',
 'the',
 'company',
 'be',
 'grow']

In [14]:
# Tokenization

tokens = []

for sent in tok_sent :
    
    for word in sent :
        
        tokens.append(word)
     
tokens = np.array(list(set(tokens)))

In [15]:
print(len(tokens))

7162


## 3) Saving the contents to a File

In [16]:
# We will save lem_tokens_sent and labels 

preprocessed_data = {
    "lem_tokens_sent": tok_sent,
    "Vocabulary" : tokens,
    "labels": labels,   
}


with open("preprocessed_data.pkl", "wb") as file:
    pickle.dump(preprocessed_data, file)
