In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "/content/inputData"

## Preprocess Data

#### File information

In [3]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['f3.txt', 'f1.txt', 'f2.txt'] 
 ['/content/inputData/f3.txt', '/content/inputData/f1.txt', '/content/inputData/f2.txt']


In [4]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'/content/inputData/f3.txt': 'Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.\nThe long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 banks to which it owes nearly nine billion pounds.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of insolvency but leaves shareholders owning only 54.5 percent of the company.\n"The restructuring plan provides Eurotunnel with the medium term financial stability to allow it to consolidate its substantial commercial achievements to date and to develop its operations," Eurotunnel co-chairman Alastair Morton said.\nThe firm was now making a profit before interest, he added.\nAlthough shareholders will see their interests diluted, they were offered the prospect of a brighter future after months of uncertainty 

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [5]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [6]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [7]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [8]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)

    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [9]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [10]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.
The long-awaited restructuring brings to an end months of wrangling between Eurotunnel and th


In [11]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'on', 'Monday', 'announced', 'a', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion']


In [12]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'company', 'return', 'wiping', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'The', 'long-awaited', 'restructuring']


In [13]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'the', 'long-await', 'restructur']


In [14]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


In [15]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


### Wrap into a function to be used by NLTK

In [16]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [23]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
  print(f"{'':<20}", end="")
  for fileName in fileNames:
    print(f"{fileName:<20}", end="")
  print()


  for i, term in enumerate(term):
      print(f"{term:<10}\t|  ", end="")
      for fileVal in values[:, i]:
          print(f"{fileVal:<20.12f}", end="")
      print()


In [24]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = '/content/inputData/tfidf_errors.txt'
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write(f"{'':<20}")
        for fileName in fileNames:
            f.write(f"{fileName:<20}")
        f.write("\n")

        for i, term in enumerate(term):
            f.write(f"{term:<10}\t|  ")
            for fileVal in values[:, i]:
                f.write(f"{fileVal:<20.12f}")
            f.write("\n")


In [25]:
# TODO: modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    #print(cosine_similarity(tfs[0], tfs[1]))
    cosine_sim = cosine_similarity(tfs)
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")

    print(f"{'Files':<20}", end="")
    for fileName in fileNames:
        print(f"{fileName:<20}", end="")
    print()

    for i, fileName in enumerate(fileNames):
        print(f"{fileName:<20}", end="")
        for j in range(len(fileNames)):
            print(f"{cosine_sim[i][j]:<20.12f}", end="")
        print()


    print("\n\n=============================================================================================\n")

In [26]:
def calc_and_write_CosineSimilarity_for_all(tfs, fileNames):
    filePath = '/content/inputData/cosine_similarity_errors.txt'
    cosine_sim = cosine_similarity(tfs)

    with open(filePath, 'w', encoding='utf-8') as f:
        f.write("\n\n========COSINE SIMILARITY====================================================================\n")

        f.write(f"{'Files':<20}")
        for fileName in fileNames:
            f.write(f"{fileName:<20}")
        f.write("\n")

        for i, fileName in enumerate(fileNames):
            f.write(f"{fileName:<20}")
            for j in range(len(fileNames)):
                f.write(f"{cosine_sim[i][j]:<20.12f}")
            f.write("\n")

        f.write("\n\n=============================================================================================\n")


## Wrap Everything into `Main()`

In [27]:
def main(printResults=True):
    baseFolderPath = "/content/inputData"

    fileNames, filePathList = returnListOfFilePaths(baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # calculate tfidf
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names_out()

    if printResults:
        # print results
        print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        # write results to file
        write_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames)

In [28]:
main()

                    f3.txt              f1.txt              f2.txt              
'm        	|  0.000000000000      0.093773490261      0.000000000000      
's        	|  0.186300933135      0.110768267918      0.177939521378      
1.0       	|  0.037260186627      0.027692066980      0.035587904276      
1.56      	|  0.047979274336      0.000000000000      0.091651812664      
1.6       	|  0.000000000000      0.046886745130      0.000000000000      
1.85      	|  0.000000000000      0.046886745130      0.000000000000      
10        	|  0.037260186627      0.027692066980      0.035587904276      
10.40     	|  0.037260186627      0.027692066980      0.035587904276      
113.5     	|  0.037260186627      0.027692066980      0.035587904276      
13.6      	|  0.000000000000      0.000000000000      0.060255559786      
130       	|  0.037260186627      0.027692066980      0.035587904276      
14.1      	|  0.000000000000      0.000000000000      0.060255559786      
150       	|  0.000