In [None]:
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

import nltk
import pandas as pd

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Directory whose files are the documents to compare. It is passed to
# returnListOfFilePaths, which picks up only the regular files directly inside it.
BASE_INPUT_DIR = "/content/inputData"

## Preprocess Data

#### File information

In [None]:
def returnListOfFilePaths(folderPath):
    """List the regular files located directly inside ``folderPath``.

    The directory is scanned exactly once so the returned names and paths are
    guaranteed to line up index-for-index, and entries are sorted by name
    because ``os.listdir`` yields them in arbitrary order.

    Args:
        folderPath: Directory to scan. Sub-directories are skipped, not
            descended into.

    Returns:
        A two-item list ``[fileNames, filePaths]`` where ``filePaths[i]`` is
        ``join(folderPath, fileNames[i])``. Both inner lists are empty when the
        directory holds no regular files.

    Raises:
        OSError: Propagated from ``listdir`` when ``folderPath`` cannot be read.
    """
    fileNames = []
    filePaths = []
    for fileName in sorted(listdir(folderPath)):
        filePath = join(folderPath, fileName)
        if isfile(filePath):  # ignore sub-directories and other non-file entries
            fileNames.append(fileName)
            filePaths.append(filePath)
    return [fileNames, filePaths]

# Collect two parallel lists for the input folder: bare file names, then full paths.
fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
# Show both lists to confirm the expected documents were found.
print(fileNames, "\n", filePaths)

['f3.txt', 'f1.txt', 'f2.txt'] 
 ['/content/inputData/f3.txt', '/content/inputData/f1.txt', '/content/inputData/f2.txt']


In [None]:
# Get document contents
def create_docContentDict(filePaths, encoding="utf-8"):
    """Read every file in ``filePaths`` fully into memory.

    Args:
        filePaths: Iterable of paths to text files.
        encoding: Text encoding used to decode each file. Stated explicitly
            (default UTF-8) so the result does not depend on the platform's
            locale; pass another codec for differently encoded corpora.

    Returns:
        A dict mapping each path to that file's complete text, with keys in the
        same order as ``filePaths``.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r", encoding=encoding) as ifile:
            rawContentDict[filePath] = ifile.read()
    return rawContentDict
# Load every input document once; the dict is keyed by file path.
rawContentDict = create_docContentDict(filePaths)
# NOTE: this prints the complete text of every document, so the output can be very long.
print(rawContentDict)

{'/content/inputData/f3.txt': 'Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.\nThe long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 banks to which it owes nearly nine billion pounds.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of insolvency but leaves shareholders owning only 54.5 percent of the company.\n"The restructuring plan provides Eurotunnel with the medium term financial stability to allow it to consolidate its substantial commercial achievements to date and to develop its operations," Eurotunnel co-chairman Alastair Morton said.\nThe firm was now making a profit before interest, he added.\nAlthough shareholders will see their interests diluted, they were offered the prospect of a brighter future after months of uncertainty 

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to:
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [None]:
def tokenizeContent(contentsRaw):
    """Split a raw text string into a list of tokens.

    This is a thin wrapper that delegates to ``nltk.tokenize.word_tokenize``
    and returns its result unchanged.
    """
    return nltk.tokenize.word_tokenize(contentsRaw)

#### Remove Stop words

In [None]:
def removeStopWordsFromTokenized(contentsTokenized):
    """Drop English stop words from a token list, ignoring letter case.

    Args:
        contentsTokenized: Iterable of string tokens.

    Returns:
        A new list holding the tokens that are not English stop words, in their
        original order and original casing.

    Note:
        Matching is done on the lowercased token, so capitalised stop words
        (for example a sentence-initial "The") are removed as well. Tokens that
        only differ from a stop word by case, such as the acronym "IT", are
        therefore also dropped.
    """
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    # The stop-word list is lowercase; comparing the raw token would let
    # capitalised stop words slip through and end up in the final vocabulary.
    return [word for word in contentsTokenized if word.lower() not in stop_word_set]

#### Stemming

In [None]:
def performPorterStemmingOnContents(contentsTokenized):
    """Apply NLTK's Porter stemmer to every token.

    A fresh ``nltk.stem.PorterStemmer`` is created on each call and its
    ``stem`` method is mapped over the tokens; the results come back as a new
    list in the same order.
    """
    stemToken = nltk.stem.PorterStemmer().stem
    return list(map(stemToken, contentsTokenized))

#### Remove Punctuation

In [None]:
def removePunctuationFromTokenized(contentsTokenized):
    """Filter out tokens that consist solely of punctuation.

    A token is dropped when it equals one of the single characters in
    ``string.punctuation`` or one of three extra two-character tokens
    (``''``, ``--`` and a double backtick). Tokens that merely contain
    punctuation, such as ``anglo-french`` or ``1.56``, are kept.
    """
    # Two-character tokens that string.punctuation cannot cover on its own
    # (presumably emitted by the tokenizer for quotes and dashes).
    punctuationTokens = set(string.punctuation) | {"''", "--", "``"}

    keptTokens = []
    for token in contentsTokenized:
        if token in punctuationTokens:
            continue
        keptTokens.append(token)
    return keptTokens

#### Convert terms to lowercase

In [None]:
def convertItemsToLower(contentsRaw):
    """Return a new list with ``.lower()`` applied to each item of ``contentsRaw``."""
    loweredTerms = []
    for term in contentsRaw:
        loweredTerms.append(term.lower())
    return loweredTerms

### Test that functions are working as expected

In [None]:
# Use the first listed document as the sample input for the step-by-step checks that follow.
# TODO: may need to make a copy of this here
# (the value is a str read from the file, and Python strings are immutable,
# so later steps cannot modify the stored text in place)
content_test = rawContentDict[filePaths[0]]

# Visually inspect the first 300 characters.
print(content_test[:300])

Anglo-French Channel Tunnel operator Eurotunnel on Monday announced a deal giving creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt mountain.
The long-awaited restructuring brings to an end months of wrangling between Eurotunnel and th


In [None]:
# Fetch the 'punkt' tokenizer data before tokenizeContent is exercised below.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Test tokenization on the sample document.
content_test_tokenized = tokenizeContent(content_test)

# Visually inspect the first 30 tokens; punctuation such as "(" and "$" is
# still present as separate tokens at this stage.
print(content_test_tokenized[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'on', 'Monday', 'announced', 'a', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion']


In [None]:
# Fetch the stop-word corpus that removeStopWordsFromTokenized reads via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Test stop-word removal on the tokenized sample.
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# Visually inspect the first 30 remaining tokens.
print(content_test_rmStop[:30])

['Anglo-French', 'Channel', 'Tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'deal', 'giving', 'creditor', 'banks', '45.5', 'percent', 'company', 'return', 'wiping', 'one', 'billion', 'pounds', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'The', 'long-awaited', 'restructuring']


In [None]:
# Test stemming on the stop-word-filtered tokens.
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# Visually inspect the first 30 stems; in the recorded run the stems come back
# lowercased (e.g. 'Anglo-French' -> 'anglo-french').
print(content_test_stemmed[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '(', '$', '1.56', 'billion', ')', 'debt', 'mountain', '.', 'the', 'long-await', 'restructur']


In [None]:
# Test punctuation removal on the stemmed tokens.
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# Visually inspect the first 30 tokens; standalone tokens like '(' and '.' should be gone.
print(content_test_cleaned[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


In [None]:
# Test lowercase conversion. In the recorded run the stemmed tokens were already
# lowercase, so the printed list matches the previous step's output.
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['anglo-french', 'channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'deal', 'give', 'creditor', 'bank', '45.5', 'percent', 'compani', 'return', 'wipe', 'one', 'billion', 'pound', '1.56', 'billion', 'debt', 'mountain', 'the', 'long-await', 'restructur', 'bring', 'end', 'month', 'wrangl']


### Wrap into a function to be used by NLTK

In [None]:
# process data without writing inspection file information to file
def processData(rawContents):
    """Turn one raw document string into a list of cleaned tokens.

    The text is passed through five stages, each receiving the previous
    stage's output: tokenize, remove stop words, Porter-stem, remove
    punctuation tokens, lowercase. Lowercasing runs last, so the stop-word
    stage receives tokens in their original casing.
    """
    pipeline = (
        tokenizeContent,
        removeStopWordsFromTokenized,
        performPorterStemmingOnContents,
        removePunctuationFromTokenized,
        convertItemsToLower,
    )
    cleaned = rawContents
    for stage in pipeline:
        cleaned = stage(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calculate and output results

In [None]:
import pandas as pd
def print_TFIDF_for_all(terms, values, fileNames):
    """Print the TF-IDF scores as a table with one row per file and one column per term.

    Args:
        terms: Column labels (the vocabulary).
        values: 2-D score data whose rows correspond to ``fileNames`` and whose
            columns correspond to ``terms``.
        fileNames: Row labels.

    A ``ValueError`` raised while building or printing the table (for example
    when the labels do not match the shape of ``values``) is caught and
    reported on stdout instead of propagating.
    """
    try:
        scoreTable = pd.DataFrame(values, index=fileNames, columns=terms)
        print("\n========TF-IDF Table========")
        print(scoreTable.to_string())
        print("============================\n")
    except ValueError as err:
        print(f"Error while creating DataFrame: {err}")

In [None]:
def write_TFIDF_for_all(terms, values, fileNames, filePath):
    """Write the TF-IDF scores to ``filePath`` as a text table.

    The table has one row per term and one column per file.

    Args:
        terms: Vocabulary labels, one per column of ``values``.
        values: 2-D score data with one row per file and one column per term
            (the same layout ``print_TFIDF_for_all`` receives).
        fileNames: Document labels, one per row of ``values``.
        filePath: Destination file; overwritten if it already exists.

    Raises:
        ValueError: If the labels do not match the shape of ``values``.
        OSError: If ``filePath`` cannot be opened for writing.
    """
    # values is documents x terms, so label it that way first and then
    # transpose to get terms as rows. Labelling the untransposed data with
    # index=terms / columns=fileNames fails whenever the counts differ.
    tfidf_df = pd.DataFrame(values, index=fileNames, columns=terms).T
    # Explicit UTF-8 so non-ASCII terms can be written regardless of locale.
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write("\n========TF-IDF Table========\n")
        f.write(tfidf_df.to_string())
        f.write("\n============================\n")
    print(f"TF-IDF values written to {filePath}")

In [None]:
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    """Compute pairwise cosine similarity for ``tfs`` and print it as a table.

    ``cosine_similarity(tfs)`` is labelled with ``fileNames`` on both axes and
    printed between two banner lines.
    """
    similarity = cosine_similarity(tfs)
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    similarityTable = pd.DataFrame(similarity, index=fileNames, columns=fileNames)
    print(similarityTable.to_string())
    print("\n=============================================================================================\n")


In [None]:
def calc_and_write_CosineSimilarity_for_all(tfs, fileNames, filePath):
    """Compute pairwise cosine similarity for ``tfs`` and write it to ``filePath``.

    The similarity matrix is labelled with ``fileNames`` on both axes and
    written between a header and a footer line; ``filePath`` is overwritten.
    A confirmation message is printed once the file has been closed.
    """
    similarity = cosine_similarity(tfs)
    with open(filePath, 'w') as outFile:
        outFile.write("\n========COSINE SIMILARITY========\n")
        similarityTable = pd.DataFrame(similarity, index=fileNames, columns=fileNames)
        outFile.write(similarityTable.to_string())
        outFile.write("\n===============================\n")
    print(f"Cosine Similarity values written to {filePath}")

## Wrap Everything into `main()`

In [None]:
# Print TF-IDF values in 'table' format
def main(printResults=True,
         inputDir="/content/inputData",
         tfidfOutPath="/content/tfidf_results.txt",
         cosineOutPath="/content/cosine_similarity_results.txt"):
    """Compute TF-IDF and pairwise cosine similarity for every file in a folder.

    Vectorisation uses ``TfidfVectorizer(stop_words='english')`` with its
    default tokenisation; the ``processData`` pipeline defined earlier in this
    notebook is not applied here.

    Args:
        printResults: When True, print both tables. When False, write them to
            ``tfidfOutPath`` and ``cosineOutPath`` instead.
        inputDir: Folder holding the documents to compare.
        tfidfOutPath: Destination of the TF-IDF table when ``printResults`` is
            False.
        cosineOutPath: Destination of the cosine-similarity table when
            ``printResults`` is False.

    Raises:
        ValueError: If ``inputDir`` contains no files (or, from scikit-learn,
            if the documents yield an empty vocabulary).
        OSError: If the folder or one of its files cannot be read.
    """
    fileNames, filePathList = returnListOfFilePaths(inputDir)
    if not filePathList:
        # Fail early with a message that names the folder rather than letting
        # the vectoriser complain about an empty vocabulary.
        raise ValueError(f"No files found in {inputDir!r}; nothing to compare.")

    rawContentDict = create_docContentDict(filePathList)
    # Feed the documents in filePathList order so that row i of the matrices
    # always belongs to fileNames[i].
    documents = [rawContentDict[filePath] for filePath in filePathList]

    tfidf = TfidfVectorizer(stop_words='english')
    tfs = tfidf.fit_transform(documents)
    tfs_Values = tfs.toarray()
    tfs_Terms = tfidf.get_feature_names_out()

    if printResults:
        print_TFIDF_for_all(tfs_Terms, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        write_TFIDF_for_all(tfs_Terms, tfs_Values, fileNames, tfidfOutPath)
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames, cosineOutPath)


In [None]:
# Run the full pipeline with the default printResults=True, so both tables are printed.
main()


              10       113        13       130        14      150       160      1997      200      2003     2004       225        24        39        40      400        45        54        56      6393       85  ability      able  acceptable  achievements     added   adding  admitted   afford     agree     ahead  alastair     allow  analyst  analysts     anglo    angry  announced  announcement   annual  approval  arranged    asked  available  avoiding   awaited      bank  bankruptcy     banks  benefits   billion    bonds  brighter    brings     brink  busiest    called   capped     cash  chairman   channel    choose  collapse     come  commercial  companies   company  complex  compromise  conference  considerable  considerably  consolidate  constitute  construction  converted    costs  creditor  creditors  crippling     cross  current      date      deal      debt    debts  december  depreciation  despite   details   develop  difficult   diluted  dividend  doomsday  dwindle     early