<a href="https://colab.research.google.com/github/FDDI-CentOS/data/blob/master/iAtk_Key_Word_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Internal Audit Tool Kit (iAtk): Key Word Analysis Tools**
The tools below perform basic frequency and iteration analysis on key words from GDOC and GSHEET files stored in Google Drive.

In [2]:
# Internal Auditor Tool Kit (iATK) v.1.2
# djarguello@ 5-26-19

# Setup runtime environment packages, only need to run once
!pip install pandas



In [0]:
import pandas as pd
import string
import re

def menu():
    """Show the iAtk banner, ask which tool to run, and launch it.

    Entering '1' runs the keyword analysis tool and '2' runs the unique
    word counter. Any other entry prints an error and a farewell message
    and then calls ``exit()``.
    """
    print("-----------------------------------------------------")
    print("--  Welcome to the Internal Audit Tool Kit (iAtk)  --")
    print("-----------------------------------------------------")

    selection = str(input("Please select which tool you would like to run: \n1) Keyword Analysis Tool \n2) Unique Word Counter Tool \n>>> "))

    # Reject anything that is not one of the two menu entries first.
    if selection not in ('1', '2'):
        print("{} is an invalid input, please rerun the program and try again.\n".format(selection))
        print("\nThank you for using the Internal Auditor Tool Kit (AKA i-Atk)!")
        exit()
    elif selection == '1':
        keyword_analysis()
    else:
        word_count_instances()

In [0]:
# Function to determine default or target source file location

def source():
    """Ask which file to analyze and return its path.

    The user either accepts the default path (/Users/Shared/import.txt) by
    answering Y, or answers N and types a path. The answer is matched
    case-insensitively.

    Returns:
        str: path of an existing file to import.

    Exits (via ``exit()``) after printing a message when the Y/N answer is
    not recognised or when the chosen file does not exist.
    """
    import os  # local import: only this function needs it

    handle = input("Use the default path to import your source file for analysis? (/Users/Shared/import.txt) Y/N:")
    choice = handle.upper()

    if choice == 'Y':
        path = "/Users/Shared/import.txt"
    elif choice == 'N':
        path = input(
            "Enter the path and file name you wish to import for analysis (Example: /Users/Shared/import.txt):")
    else:
        print("Unsupported selection, closing application please try again.")
        exit()

    # Building a path string never raises FileNotFoundError, so the file's
    # existence has to be checked explicitly; otherwise a bad path only
    # surfaces later as a traceback when the caller opens it.
    if not os.path.isfile(path):
        print("File or folder location not found.")
        exit()

    return path

In [0]:
# Function to determine default or target destination file locations

def destination():
    """Ask where the results should be written and return that path.

    The user either accepts the default path (/Users/Shared/output.txt) by
    answering Y, or answers N and types a path. The answer is matched
    case-insensitively. The output file itself does not have to exist yet,
    but the folder that will contain it must.

    Returns:
        str: path of the file the results will be written to.

    Exits (via ``exit()``) after printing a message when the Y/N answer is
    not recognised, when the path is empty, or when its folder does not
    exist.
    """
    import os  # local import: only this function needs it

    handle = input("Output your results to the default path? (/Users/Shared/output.txt) Y/N:")
    choice = handle.upper()

    if choice == 'Y':
        path = "/Users/Shared/output.txt"
    elif choice == 'N':
        path = input("Enter the path and file name you wish to save your output to (Example: /Users/Shared/output.txt):")
    else:
        print("Unsupported selection, closing application please try again.")
        exit()

    # Building a path string never raises FileNotFoundError, so check the
    # target folder explicitly. A bare file name means the current directory.
    folder = os.path.dirname(path) or os.curdir
    if not path or not os.path.isdir(folder):
        print("File or folder location not found.")
        exit()

    return path

In [0]:
# Keyword analysis function: searches the source file and writes every line containing the keyword to the output file

def _read_text_lines(text_path):
    """Return the lines of *text_path* with their trailing newline removed."""
    # Read-only mode: the file is never modified, so write access is not needed.
    with open(text_path, 'r') as text_file:
        return [raw_line.rstrip('\n') for raw_line in text_file]


def _write_keyword_report(lines, keyword, path, output):
    """Write the keyword summary and every matching line to *output*."""
    needle = keyword.lower()
    # enumerate() gives each line its own position, so repeated lines are
    # reported with their real index rather than that of the first copy.
    matches = [(index, line) for index, line in enumerate(lines)
               if needle in line.lower()]

    with open(output, 'w') as report:
        report.write("The keyword '{}', is referenced in {} of {} lines in the {} file.\n\n".format(
            keyword, len(matches), len(lines), path))
        for index, line in matches:
            report.write("{} {}\n".format(index, line))


def keyword_analysis():
    """Find every line of the source file that contains a keyword.

    Prompts for the source path, the output path and the keyword, then
    writes a report to the output path: one summary line with the number of
    matching lines, followed by each matching line prefixed with its
    0-based line index. Matching is case-insensitive.

    Supported sources are ``txt`` files, read line by line, and ``csv``
    files, which are loaded with pandas and re-written as '|'-separated
    text before being searched. The file type is taken from the last three
    characters of the path, compared case-insensitively. Any other type
    prints an "unsupported" message and writes no report.

    The function always finishes by calling ``exit()``.
    """
    path = source()
    output = destination()
    keyword = input("Enter the keyword you wish to search for: ")

    extension = path[-3:].lower()

    if extension == "txt":
        _write_keyword_report(_read_text_lines(path), keyword, path, output)

    elif extension == "csv":
        # The output path doubles as scratch space: the CSV is flattened to
        # '|'-separated text there, read back, then replaced by the report.
        pd.read_csv(path).to_csv(output, index=None, sep="|")
        _write_keyword_report(_read_text_lines(output), keyword, path, output)

    else:
        print("{} is an unsupported file type please try again.".format(path[-3:]))

    exit()


In [0]:
# Word count function to analyze the number of times each word is used in a doc

def word_count_instances():
    """Count how often each word occurs in the source file.

    Prompts for the source and output paths, splits the source text on
    whitespace, strips one leading and one trailing double quote from each
    token, and writes a report to the output path containing:

    * the 20 most common words, one ``word | count`` row each, and
    * every word with its count as ``word|count`` rows, sorted by word.

    Supported sources are ``txt`` files, read as UTF-8 with an optional
    BOM, and ``csv`` files, which are loaded with pandas and re-written as
    space-separated text before being counted. The file type is taken from
    the last three characters of the path, compared case-insensitively.
    Any other type prints an "unsupported" message and exits without
    touching the output file.

    On success the function prints where the results were written and then
    calls ``exit()``.
    """
    from collections import Counter

    path = source()
    output = destination()

    extension = path[-3:].lower()
    if extension not in ("txt", "csv"):
        print("{} is an unsupported file type please try again.".format(path[-3:]))
        exit()
        # Defensive: never fall through to the report writer with nothing
        # counted, even if exit() were to return instead of raising.
        return

    if extension == "csv":
        # The output path doubles as scratch space: the CSV is flattened to
        # space-separated text there, read back, then replaced by the report.
        pd.read_csv(path).to_csv(output, index=None, sep=" ")
        words_path = output
    else:
        words_path = path

    edge_quote = re.compile(r'^"|"$')
    with open(words_path, encoding='utf-8-sig') as words_file:
        stripped = (edge_quote.sub('', token) for token in words_file.read().split())
        # Tokens made only of quotes (e.g. an empty quoted CSV field) become
        # empty strings once stripped; they are not words, so skip them.
        words = [word for word in stripped if word]

    c = Counter(words)

    with open(output, 'w') as f:
        f.write("This Output File is '|' Delimited\n")
        f.write("Here are the top 20 most occurring words\n")
        f.write("Word | Count\n")
        for word, count in c.most_common(20):
            # Same layout as the header row above, not a tuple repr.
            f.write("{} | {}\n".format(word, count))
        f.write("\n------------------------------------------------------\n")
        f.write("\n\nBelow is the full count of all word instances in {}\n".format(path))
        f.write("Note the Raw Data Output is '|' Delimited\n\n")
        f.write("Word|Count\n")
        # sorted() returns a new list, so iterate over its result.
        for word, count in sorted(c.items()):
            f.write("{}|{}\n".format(word, count))

    print("Process complete! Please go to your {} file for results.".format(output))
    print("\nThank you for using the Internal Auditor Tool Kit (AKA i-Atk)!")
    exit()

In [0]:
# Entry point: show the tool-selection menu and launch the chosen tool.
# menu() and the tool functions it calls must already be defined when this runs.

menu()