<a href="https://colab.research.google.com/github/AndrejsPetrovs/NLP_hw/blob/main/InappropriateLanguageChecker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/AndrejsPetrovs/NLP_hw/main/en2.txt

In [None]:
!pip install nltk

In [None]:
import nltk

# Fetch the 'punkt' tokenizer data; nltk.word_tokenize (used by the checks
# below) needs it to be available locally.
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Stemmer applied both to the reference word list and to every checked token,
# so that inflected forms compare equal.
stemmer = nltk.stem.PorterStemmer()

wordlistFile = "en2.txt"
# One set of stemmed words per category; index i corresponds to categories[i].
wordlist = [set() for _ in range(5)]

# Lower-cased words that the checks skip. It is a plain module-level name
# (a `global` statement has no effect at module scope); functions that
# rebind it declare `global ignorelist` themselves.
ignorelist = set()

categories = ["Completely inappropriate", "Violence/Crime", "Narcotics", "Potentially sensitive topics", "Potentially inappropriate"]

# Fill word lists with the stemmed words, based on the file (using sets for optimization purposes).
# Each non-empty line is expected to hold "<word> <category index>".
with open(wordlistFile, encoding="utf8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing empty line) carry no entry.
            continue
        wordlist[int(fields[1])].add(stemmer.stem(fields[0]))

def checktxt(txt:str):
    """Scan a string for words whose stem occurs in the category word lists.

    The text is lower-cased and tokenized with nltk. Tokens of a single
    character are not counted; tokens present in ``ignorelist`` are counted
    but never flagged.

    Returns:
        A tuple ``(wordcount, res)``: the number of counted tokens, and a
        list of five sets holding the flagged tokens for each category.
    """
    total = 0
    flagged = [set() for _ in range(5)]

    for token in nltk.word_tokenize(txt.lower()):
        if len(token) <= 1:
            continue
        total += 1
        if token in ignorelist:
            continue

        # Compare by stem, but record the token as it appeared in the text.
        stem = stemmer.stem(token)
        for known, bucket in zip(wordlist, flagged):
            if stem in known:
                bucket.add(token)

    return total, flagged

def checkfile(filename):
    """Scan a UTF-8 text file for words whose stem occurs in the category word lists.

    The file is processed line by line through ``checktxt`` and the per-line
    results are merged.

    Args:
        filename: path of the file to check.

    Returns:
        A tuple ``(wordcount, res)`` with the same meaning as the result of
        ``checktxt``: the number of counted tokens in the whole file, and a
        list of five sets holding the flagged tokens for each category.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    wordcount = 0
    res = [set() for _ in range(5)]

    with open(filename, encoding="utf8") as f:
        for line in f:
            # Delegating to checktxt keeps file and raw-text checks in
            # agreement: a word is recorded under every category whose list
            # contains its stem, so ignoring one category in evaluate()
            # cannot hide a match in another category.
            line_count, line_res = checktxt(line)
            wordcount += line_count
            for merged, found in zip(res, line_res):
                merged.update(found)

    return wordcount, res

def ignore(words:str):
    """Add the whitespace-separated words of *words* to the ignore list.

    Each word is lower-cased first; words of a single character are skipped.
    """
    candidates = words.strip().lower().split()
    # The shared set is mutated in place, so no rebinding is involved.
    ignorelist.update(word for word in candidates if len(word) > 1)

# Analyze results received from checktxt or checkfile
def evaluate(result, ignoreCategories=None, printout=True, showWords=True):
    """Summarise a result from checktxt/checkfile and give a verdict.

    Args:
        result: ``(wordcount, words)`` where ``words`` is a list of five
            sets of flagged words, one per category.
        ignoreCategories: five booleans; a True entry excludes that category
            from the verdict and from the printout. ``None`` (the default)
            checks every category.
        printout: when True, print the statistics.
        showWords: when printing, also list the flagged words per category.

    Returns:
        True if at least one word is flagged in a non-ignored category,
        False otherwise.
    """
    if ignoreCategories is None:
        # Fresh list per call instead of a shared mutable default.
        ignoreCategories = [False] * 5

    wordcount = result[0]
    words = result[1]
    counts = [len(words[i]) for i in range(5)]
    active = [i for i in range(5) if not ignoreCategories[i]]

    # Count distinct flagged words: a word listed under several categories
    # must not be counted more than once in the overall figure.
    badcount = len(set().union(*(words[i] for i in active)))

    def percent(count):
        # Truncated to two decimals; guards against an empty text.
        return int(count / wordcount * 10000) / 100 if wordcount else 0.0

    if printout:
        if badcount == 0:
            print("No inappropriate words found in the text.\n")
        else:
            print(f"{badcount} out of {wordcount} meaningful words ({percent(badcount)} %) are inappropriate.\n")

            for i in active:
                print(f"{counts[i]} out of {wordcount} meaningful words ({percent(counts[i])} %) are in the category \"{categories[i]}\"")
                if showWords and counts[i] > 0:
                    print("Words:", end=" ")
                    for w in words[i]:
                        print(w, end=" ")
                    print()
                print()
        print()

    # Returns True if text is inappropriate (based on words from non-ignored categories), False if not inappropriate
    return badcount != 0


In [None]:
# Basic CLI program
def main():
    """Run the interactive text menu of the inappropriate language checker.

    Repeatedly prints the menu and reads a command from standard input until
    the user enters "0". The commands check a line of text or a file, toggle
    which categories are ignored, edit the ignore list, and switch the
    verbosity of the printed results.
    """
    global ignorelist  # option 4 rebinds the module-level set when resetting
    ignoreCategories = [False, False, False, False, False]
    printout = True
    printWords = False
    greetingstxt = """
Please choose one of the following options, and enter the corresponding number:
    0: Quit
    1: Check raw text (single line)
    2: Check a file
    3: Change ignored categories
    4: Edit ignored word list
    5: Edit result verbosity settings
"""

    print("Welcome to inappropriate language checker!")
    cmd = "1"

    while cmd != "0":
        print(greetingstxt)
        cmd = input().strip()
        if cmd == "1":
            print("Enter the text:")
            inp = input()
            print()
            if evaluate(checktxt(inp), ignoreCategories=ignoreCategories, printout=printout, showWords=printWords):
                print("Inappropriate")
            else:
                print("Not inappropriate")

        elif cmd == "2":
            print("Enter file name:")
            inp = input()
            print()
            # Only failures to open or decode the file are reported as such;
            # anything else is a real error and must not be hidden.
            try:
                result = checkfile(inp)
            except (OSError, UnicodeError):
                print("Could not open file ", inp)
            else:
                if evaluate(result, ignoreCategories=ignoreCategories, printout=printout, showWords=printWords):
                    print("Inappropriate")
                else:
                    print("Not inappropriate")

        elif cmd == "3":
            print("\nCurrent category statuses:")
            for i in range(5):
                print(i, ": ", categories[i], end=": ")
                if ignoreCategories[i]:
                    print("ignored")
                else:
                    print("checked")
            print("\nEnter a number (0-4) to change the status of that category: ", end="")
            try:
                n = int(input())
            except ValueError:
                n = None
            else:
                print()
            # Explicit range check: a negative number would otherwise index
            # the list from the end and toggle the wrong category.
            if n is not None and 0 <= n < len(ignoreCategories):
                ignoreCategories[n] = not ignoreCategories[n]
                print(f"Category {n} changed!")
            else:
                print("Incorrect input. Failed to change the category.")

        elif cmd == "4":
            print("Enter words to add to ignore list (separated by spaces), or press Enter to reset the ignore list")
            inp = input()
            if len(inp) > 0:
                ignore(inp)
                print("Words added to ignore list!")
            else:
                ignorelist = set()
                print("Ignore list reset!")

        elif cmd == "5":
            print(f"Current settings:\nBasic stats output on: {printout}\nFlagged word output on: {printWords}")
            print("Enter 1 to switch basic stats output, 2 to switch flagged word output")
            inp = input().strip()
            if inp == "1":
                printout = not printout
                print("Basic stats output status switched")
            elif inp == "2":
                printWords = not printWords
                print("Flagged words output status switched")
            else:
                print("Unrecognised command")

        elif cmd != "0":
            print("Unknown command\n")


In [None]:
# Start the interactive menu; it returns once the user enters "0".
main()