<a href="https://colab.research.google.com/github/arashkhgit/DataScience-cheat-sheet/blob/main/NLP_Tokenized_pure_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import re

# Stop-word lists keyed by lowercase language name. The order of each list is
# the order in which the words appear in the stop-word table of the report.
stop_words = {
    "english": [
        "a", "an", "and", "the",
        "in", "on", "at", "of", "for", "to",
        "is", "are", "was", "were",
        "it", "that", "this",
        # Extend with further English stop words here.
    ],
    "spanish": [
        "un", "una", "y", "en", "el", "la", "los",
        "de", "para", "a",
        "es", "son", "fue", "fueron",
        "eso", "este",
        # Extend with further Spanish stop words here.
    ],
    # Register additional languages as new lowercase keys.
}

def tokenize_text(text):
    """Return the lowercase word tokens of *text*, in order of appearance.

    The text is lowercased first; a token is then any maximal run of regex
    word characters (letters, digits and underscores). Everything else acts
    as a separator and is discarded.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

def analyze_text(text, language="english", output_file="output.txt"):
    """Write a word-frequency report for *text* to *output_file* and print it.

    The report contains, in order:
      1. a table of every distinct token with its length, count and a
         ``*`` histogram, sorted by descending count, then ascending length,
         then alphabetically;
      2. summary statistics (unique words, average/max/min token length,
         most and least frequent word);
      3. a table of stop-word counts for *language*, or a notice when no
         stop-word list exists for that language.

    Args:
        text: The text to analyse.
        language: Key into the module-level ``stop_words`` dict; matched
            case-insensitively.
        output_file: Path of the report file; it is overwritten.

    Side effects:
        Rebinds the module-level ``word_occurrences`` dict (token -> count),
        writes *output_file*, and prints the finished report to stdout.

    Text without any tokens produces a report with zeroed statistics and
    ``N/A`` for the most/least frequent word instead of raising.
    """
    global word_occurrences
    tokens = tokenize_text(text)

    # Rebound on every call: code outside this function reads the counts
    # through the module-level name after the call returns.
    word_occurrences = {}
    for word in tokens:
        word_occurrences[word] = word_occurrences.get(word, 0) + 1

    # Most frequent first; ties broken by shorter word, then alphabetically.
    sorted_occurrences = sorted(word_occurrences.items(), key=lambda item: (-item[1], len(item[0]), item[0]))

    num_unique_words = len(word_occurrences)
    if tokens:
        token_lengths = [len(word) for word in tokens]
        average_word_length = sum(token_lengths) / len(token_lengths)
        max_word_length = max(token_lengths)
        min_word_length = min(token_lengths)
        mode_word = sorted_occurrences[0][0]
        least_word = sorted_occurrences[-1][0]
    else:
        # No tokens at all: avoid dividing by zero and calling max()/min()
        # on an empty sequence.
        average_word_length = 0.0
        max_word_length = min_word_length = 0
        mode_word = least_word = "N/A"

    language = language.lower()

    # Explicit UTF-8 so tokens with non-ASCII letters can always be written,
    # regardless of the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as file:
        # Word table with histogram
        file.write("{:<15} {:<10} {:<10} {:<30}\n".format("Word", "Length", "Occurrences", "Histogram"))
        file.write("=" * 55 + "\n")
        for word, occurrences in sorted_occurrences:
            histogram = "*" * occurrences
            file.write("{:<15} {:<10} {:<10} {:<30}\n".format(word, len(word), occurrences, histogram))

        # Summary statistics
        file.write("\nStatistical Analysis:\n")
        file.write("Number of Unique Words: {}\n".format(num_unique_words))
        file.write("Average Length of Words: {:.2f}\n".format(average_word_length))
        file.write("Maximum Length of Words: {}\n".format(max_word_length))
        file.write("Minimum Length of Words: {}\n".format(min_word_length))
        file.write("Mode of Words: {}\n".format(mode_word))
        file.write("Least Occurring Word: {}\n".format(least_word))

        # Stop-word table, or a notice when the language is unknown. No early
        # return here, so the report is still printed below in both cases.
        if language in stop_words:
            stop_word_occurrences = {word: word_occurrences.get(word, 0) for word in stop_words[language]}
            separator = "=" * 25
            file.write(f"\n{separator} Stop Words for {language.capitalize()} {separator}\n")
            file.write("{:<15} {:<10}\n".format("Stop Word", "Occurrences"))
            file.write("=" * 25 + "\n")
            for word, occurrences in stop_word_occurrences.items():
                file.write("{:<15} {:<10}\n".format(word, occurrences))
        else:
            file.write(f"\nStop words not available for the '{language}' language.\n")

    # Read the finished report back and echo it to stdout.
    with open(output_file, "r", encoding="utf-8") as file:
        output_content = file.read()
        print(output_content)

def get_user_choice(message, options):
    """Prompt with *message* until the lowercased reply is one of *options*.

    Each rejected reply prints an error line and the prompt is shown again.
    Returns the accepted reply in lowercase.
    """
    reply = input(message).lower()
    while reply not in options:
        print("Invalid input. Please choose a valid option.")
        reply = input(message).lower()
    return reply

def get_user_input(message, variable_type):
    """Prompt with *message* until the reply converts to *variable_type*.

    Args:
        message: Prompt text passed to ``input``.
        variable_type: A callable that converts the reply string, e.g.
            ``int``, ``float`` or ``str``. ``str`` returns the reply
            unchanged.

    Returns:
        The converted reply.

    A reply for which the conversion raises ``ValueError`` prints an error
    line and the prompt is shown again.
    """
    while True:
        user_input = input(message)
        try:
            # Convert with the requested type itself so any converter works,
            # not only int and str (other types used to re-prompt forever).
            return variable_type(user_input)
        except ValueError:
            print("Invalid input. Please enter a valid value.")

# Script entry point: read the input file, write and print the report, then
# let the user look up words by length or by number of occurrences.
if __name__ == "__main__":
    file_path = "/content/file.txt"
    output_file = "output.txt"
    try:
        # Read everything up front so the input file is closed before the
        # interactive prompts start, rather than staying open throughout.
        with open(file_path, "r") as file:
            file_contents = file.read()

        # Get user input for the desired language
        language = get_user_choice("Choose a language (english/spanish): ", ["english", "spanish"])

        # analyze_text writes the report and fills the module-level
        # word_occurrences dict that the lookups below rely on.
        analyze_text(file_contents, language=language, output_file=output_file)

        # Get user input for the search criterion (word length or occurrences)
        choice = get_user_choice("Choose your search option (word_length/occurrences): ", ["word_length", "occurrences"])

        if choice == "word_length":
            target_word_length = get_user_input("Enter the desired word length: ", int)
            # Words whose length equals the requested length
            matching_words = [word for word in word_occurrences if len(word) == target_word_length]
        else:
            target_occurrences = get_user_input("Enter the desired number of occurrences: ", int)
            # Words that occur exactly the requested number of times
            matching_words = [word for word, occurrences in word_occurrences.items() if occurrences == target_occurrences]

        # Display the matching words
        if matching_words:
            print("Matching words:")
            for word in matching_words:
                print(word)
        else:
            print("No words match the given criteria.")

        print("Analysis completed. Output saved to:", output_file)

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except IOError:
        print(f"Error reading the file: {file_path}")


Choose a language (english/spanish): english
Word            Length     Occurrences Histogram                     
is              2          2          **                            
my              2          2          **                            
a               1          1          *                             
hi              2          1          *                             
for             3          1          *                             
nlp             3          1          *                             
file            4          1          *                             
name            4          1          *                             
task            4          1          *                             
text            4          1          *                             
this            4          1          *                             
arash           5          1          *                             
sample          6          1          *                  