In [4]:
import kagglehub
import os
from collections import Counter
import string
import re

dataset_path = kagglehub.dataset_download("crawford/20-newsgroups")
print("Dataset downloaded to:", dataset_path)

Dataset downloaded to: /home/catalin/.cache/kagglehub/datasets/crawford/20-newsgroups/versions/1


In [5]:
files = os.listdir(dataset_path)
file_paths = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if os.path.isfile(os.path.join(dataset_path, f))]
print(files)


['misc.forsale.txt', 'comp.windows.x.txt', 'rec.sport.baseball.txt', 'talk.religion.misc.txt', 'sci.space.txt', 'talk.politics.mideast.txt', 'rec.sport.hockey.txt', 'comp.sys.mac.hardware.txt', 'talk.politics.guns.txt', 'comp.graphics.txt', 'comp.sys.ibm.pc.hardware.txt', 'talk.politics.misc.txt', 'comp.os.ms-windows.misc.txt', 'rec.motorcycles.txt', 'sci.electronics.txt', 'sci.med.txt', 'list.csv', 'alt.atheism.txt', 'rec.autos.txt', 'sci.crypt.txt', 'soc.religion.christian.txt']


In [6]:
def analyze_text_files_full_metrics_sorted(file_list):
    """
    Opens each file, counts words and visible characters, and returns
    individual character counts sorted by frequency.

    Args:
        file_list (list): A list of file paths.

    Returns:
        tuple: Total word count, total visible char count, and individual
               character counts sorted by frequency (descending).
    """
    all_character_counts = Counter()
    total_visible_char_count = 0
    total_word_count = 0
    visible_characters = set(string.printable) - set(string.whitespace)
    visible_characters.add(' ')

    for file_path in file_list:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                for char in content:
                    if char in visible_characters:
                        all_character_counts[char] += 1
                        total_visible_char_count += 1
                words = re.findall(r'\b\w+\b', content.lower())
                total_word_count += len(words)
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    content = file.read()
                    for char in content:
                        if char in visible_characters:
                            all_character_counts[char] += 1
                            total_visible_char_count += 1
                    words = re.findall(r'\b\w+\b', content.lower())
                    total_word_count += len(words)
            except Exception as e:
                raise Exception(f"Error reading {file_path} with 'latin-1': {e}")
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found: {file_path}") from e
        except Exception as e:
            raise Exception(f"Error reading {file_path}: {e}")

    # Sort the character counts by value (frequency) in descending order
    sorted_char_counts = sorted(all_character_counts.items(), key=lambda item: item[1], reverse=True)
    return total_word_count, total_visible_char_count, dict(sorted_char_counts)

In [7]:
try:
    word_count, total_char_count, individual_char_counts = analyze_text_files_full_metrics_sorted(file_paths)
    print(f"Total Word Count: {word_count}")
    print(f"Total Visible Character Count: {total_char_count}")
    print("\nIndividual Character Counts:")
    for char, count in individual_char_counts.items():
        print(f"Character '{char}': {count}")
except (FileNotFoundError, UnicodeDecodeError, Exception) as error:
    print(f"An error occurred: {error}")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/tmp/ipykernel_99689/734617743.py", line 22, in analyze_text_files_full_metrics_sorted
    content = file.read()
              ^^^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 11597: invalid start byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/catalin/dev/python/ErgoType/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3699, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_99689/2760208557.py", line 2, in <module>
    word_count, total_char_count, individual_char_counts = analyze_text_files_full_metrics_sorted(file_paths)
                                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_99689/734617743.py", line None, in analyze_text_files_full_metrics_sorted
K

In [None]:
def get_key_presses_detailed(char_counts):
    """
    Simulates key presses for each character and counts the occurrences of
    each key press combination (lowercase, SHIFT+key, SPACE).

    Args:
        char_counts (dict): A dictionary where keys are characters and values are their counts.

    Returns:
        dict: A dictionary where keys are the key press representations
              (e.g., 'a', 'SHIFT+A', 'SPACE') and values are their counts.
    """
    key_press_counts = Counter()

    for char, count in char_counts.items():
        if 'a' <= char <= 'z':
            key_press = char
        elif 'A' <= char <= 'Z':
            key_press = "SHIFT"
        elif char == ' ':
            key_press = "SPACE"
        elif char in "!@#$%^&*()_+{}|:\"<>?~`":
            key_press = "SHIFT"
        elif char in "[]\\;',./-=":
            key_press = char
        elif char.isdigit():
            key_press = char
        else:
            key_press = f"OTHER+{char}"  # Handle characters not on standard keyboard

        key_press_counts[key_press] += count

    return dict(key_press_counts)

In [None]:
key_presses = get_key_presses_detailed(individual_char_counts)

In [None]:
for key, count in key_presses.items():
    print(f"Key: {key:>5}, Percentage: {(count/total_char_count):.5f}, Percentage (no SPACE,SHIFT): {(count/(total_char_count-key_presses['SPACE']-key_presses['SHIFT'])):.5f}")

In [None]:
print(key_presses)


In [None]:
import matplotlib.pyplot as plt

# Remove 'SPACE' and 'SHIFT'
filtered = {k: v for k, v in key_presses.items() if k not in ('SPACE', 'SHIFT')}

# Convert to percentage
total = sum(filtered.values())
percentages = {k: (v / total) * 100 for k, v in filtered.items()}

# Sort and select top 20
sorted_items = sorted(percentages.items(), key=lambda x: x[1], reverse=True)[:10]
keys, values = zip(*sorted_items)

# Plot
plt.figure(figsize=(12, 6))
plt.bar(keys, values, color='steelblue')
plt.title('Top 20 cele mai frecvente taste apăsate (%)')
plt.xlabel('Tastă')
plt.ylabel('Frecvență (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
sorted_items