# Building a Character-Level Language Model for Hangman Game
## Introduction

### Steps Overview
### Provide an overview of the key steps involved in building the language model.

## Dataset
### Describe the dataset used for training the model. Mention the source of the Hangman game dataset and the number of words available (e.g., 25,000 words).

## Data Preprocessing
### Explain the preprocessing steps performed on the dataset before training the model:

### Generating input sequences: Describe how input sequences were created from each word, where each sequence contains characters from the beginning of the word up to the previous-to-last ### character.
### Generating target characters: Explain how the target characters were derived from the last character of each input sequence.

### Character-to-Index Mapping
#### Details the creation of a character-to-index mapping:

## Data Splitting

#### Describing the train-test split with a specified ratio (e.g., 80% training, 20% testing).
#### Model Architecture
#### Provide an overview of the model architecture chosen for the language model:

In [1]:
### Importing the required libraries 

import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from sklearn.model_selection import train_test_split

### Importing the file that contains the words to train the model 

In [2]:

word_file_path = "words_250000_train.txt" 
with open(word_file_path, 'r') as file:
    words = [line.strip() for line in file]

In [3]:
len(words)

227300

In [4]:
import random
secret_word = random.choice(words)

In [5]:
guessed_letters = []

max_attempts = 6
attempts = 0

current_state = "_" * len(secret_word)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [7]:
import random
import time
import re
import collections

In [8]:
!pip install requests





In [9]:
from hangman_game import HangmanAPI

In [10]:
import time
import collections
import re
import requests
from urllib.parse import parse_qs

### Creating the HangmanAPI error Class

In [11]:
class HangmanAPIError(Exception):
    def __init__(self, result):
        self.result = result
        self.code = None
        try:
            self.type = result["error_code"]
        except (KeyError, TypeError):
            self.type = ""

        try:
            self.message = result["error_description"]
        except (KeyError, TypeError):
            try:
                self.message = result["error"]["message"]
                self.code = result["error"].get("code")
                if not self.type:
                    self.type = result["error"].get("type", "")
            except (KeyError, TypeError):
                try:
                    self.message = result["error_msg"]
                except (KeyError, TypeError):
                    self.message = result

        Exception.__init__(self, self.message)


### Creating a Hangman API class

In [12]:
class HangmanAPI(object):
    def __init__(self, access_token=None, session=None, timeout=None):
        self.hangman_url = self.determine_hangman_url()
        self.access_token = access_token
        self.session = session or requests.Session()
        self.timeout = timeout
        self.guessed_letters = []
        
        full_dictionary_location = "words_250000_train.txt"
        self.full_dictionary = self.build_dictionary(full_dictionary_location)        
        self.full_dictionary_common_letter_sorted = collections.Counter("".join(self.full_dictionary)).most_common()
        
        self.current_dictionary = []
        
    @staticmethod
    def determine_hangman_url():
        links = ['https://trexsim.com', 'https://sg.trexsim.com']

        data = {link: 0 for link in links}

        for link in links:
            requests.get(link)
            for i in range(10):
                s = time.time()
                requests.get(link)
                data[link] = time.time() - s

        link = sorted(data.items(), key=lambda x: x[1])[0][0]
        link += '/trexsim/hangman'
        return link
    
    def guess(self, word):
        clean_word = word[::2].replace("_", ".")
        len_word = len(clean_word)

        new_dictionary = [dict_word for dict_word in self.current_dictionary if len(dict_word) == len_word and re.match(clean_word, dict_word)]
        self.current_dictionary = new_dictionary

        full_dict_string = "".join(new_dictionary)
        c = collections.Counter(full_dict_string)
        sorted_letter_count = c.most_common()

        guess_letter = '!'
        for letter, instance_count in sorted_letter_count:
            if letter not in self.guessed_letters:
                guess_letter = letter
                break

        if guess_letter == '!':
            sorted_letter_count = self.full_dictionary_common_letter_sorted
            for letter, instance_count in sorted_letter_count:
                if letter not in self.guessed_letters:
                    guess_letter = letter
                    break

        return guess_letter

    def build_dictionary(self, dictionary_file_location):
        with open(dictionary_file_location, "r") as text_file:
            full_dictionary = text_file.read().splitlines()
        return full_dictionary

    def start_game(self, practice=True, verbose=True):
        self.guessed_letters = []
        self.current_dictionary = self.full_dictionary

        response = self.request("/new_game", {"practice": practice})
        if response.get('status') == "approved":
            game_id = response.get('game_id')
            word = response.get('word')
            tries_remains = response.get('tries_remains')
            if verbose:
                print("Successfully start a new game! Game ID: {0}. # of tries remaining: {1}. Word: {2}.".format(game_id, tries_remains, word))
            while tries_remains > 0:
                guess_letter = self.guess(word)
                self.guessed_letters.append(guess_letter)
                if verbose:
                    print("Guessing letter: {0}".format(guess_letter))
                try:
                    res = self.request("/guess_letter", {"request": "guess_letter", "game_id": game_id, "letter": guess_letter})
                except HangmanAPIError:
                    print('HangmanAPIError exception caught on request.')
                    continue
                except Exception as e:
                    print('Other exception caught on request.')
                    raise e
                if verbose:
                    print("Server response: {0}".format(res))
                status = res.get('status')
                tries_remains = res.get('tries_remains')
                if status == "success":
                    if verbose:
                        print("Successfully finished game: {0}".format(game_id))
                    return True
                elif status == "failed":
                    reason = res.get('reason', '# of tries exceeded!')
                    if verbose:
                        print("Failed game: {0}. Because of: {1}".format(game_id, reason))
                    return False
                elif status == "ongoing":
                    word = res.get('word')
        else:
            if verbose:
                print("Failed to start a new game")
        return status == "success"


In [13]:
import tensorflow as tf

### Creating input sequences and corresponding target characters

In [14]:
input_sequences = []
target_characters = []

for word in words:
    for i in range(len(word) - 1):
        input_sequences.append(word[:i+1])
        target_characters.append(word[i+1])

### Creating a mapping of characters to numerical indices

In [18]:
char_to_idx = {char: idx for idx, char in enumerate(set(target_characters))}
num_classes = len(char_to_idx)

### Converting input sequences to numerical format

In [20]:
max_seq_length = max(len(seq) for seq in input_sequences)
X = np.zeros((len(input_sequences), max_seq_length), dtype=np.int32)
for i, seq in enumerate(input_sequences):
    for j, char in enumerate(seq):
        X[i, j] = char_to_idx[char]

### Convert target characters to numerical labels

In [21]:
y = np.array([char_to_idx[char] for char in target_characters])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### After data split creating an ANN model

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_classes, 64, input_length=max_seq_length),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

### Compiling the model

In [27]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28, 64)            1664      
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense_1 (Dense)             (None, 26)                1690      
                                                                 
Total params: 36,378
Trainable params: 36,378
Non-trainable params: 0
_________________________________________________________________


### Model Traning

In [29]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2387b8edcc0>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test accuracy:", accuracy)

Test accuracy: 0.33261746168136597
