In [2]:
# Recruitment Pipeline Optimization for HR Analytics
# Implementing a simplified language-model (bigram) simulation over sample Gemini API key metadata

# Sample Dataset (Text-Based for LLM Simulation)
# Name of the record field that holds the key identifier.
# NOTE(security): this field was previously named with a string that looks
# like a real API credential. It has been replaced by a neutral field name so
# no secret-looking value lives in the source. If that string was a live key,
# revoke and rotate it; removing it from the file does not un-leak it.
KEY_ID_FIELD = "KeyID"

data = [
    {KEY_ID_FIELD: "abc123", "Access": "Full", "RateLimit": "1000/h", "Usage": "Production"},
    {KEY_ID_FIELD: "def456", "Access": "Limited", "RateLimit": "500/h", "Usage": "Development"},
    {KEY_ID_FIELD: "ghi789", "Access": "Full", "RateLimit": "Unlimited", "Usage": "Internal"},
    {KEY_ID_FIELD: "jkl012", "Access": "None", "RateLimit": "0/h", "Usage": "Inactive"},
]

# Step 1: Text Preprocessing
# Convert the dataset into a textual format
def preprocess_data(data, key_field=KEY_ID_FIELD):
    """Render each record as one English sentence.

    Args:
        data: Iterable of dicts. Each must contain ``key_field`` plus the
            ``"Access"``, ``"RateLimit"`` and ``"Usage"`` keys.
        key_field: Name of the dict key holding the key identifier.
            Defaults to ``KEY_ID_FIELD``.

    Returns:
        A list with one sentence per record, in input order.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    return [
        f"API Key {record[key_field]} has {record['Access']} access, a rate limit of {record['RateLimit']}, and is used for {record['Usage']} purposes."
        for record in data
    ]

text_data = preprocess_data(data)

# Step 2: Tokenization
# Split text into words manually
def tokenize(text):
    """Split every sentence in ``text`` on whitespace.

    Args:
        text: Iterable of strings (one sentence each).

    Returns:
        A list containing one list of words per input sentence, in the
        same order as the input.
    """
    return [line.split() for line in text]

# One list of whitespace-separated words per sentence in text_data.
tokens = tokenize(text_data)

# Step 3: Create a Vocabulary
def build_vocabulary(tokens):
    """Map each distinct word to an integer id in first-seen order.

    Args:
        tokens: Iterable of word lists (one per sentence).

    Returns:
        A dict from word to id, where ids are 0, 1, 2, ... assigned the
        first time each word is encountered.
    """
    word_to_id = {}
    for words in tokens:
        for token in words:
            # The mapping's current size is always the next unused id,
            # because ids are handed out consecutively from zero.
            word_to_id.setdefault(token, len(word_to_id))
    return word_to_id

# Word -> integer index, with indices assigned in order of first appearance.
vocabulary = build_vocabulary(tokens)

# Step 4: Encode Sentences into Numerical Format
def encode_sentences(tokens, vocabulary):
    """Translate tokenized sentences into lists of vocabulary ids.

    Args:
        tokens: Iterable of word lists (one per sentence).
        vocabulary: Mapping from word to integer id.

    Returns:
        A list with one list of ids per sentence, preserving order.

    Raises:
        KeyError: If a word is not present in ``vocabulary``.
    """
    to_id = vocabulary.__getitem__
    return [list(map(to_id, words)) for words in tokens]

# Each sentence expressed as a list of vocabulary indices.
encoded_sentences = encode_sentences(tokens, vocabulary)

# Step 5: Implement a Simplified Language Model
# Use a bigram language model (probabilities of word pairs)
def build_bigram_model(encoded_sentences):
    """Estimate the relative frequency of every adjacent id pair.

    Args:
        encoded_sentences: Iterable of id lists (one per sentence).

    Returns:
        A dict mapping ``(id, next_id)`` to that pair's count divided by
        the total number of pairs seen across all sentences. Empty when
        no sentence has at least two ids.
    """
    pair_counts = {}
    for ids in encoded_sentences:
        # Pair every element with its right-hand neighbour.
        for pair in zip(ids, ids[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    total = sum(pair_counts.values())
    model = {}
    for pair, occurrences in pair_counts.items():
        model[pair] = occurrences / total
    return model

# (index, next_index) pair -> that pair's share of all observed bigrams.
bigram_model = build_bigram_model(encoded_sentences)

# Step 6: Generate Text Using the Model
def generate_text(vocabulary, bigram_model, start_word, num_words):
    """Greedily generate up to ``num_words`` words from a bigram model.

    Starting at ``start_word``, repeatedly append the successor whose
    bigram has the highest probability. When several successors tie, the
    one that appears first in ``bigram_model`` wins. Generation stops
    early if the current word has no recorded successor.

    Args:
        vocabulary: Mapping from word to integer id.
        bigram_model: Mapping from ``(id, next_id)`` to a probability.
        start_word: First word of the output; must be in ``vocabulary``.
        num_words: Maximum number of words to emit, including the start
            word. Values below 1 produce an empty string.

    Returns:
        The generated words joined by single spaces.

    Raises:
        KeyError: If ``start_word`` is not in ``vocabulary``.
    """
    # Look the start word up first so an unknown word always raises.
    current_word = vocabulary[start_word]
    if num_words < 1:
        return ''

    reverse_vocab = {index: word for word, index in vocabulary.items()}

    # Build the best successor of every id once, instead of rescanning the
    # whole model for each generated word. The strict ``>`` keeps the first
    # entry on ties, matching ``max`` over the model's iteration order.
    best_successor = {}
    for (first, second), prob in bigram_model.items():
        best = best_successor.get(first)
        if best is None or prob > best[1]:
            best_successor[first] = (second, prob)

    generated_text = [start_word]
    for _ in range(num_words - 1):
        best = best_successor.get(current_word)
        if best is None:
            break
        current_word = best[0]
        generated_text.append(reverse_vocab[current_word])

    return ' '.join(generated_text)

# Generate text starting with "API"
start_word = "API"
# Request up to 10 words; generate_text stops early when the current word
# has no successor in the bigram model.
generated_text = generate_text(vocabulary, bigram_model, start_word, 10)

# Output Results
# Dump every intermediate artifact of the pipeline, then the generated text.
print("Text Data:", text_data)
print("Vocabulary:", vocabulary)
print("Encoded Sentences:", encoded_sentences)
print("Bigram Probabilities:", bigram_model)
print("Generated Text:", generated_text)


Text Data: ['API Key abc123 has Full access, a rate limit of 1000/h, and is used for Production purposes.', 'API Key def456 has Limited access, a rate limit of 500/h, and is used for Development purposes.', 'API Key ghi789 has Full access, a rate limit of Unlimited, and is used for Internal purposes.', 'API Key jkl012 has None access, a rate limit of 0/h, and is used for Inactive purposes.']
Vocabulary: {'API': 0, 'Key': 1, 'abc123': 2, 'has': 3, 'Full': 4, 'access,': 5, 'a': 6, 'rate': 7, 'limit': 8, 'of': 9, '1000/h,': 10, 'and': 11, 'is': 12, 'used': 13, 'for': 14, 'Production': 15, 'purposes.': 16, 'def456': 17, 'Limited': 18, '500/h,': 19, 'Development': 20, 'ghi789': 21, 'Unlimited,': 22, 'Internal': 23, 'jkl012': 24, 'None': 25, '0/h,': 26, 'Inactive': 27}
Encoded Sentences: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1, 17, 3, 18, 5, 6, 7, 8, 9, 19, 11, 12, 13, 14, 20, 16], [0, 1, 21, 3, 4, 5, 6, 7, 8, 9, 22, 11, 12, 13, 14, 23, 16], [0, 1, 24, 3, 25, 5, 6,