In [20]:
import numpy
import re
import random
import math
from collections import defaultdict, deque, Counter

# text generation

In [2]:
def read_text_file(file_path) :
  with open(file_path, 'r') as file :
    text = file.read()
    return text

In [3]:
def preprocess_text(text) :
  text = text.lower()
  text = re.sub(r'[^a-zA-Z0-9.,!?;\'"]', ' ', text)
  text = re.sub(r'([.,!?;])', r' \1', text)
  text = re.sub(r'\s+', ' ', text)
  words = text.split()
  return words

In [4]:
def build_transition_matrix(words, n) :
  transition_matrix = defaultdict(list)
  ngram_queue = deque(maxlen = n)

  for word in words :
    if len(ngram_queue) == n :
      ngram = tuple(ngram_queue)
      transition_matrix[ngram].append(word)
    ngram_queue.append(word)

  return transition_matrix

In [5]:
def generate_text(transition_matrix, start_words, taille = 50) :
# Choose a random start ngram based on its probability (weighted choice)
    n = len(start_words)
    current_words = deque(start_words, maxlen = n)
    text = list(start_words)
    for _ in range(taille - n) :
      current_ngram = tuple(current_words)
      if current_ngram in transition_matrix :
        next_word = random.choice(transition_matrix[current_ngram])
        text.append(next_word)
        current_words.append(next_word)
      else :
        break
    return ' '.join(text)

In [6]:
text = read_text_file('/content/exemple.txt')
text

'\n\nTo Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. He never spoke of the softer passions, save with a gibe and a sneer. They were admirable things for the observer--excellent for drawing the veil from men\'s motives and actions. But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a stro

In [7]:
n = 2
words = preprocess_text(text)
transition_matrix = build_transition_matrix(words, n)
print(list(transition_matrix.items())[:10])

[(('to', 'sherlock'), ['holmes']), (('sherlock', 'holmes'), ['she']), (('holmes', 'she'), ['is']), (('she', 'is'), ['always', 'incorrigible', 'herself']), (('is', 'always'), ['the']), (('always', 'the'), ['woman']), (('the', 'woman'), ['.']), (('woman', '.'), ['i']), (('.', 'i'), ['have', 'had', 'rang', 'am', 'may', 'understand', 'have', 'was', 'am', 'know', 'shall']), (('i', 'have'), ['seldom', 'changed', 'both', 'just', 'come', 'heard', 'one'])]


In [8]:
start_words = random.choice(list(list(transition_matrix.keys())))
generated_text = generate_text(transition_matrix, start_words = start_words)
print(generated_text)

continental gazetteer ." he looked from one to me ," returned the king reproachfully . "very , indeed , if there is nothing else ." "i was aware that i was addressing wilhelm gottsreich sigismond von ormstein , grand duke of cassel felstein , and that you intended to go


# text classification

In [9]:
data = {
    "sports" : [
        "the team won the match",
        "the player scored a goal",
        "the match was exciting"
    ],
    "politics" : [
        "the elections results were announced",
        "the president gave a speech",
        "the government passed a new law"
    ]
}

In [10]:
def preprocess_text(texts) :
  for text in texts :
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9.,!?;\'"]', ' ', text)
    text = re.sub(r'([^\w\s]])', ' ', text)
    text = re.sub(r'([.,!?;])', r' \1', text)
    return texts.split()

In [11]:
new_data = defaultdict(list)
for category, texts in data.items() :
  for text in texts :
    text = preprocess_text(text)
    new_data[category].append(text)
    print(text)

['the', 'team', 'won', 'the', 'match']
['the', 'player', 'scored', 'a', 'goal']
['the', 'match', 'was', 'exciting']
['the', 'elections', 'results', 'were', 'announced']
['the', 'president', 'gave', 'a', 'speech']
['the', 'government', 'passed', 'a', 'new', 'law']


In [12]:
def build_another_transition_matrix(texts) :
  transition_matrix = defaultdict(Counter)
  for text in texts :
    for i in range(len(text) - 1) :
      transition_matrix[text[i]][text[i+1]] += 1

  for word, transitions in transition_matrix.items() :
    total = sum(transitions.values())
    for next_word in transitions :
      transitions[next_word] /= total

  return transition_matrix

In [18]:
list(new_data.values())[0]

[['the', 'team', 'won', 'the', 'match'],
 ['the', 'player', 'scored', 'a', 'goal'],
 ['the', 'match', 'was', 'exciting']]

In [19]:
transition_matrix = {}
for category, texts in data.items() :
  transition_matrix[category] = build_another_transition_matrix(texts)
transition_matrix

{'sports': defaultdict(collections.Counter,
             {'t': Counter({'h': 0.5, 'e': 0.125, 'c': 0.25, 'i': 0.125}),
              'h': Counter({'e': 0.8, ' ': 0.2}),
              'e': Counter({' ': 0.5,
                       'a': 0.125,
                       'r': 0.125,
                       'd': 0.125,
                       'x': 0.125}),
              ' ': Counter({'t': 0.18181818181818182,
                       'w': 0.18181818181818182,
                       'm': 0.18181818181818182,
                       'p': 0.09090909090909091,
                       's': 0.09090909090909091,
                       'a': 0.09090909090909091,
                       'g': 0.09090909090909091,
                       'e': 0.09090909090909091}),
              'a': Counter({'m': 0.14285714285714285,
                       't': 0.2857142857142857,
                       'y': 0.14285714285714285,
                       ' ': 0.14285714285714285,
                       'l': 0.14285714285714285,
   

In [24]:
def calculate_likelihood(text, transition_matrix) :
  tokens = preprocess_text(text)
  if len(tokens) < 2 :
    return float("-inf")
  likelihood = 0
  for i in range(len(tokens) - 1) :
    current_word, next_word = tokens[i], tokens[i+1]
    if current_word in transition_matrix and next_word in transition_matrix[current_word] :
      likelihood += math.log(transition_matrix[current_word][next_word])
    else :
      likelihood += math.log(1e-6)
    return likelihood

In [25]:
def classify_text(text, transition_matrix) :
  likelihoods = {}
  for category, matrix in transition_matrix.items() :
    likelihoods[category] = calculate_likelihood(text, matrix)
  return max(likelihoods, key= likelihoods.get)

In [30]:
new_text = "government"
classify_category = classify_text(new_text, transition_matrix)
print(classify_category)

sports
