In [1]:
import nltk
from nltk import FreqDist
from nltk.util import ngrams
nltk.download('punkt')

# Sample text
text = "I like to eat apples. Apples are delicious fruits. I like bananas too."

# Tokenize the text
tokens = nltk.word_tokenize(text)

# Define the context window size
context_window = 3

# Create a vocabulary
vocab = set(tokens)

# Initialize the occurrence matrix
occurrence_matrix = [[0] * len(vocab) for _ in range(len(vocab))]

# Create word pairs within the context window and update the matrix
for i in range(len(tokens)):
    target_word = tokens[i]
    for j in range(max(0, i - context_window), min(len(tokens), i + context_window + 1)):
        if i != j:
            context_word = tokens[j]
            occurrence_matrix[list(vocab).index(target_word)][list(vocab).index(context_word)] += 1

# Print the occurrence matrix
for row in occurrence_matrix:
    print(row)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[0, 1, 0, 0, 0, 1, 0, 1, 2, 0, 0, 0]
[1, 0, 1, 0, 0, 1, 1, 2, 1, 0, 1, 1]
[0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1]
[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0]
[0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1]
[1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]
[0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]
[1, 2, 0, 1, 0, 1, 1, 0, 2, 0, 1, 1]
[2, 1, 2, 1, 1, 1, 1, 2, 0, 2, 1, 1]
[0, 0, 1, 1, 1, 0, 0, 0, 2, 0, 0, 1]
[0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0]
[0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0]


In [2]:
len(tokens)

16

In [3]:
import nltk
nltk.download('punkt')

# Open the text file in read mode
file_path = "/content/drive/MyDrive/new/corpus_for_language_models.txt"


lines_as_lists = []

try:
    with open(file_path, 'r') as file:
        for line in file:
            # Tokenize the line using word_punct_tokenize
            tokens =  nltk.word_tokenize(line)
            # Append the list of tokens to the lines_as_lists
            lines_as_lists.append(tokens)
except FileNotFoundError:
    print(f"The file '{file_path}' does not exist.")
except IOError as e:
    print(f"An error occurred while reading the file: {e}")

# Print the list of lists
for line_list in lines_as_lists:
    print(line_list)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Richard', 'W.', 'Lock', ',', 'retired', 'vice', 'president', 'and', 'treasurer', 'of', 'Owens-Illinois', 'Inc.', ',', 'was', 'named', 'a', 'director', 'of', 'this', 'transportation', 'industry', 'supplier', ',', 'increasing', 'its', 'board', 'to', 'six', 'members', '.']
['John', 'J.', 'Phelan', 'Jr.', ',', 'chairman', 'of', 'the', 'Big', 'Board', ',', 'asserts', 'that', '``', '1988', 'and', '1989', 'have', 'been', 'two', 'of', 'the', 'least', 'volatile', 'years', 'in', 'the', 'last', '30', 'or', '40', 'years', '.', '``']
['In', 'January', ',', 'American', 'Medical', 'brought', 'in', 'a', 'new', 'chief', 'executive', 'officer', ',', 'Richard', 'A.', 'Gilleland', ',', '45', ',', 'who', 'will', 'remain', 'as', 'chairman', ',', 'president', 'and', 'chief', 'executive', '.']
['Ralph', 'T.', 'Linsley', ',', 'vice', 'chairman', 'of', 'Eagle', ',', 'will', 'become', 'vice', 'chairman', 'of', 'Webster\\/Eagle', '.']
['Orkem', 'declined', 'to', 'give', 'details', 'of', 'its', 'offer', ',', 'sa

In [4]:
lines_as_lists

[['Richard',
  'W.',
  'Lock',
  ',',
  'retired',
  'vice',
  'president',
  'and',
  'treasurer',
  'of',
  'Owens-Illinois',
  'Inc.',
  ',',
  'was',
  'named',
  'a',
  'director',
  'of',
  'this',
  'transportation',
  'industry',
  'supplier',
  ',',
  'increasing',
  'its',
  'board',
  'to',
  'six',
  'members',
  '.'],
 ['John',
  'J.',
  'Phelan',
  'Jr.',
  ',',
  'chairman',
  'of',
  'the',
  'Big',
  'Board',
  ',',
  'asserts',
  'that',
  '``',
  '1988',
  'and',
  '1989',
  'have',
  'been',
  'two',
  'of',
  'the',
  'least',
  'volatile',
  'years',
  'in',
  'the',
  'last',
  '30',
  'or',
  '40',
  'years',
  '.',
  '``'],
 ['In',
  'January',
  ',',
  'American',
  'Medical',
  'brought',
  'in',
  'a',
  'new',
  'chief',
  'executive',
  'officer',
  ',',
  'Richard',
  'A.',
  'Gilleland',
  ',',
  '45',
  ',',
  'who',
  'will',
  'remain',
  'as',
  'chairman',
  ',',
  'president',
  'and',
  'chief',
  'executive',
  '.'],
 ['Ralph',
  'T.',
  'Linsley

In [5]:
def get_output(input_list):
  """Converts a list of lists of strings to a list of strings, with each string containing all the elements from the corresponding lists in the input list, separated by a space.

  Args:
    input_list: A list of lists of strings.

  Returns:
    A list of strings, with each string containing all the elements from the corresponding lists in the input list, separated by a space.
  """

  output_list = []
  for sublist in input_list:
    output_string = " ".join(sublist)
    output_list.append(output_string)
  return output_list

out_put_list=get_output(lines_as_lists)
out_put_list


['Richard W. Lock , retired vice president and treasurer of Owens-Illinois Inc. , was named a director of this transportation industry supplier , increasing its board to six members .',
 'John J. Phelan Jr. , chairman of the Big Board , asserts that `` 1988 and 1989 have been two of the least volatile years in the last 30 or 40 years . ``',
 'In January , American Medical brought in a new chief executive officer , Richard A. Gilleland , 45 , who will remain as chairman , president and chief executive .',
 'Ralph T. Linsley , vice chairman of Eagle , will become vice chairman of Webster\\/Eagle .',
 'Orkem declined to give details of its offer , saying only that the bid will be submitted for approval by the board of the British company .',
 'But in 1935 , when Congress was trying to find someone or something to blame for the Great Depression , it decided to drop both the secretary and the comptroller from the board .',
 'Treasury Undersecretary David Mulford , for instance , was at a me

In [6]:
def find_vocab(input_list):
  """Finds all the unique vocab in a list of lists of strings.

  Args:
    input_list: A list of lists of strings.

  Returns:
    A set of all the unique vocab in the input list.
  """

  vocab_set = set()
  for sublist in input_list:
    for word in sublist:
      vocab_set.add(word)
  return vocab_set
vocab_set = find_vocab(lines_as_lists)
print(len(vocab_set))
print(vocab_set)

5605
{'Embittered', 'meets', 'Polsky', 'narrows', 'courthouses', 'tomorrow', 'Abalkin', 'Colgate-Palmolive', 'took', 'relationship', 'Cie.', 'spectator', 'Countered', 'Donald', 'Banc', 'widow', 'Tom', 'photo', 'shape', 'bass', 'Andersson', 'Alun-Jones', 'shareholding', "n't", 'hottest', 'Panny', 'Epinalers', 'clear', 'Kiep', 'succeeds', 'Lyons', 'spending', 'stability', 'process', 'tacked', 'output', '3.19', 'reflect', 'academia', 'expressed', 'Us', 'affiliates', 'elsewhere', 'Motley', 'rooms', 'official', 'reminder', 'careening', 'average', 'job', 'Valley', 'headed', 'Drabinsky', 'affect', 'conservative', 'attorney', 'away', 'City', 'Clarke', 'requests', 'republic', 'Whitten', 'silent', 'Commission', 'magistrates', 'Carlyle', 'Nov.', 'appropriations', 'Mehl', 'value', 'certificate-of-need', 'impervious', 'Beall', 'explaining', 'Ely', 'March', 'Domenici', 'inverse', 'Boone', 'Holewinski', 'providing', 'radio', 'Brady', 'forecasts', 'Popular', 'Results', 'conflict-of-interest', 'threate

In [8]:
import pandas as pd
from tqdm import tqdm

def co_occurance_matrix(input_text, top_words, window_size):
    co_occur = pd.DataFrame(index=top_words, columns=top_words)

    for row, nrow in tqdm(zip(top_words, range(len(top_words))), total=len(top_words), desc="Building Matrix"):
        for colm, ncolm in zip(top_words, range(len(top_words))):
            count = 0
            if row == colm:
                co_occur.iloc[nrow, ncolm] = count
            else:
                for single_essay in input_text:
                    essay_split = single_essay.split(" ")
                    max_len = len(essay_split)
                    top_word_index = [index for index, split in enumerate(essay_split) if row in split]
                    for index in top_word_index:
                        if index == 0:
                            count = count + essay_split[:window_size + 1].count(colm)
                        elif index == (max_len - 1):
                            count = count + essay_split[-(window_size + 1):].count(colm)
                        else:
                            count = count + essay_split[index + 1: (index + window_size + 1)].count(colm)
                            if index < window_size:
                                count = count + essay_split[:index].count(colm)
                            else:
                                count = count + essay_split[(index - window_size): index].count(colm)
                co_occur.iloc[nrow, ncolm] = count

    return co_occur



In [None]:

window_size =5

result = co_occurance_matrix(out_put_list,list(vocab_set),window_size)
result

Building Matrix:   8%|▊         | 472/5605 [3:26:41<37:08:52, 26.05s/it]

In [None]:
import numpy as np
import math
def ppmi(word,context_word,df):
  num_prob=df[word,context_word]/df.sum(axis='both')
  denum_prob=(df[word,:]/df.sum(axis='both'))*(df[context_word,:]/df.sum(axis='both'))
  pmi = np.log2(num_prob/denum_prob)
  ppmi = np.maximum(pmi, 0)

  return ppmi

In [None]:
round(ppmi('sales','president',result),2)

In [None]:
round(ppmi('sales','said',result),2)

In [None]:
round(ppmi('company','president',result),2)

In [None]:
round(ppmi('company','of',result),2)

In [None]:
def cosine_similairity_with_context(context_list,word1,word2,df):
  vector_word1=[]
  vector_word2=[]
  for context in context_list:
    vec1=ppmi(word1,context,df)
    vec2=ppmi(word2,context,df)
    vector_word1.append(vec1)
    vector_word2.append(vec2)
  # Convert the lists to NumPy arrays
  v1_array = np.array(vector_word1)
  v2_array = np.array(vector_word2)

  # Calculate the dot product of the two arrays
  dot_product = np.dot(v1_array, v2_array)

  # Calculate the magnitudes of the two arrays
  magnitude_v1 = np.linalg.norm(v1_array)
  magnitude_v2 = np.linalg.norm(v2_array)

  # Calculate the cosine similarity
  cosine_similarity = dot_product / (magnitude_v1 * magnitude_v2)
  return cosine_similarity


In [None]:
context_list=['said','of','president']
round(cosine_similairity_with_context(context_list,'executive','company',result),2)

In [None]:
round(cosine_similairity_with_context(context_list,'sales','purchase',result),2)

In [None]:
round(cosine_similairity_with_context(context_list,'executive','sales',result),2)