In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
from collections import Counter

In [3]:
file_path = 'lab4_text.txt'
with open(file_path, 'r') as file:
    data = file.read()
data

"The movie was fantastic! The acting was superb and the plot was engaging.\nI found the movie to be quite disappointing. The story was weak and the characters were not convincing.\nThis movie is a must-watch. The direction was brilliant and the performances were top-notch.\nI couldn't get enough of this film. The cinematography was stunning and the soundtrack was mesmerizing."

In [4]:
data = data.lower()  
data = ''.join(char for char in data if char.isalpha() or char.isspace())  

In [5]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform([data])

In [6]:
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_values = tfidf_matrix.toarray()[0]

In [7]:
tfidf_dict = {feature_names[i]: tfidf_values[i] for i in range(len(feature_names))}
print("\nTF-IDF Values:")
tfidf_dict


TF-IDF Values:


{'acting': 0.18257418583505536,
 'brilliant': 0.18257418583505536,
 'characters': 0.18257418583505536,
 'cinematography': 0.18257418583505536,
 'convincing': 0.18257418583505536,
 'direction': 0.18257418583505536,
 'disappointing': 0.18257418583505536,
 'engaging': 0.18257418583505536,
 'fantastic': 0.18257418583505536,
 'film': 0.18257418583505536,
 'mesmerizing': 0.18257418583505536,
 'movie': 0.5477225575051661,
 'mustwatch': 0.18257418583505536,
 'performances': 0.18257418583505536,
 'plot': 0.18257418583505536,
 'quite': 0.18257418583505536,
 'soundtrack': 0.18257418583505536,
 'story': 0.18257418583505536,
 'stunning': 0.18257418583505536,
 'superb': 0.18257418583505536,
 'topnotch': 0.18257418583505536,
 'weak': 0.18257418583505536}

In [8]:
tokens = data.split()
unigrams = list(ngrams(tokens, 1))
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

In [9]:
unigram_prob = Counter(unigrams)
bigram_prob = Counter(bigrams)
trigram_prob = Counter(trigrams)

In [10]:
total_unigrams = len(unigrams)
total_bigrams = len(bigrams)
total_trigrams = len(trigrams)

In [11]:
unigram_prob = {k: v / total_unigrams for k, v in unigram_prob.items()}
bigram_prob = {k: v / total_bigrams for k, v in bigram_prob.items()}
trigram_prob = {k: v / total_trigrams for k, v in trigram_prob.items()}

In [12]:
print("Unigram Probabilities:")
unigram_prob

Unigram Probabilities:


{('the',): 0.16393442622950818,
 ('movie',): 0.04918032786885246,
 ('was',): 0.11475409836065574,
 ('fantastic',): 0.01639344262295082,
 ('acting',): 0.01639344262295082,
 ('superb',): 0.01639344262295082,
 ('and',): 0.06557377049180328,
 ('plot',): 0.01639344262295082,
 ('engaging',): 0.01639344262295082,
 ('i',): 0.03278688524590164,
 ('found',): 0.01639344262295082,
 ('to',): 0.01639344262295082,
 ('be',): 0.01639344262295082,
 ('quite',): 0.01639344262295082,
 ('disappointing',): 0.01639344262295082,
 ('story',): 0.01639344262295082,
 ('weak',): 0.01639344262295082,
 ('characters',): 0.01639344262295082,
 ('were',): 0.03278688524590164,
 ('not',): 0.01639344262295082,
 ('convincing',): 0.01639344262295082,
 ('this',): 0.03278688524590164,
 ('is',): 0.01639344262295082,
 ('a',): 0.01639344262295082,
 ('mustwatch',): 0.01639344262295082,
 ('direction',): 0.01639344262295082,
 ('brilliant',): 0.01639344262295082,
 ('performances',): 0.01639344262295082,
 ('topnotch',): 0.0163934426229

In [13]:
print("Bigram Probabilities:")
bigram_prob

Bigram Probabilities:


{('the', 'movie'): 0.03333333333333333,
 ('movie', 'was'): 0.016666666666666666,
 ('was', 'fantastic'): 0.016666666666666666,
 ('fantastic', 'the'): 0.016666666666666666,
 ('the', 'acting'): 0.016666666666666666,
 ('acting', 'was'): 0.016666666666666666,
 ('was', 'superb'): 0.016666666666666666,
 ('superb', 'and'): 0.016666666666666666,
 ('and', 'the'): 0.06666666666666667,
 ('the', 'plot'): 0.016666666666666666,
 ('plot', 'was'): 0.016666666666666666,
 ('was', 'engaging'): 0.016666666666666666,
 ('engaging', 'i'): 0.016666666666666666,
 ('i', 'found'): 0.016666666666666666,
 ('found', 'the'): 0.016666666666666666,
 ('movie', 'to'): 0.016666666666666666,
 ('to', 'be'): 0.016666666666666666,
 ('be', 'quite'): 0.016666666666666666,
 ('quite', 'disappointing'): 0.016666666666666666,
 ('disappointing', 'the'): 0.016666666666666666,
 ('the', 'story'): 0.016666666666666666,
 ('story', 'was'): 0.016666666666666666,
 ('was', 'weak'): 0.016666666666666666,
 ('weak', 'and'): 0.016666666666666666

In [14]:
print("Trigram Probabilities:")
trigram_prob

Trigram Probabilities:


{('the', 'movie', 'was'): 0.01694915254237288,
 ('movie', 'was', 'fantastic'): 0.01694915254237288,
 ('was', 'fantastic', 'the'): 0.01694915254237288,
 ('fantastic', 'the', 'acting'): 0.01694915254237288,
 ('the', 'acting', 'was'): 0.01694915254237288,
 ('acting', 'was', 'superb'): 0.01694915254237288,
 ('was', 'superb', 'and'): 0.01694915254237288,
 ('superb', 'and', 'the'): 0.01694915254237288,
 ('and', 'the', 'plot'): 0.01694915254237288,
 ('the', 'plot', 'was'): 0.01694915254237288,
 ('plot', 'was', 'engaging'): 0.01694915254237288,
 ('was', 'engaging', 'i'): 0.01694915254237288,
 ('engaging', 'i', 'found'): 0.01694915254237288,
 ('i', 'found', 'the'): 0.01694915254237288,
 ('found', 'the', 'movie'): 0.01694915254237288,
 ('the', 'movie', 'to'): 0.01694915254237288,
 ('movie', 'to', 'be'): 0.01694915254237288,
 ('to', 'be', 'quite'): 0.01694915254237288,
 ('be', 'quite', 'disappointing'): 0.01694915254237288,
 ('quite', 'disappointing', 'the'): 0.01694915254237288,
 ('disappointing