In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [71]:
reviews = [
    "This movie was amazing, I loved it! Great actors and an exciting plot.",
    "Not bad, but the pacing was a bit slow. Some good moments, though.",
    "Worst movie ever! Waste of time, terrible acting, and a weak script.",
    "Absolutely fantastic! A must-see for any fan of action movies.",
    "It was okay, but I expected more. The plot was predictable.",
    "A beautiful story, really moving. I will definitely watch it again.",
    "Horrible! The direction was poor, and the film was way too long.",
    "An incredible journey. Stunning visuals and deep character development.",
    "Fun and entertaining, perfect for a family movie night.",
    "Boring and unoriginal, I had to leave halfway through.",
    "The best movie I have seen in years, brilliant direction and acting.",
    "Terrible, a complete disaster from start to finish.",
    "A very heartwarming film, great performances all around.",
    "Good, but the ending was too rushed. Could have been better.",
    "Action-packed, thrilling, and non-stop excitement from start to finish.",
    "Too many clichés. Not worth the hype.",
    "An intense drama, well worth the watch for those who enjoy serious films.",
    "Very funny! I loved every minute of it.",
    "A bit too long, but overall a good movie with great performances.",
    "Great for kids, lots of fun and adventure.",
    "The plot was great, but the acting didn't live up to expectations.",
    "A perfect mix of comedy and action, highly recommend.",
    "Dark, twisted, and filled with unexpected surprises.",
    "I loved the soundtrack, but the story was lacking.",
    "An emotional rollercoaster, I cried several times."
]

In [72]:
stop_words = list(ENGLISH_STOP_WORDS)

In [73]:
vectorizer = TfidfVectorizer(stop_words=stop_words)

In [74]:
X = vectorizer.fit_transform(reviews)

In [75]:
vectorizer_ngrams = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))

In [76]:
X_ngrams = vectorizer_ngrams.fit_transform(reviews)

In [77]:
n_topics = 5  
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=25, random_state=42)
lda.fit(X_ngrams)

In [78]:
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_top_words(lda, vectorizer_ngrams.get_feature_names_out())

Topic #0:
great performances great performances story good heartwarming film great film great performances heartwarming film film great heartwarming
Topic #1:
plot direction movie plot predictable okay expected expected plot expected plot predictable okay okay expected plot predictable
Topic #2:
perfect mix comedy perfect mix comedy recommend comedy action highly comedy action action highly recommend highly recommend mix comedy mix
Topic #3:
worth fun clichés worth hype hype worth hype clichés worth clichés boring unoriginal leave boring unoriginal boring
Topic #4:
funny funny loved minute funny loved minute loved minute loved terrible complete terrible complete disaster complete disaster start disaster start


In [79]:
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))),
    ('regressor', LogisticRegression())
])

In [80]:
from sklearn.model_selection import train_test_split

labels = [1]*12 + [0]*13

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.3, random_state=42)
pipeline.fit(X_train, y_train)
print(f"Обучающая выборка: {len(X_train)}")
print(f"Тестовая выборка: {len(X_test)}")


Обучающая выборка: 17
Тестовая выборка: 8


In [81]:
pipeline.fit(X_train, y_train)

In [82]:
y_pred = pipeline.predict(X_test)

В данном примере мы выделили 5 тем в отзывах о фильмах с помощью LDA. Каждая тема представляет собой набор слов,
которые часто встречаются вместе в текстах отзывов. Эти темы могут быть интерпретированы как категории или стили фильмов,
такие как "драма", "комедия", "боевик", и т.д. Также мы создали pipeline для классификации отзывов, где использовали
n-граммы и логистическую регрессию для анализа. Результаты классификации могут помочь понять, как различные темы
соотносятся с позитивными или негативными отзывами.