Sentence-transformers

In [None]:
import re
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# === Example 1: NER (segmentation by entities) ===
print("=== Пример 1: NER ===")
text = "Apple was founded by Steve Jobs. It is located in Cupertino."
# Token-classification pipeline backed by a multilingual NER checkpoint.
nlp_ner = pipeline(
    task="ner",
    model="Davlan/bert-base-multilingual-cased-ner-hrl",
    aggregation_strategy="simple",
)
entities = nlp_ner(text)
# Print every entity record returned by the pipeline, one per line.
for entity in entities:
    print(entity)

# === Example 2: Summarization (segmentation into logical blocks) ===
print("\n=== Пример 2: Summarization ===")
text_sum = "Machine learning is a field of artificial intelligence. It allows systems to learn from data. Natural language processing is used to understand human language."
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Split on runs of '.', '!' or '?' and keep only fragments that are
# non-empty after trimming whitespace.
sentences = []
for fragment in re.split(r'[.!?]+', text_sum):
    fragment = fragment.strip()
    if fragment:
        sentences.append(fragment)

# Summarize each sentence on its own; sentences of 20 characters or fewer
# are skipped.
for i, sent in enumerate(sentences):
    if len(sent) <= 20:
        continue
    summary = summarizer(sent, max_length=20, min_length=5, do_sample=False)
    print(f"Блок {i+1}: {summary[0]['summary_text']}")

# === Example 3: Semantic clustering (sentence transformers) ===
print("\n=== Пример 3: Кластеризация по смыслу ===")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
text_cluster = "I love machine learning. It's fascinating how algorithms work. I also enjoy hiking and spending time outdoors."

# Split on runs of '.', '!' or '?' and drop fragments that are empty after
# trimming whitespace.
sentences = [s.strip() for s in re.split(r'[.!?]+', text_cluster) if s.strip()]

# One embedding per sentence; these are the feature vectors for k-means.
embeddings = model.encode(sentences)

# n_init is pinned explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit makes the clustering
# depend on the installed version and triggers a FutureWarning on 1.2-1.3.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Report the cluster assigned to each sentence.
for cluster_id, sent in zip(clusters, sentences):
    print(f"Кластер {cluster_id}: {sent}")

# === Example 4: Zero-shot classification (segmentation by topic) ===
print("\n=== Пример 4: Zero-shot classification ===")
candidate_labels = ["technology", "personal life", "work"]
text_zs = "I love machine learning. Nature is my escape from work."
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# Split on runs of '.', '!' or '?', trim each piece, then drop empty pieces.
sentences = [piece.strip() for piece in re.split(r'[.!?]+', text_zs)]
sentences = list(filter(None, sentences))

# Classify each sentence separately; sentences of 10 characters or fewer
# are skipped. The first label/score pair in the result is reported.
for sent in sentences:
    if len(sent) <= 10:
        continue
    result = classifier(sent, candidate_labels)
    print(f"Текст: {sent}")
    print(f"Тема: {result['labels'][0]}, Вероятность: {result['scores'][0]:.2f}")

In [None]:
import spacy
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text (for example, an interview transcript)
text = """
    I love machine learning. It's fascinating how algorithms work.
    On the other hand, coding can be exhausting sometimes.
    I also enjoy hiking and spending time outdoors.
    Deep learning models are very powerful.
    But debugging code is not fun.
    Nature is my escape from work.
"""

# Step 1: Split into sentences
doc = nlp(text)
sentences = []
for span in doc.sents:
    # Trim surrounding whitespace (the sample text is indented) and skip
    # sentences that are empty afterwards.
    stripped = span.text.strip()
    if stripped:
        sentences.append(stripped)

# Step 2: Text preprocessing (tokenization, stop-word removal and lemmatization)
def preprocess(text):
    """Return the lowercased lemmas of the content tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation, and tokens that are not purely
    alphabetic, are discarded; the lemma of every remaining token is
    lowercased and collected in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Skip anything that is not a plain alphabetic content word.
        if token.is_stop or token.is_punct or not token.is_alpha:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

# Token lists, one per sentence, in the same order as `sentences`.
processed = list(map(preprocess, sentences))

# Step 3: Build the dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(processed)
corpus = [dictionary.doc2bow(tokens) for tokens in processed]

# Step 4: Train the LDA model
num_topics = 2
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

# Step 5: Assign topics to each sentence
# corpus[i] corresponds to sentences[i], so the result is index-aligned
# with `sentences`.
topics_per_sent = [lda_model[bow] for bow in corpus]

# Output
separator = "-" * 40
for i, sent in enumerate(sentences):
    print(f"Предложение: {sent}")
    print(f"Темы: {topics_per_sent[i]}")
    print(separator)

TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Sample data
texts = [
    "я люблю машинное обучение",
    "модели глубокого обучения потрясают",
    "я не люблю писать код",
    "программирование — это весело",
    "нейросети меняют мир",
    "не могу понять, как работает TF-IDF"
]
labels = [1, 1, 0, 1, 1, 0]  # 1 — positive tone, 0 — negative

# Split the RAW texts into train and test sets before vectorizing.
# Fitting TF-IDF on the full dataset first would let test-set vocabulary
# and IDF statistics leak into the features the model is trained on.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Text vectorization: learn vocabulary/IDF from the training texts only,
# then apply the fitted vectorizer to the held-out texts.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words=None,  # can be set to 'english' or a custom stop-word list
    lowercase=True,
    ngram_range=(1, 2)
)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = model.predict(X_test)
# zero_division=0: with such a small test set a class may receive no
# predictions; report 0 for the undefined metric instead of warning.
print(classification_report(y_test, y_pred, zero_division=0))

# Conversion to a DataFrame (optional): TF-IDF matrix for all texts,
# expressed in the vocabulary learned from the training split.
X = vectorizer.transform(texts)
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df.head())