In [2]:
# - Доработать фильтры и стоп-слова 
# - Добавить логирование переданных текстов и отправлять в БД
# - Добавить обьединение текстов в один блок(опционально) 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pymorphy2
import langdetect
from langdetect import detect

# Загрузка стоп-слов для английского языка
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Определяет язык текста
def detect_language(text):
    try:
        language = detect(text)
        if language == "en":
            language = "english"
        elif language == "ru":
            language = "russian"
        return language
    except:
        return "unknown"

def lemmatize_text(text, language='english'):
    if language == 'russian':
        morph = pymorphy2.MorphAnalyzer()
        lemmatized_words = [morph.parse(word)[0].normal_form for word in text.split()]
        return ' '.join(lemmatized_words)
    elif language == 'english':
        tokens = word_tokenize(text)
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(lemmatized_words)
    else:
        er = "Unsupported language. Only 'english' and 'russian' are supported. You tryed it in " + language 
        raise ValueError(er)

def clean_text(text):
    # Токенизация текста
    tokens = word_tokenize(text)
    # Загрузка списка стоп-слов для английского языка
    english_stop_words = set(stopwords.words('english'))
    # Удаление стоп-слов и знаков пунктуации
    filtered_tokens = [word for word in tokens if word.isalnum() and word.lower() not in english_stop_words]
    # Объединение токенов обратно в текст
    clean_text = ' '.join(filtered_tokens)
    return clean_text

def preprocess_text(text, language='english'):
    # Лемматизация текста
    lemmatized_text = lemmatize_text(text, language)
    # Очистка текста от стоп-слов и знаков пунктуации
    cleaned_text = clean_text(lemmatized_text)
    return cleaned_text

# Пример текста для обработки
input_text1 = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program computers to
process and analyze large amounts of natural language data.
"""
input_text2 = """
Columbine High School shootings, massacre that occurred on April 20, 1999, at Columbine High School 
in Littleton, Colorado, leaving 15 dead, including the two students responsible for the attack. 
It was one of the deadliest school shooting incidents in American history.
The shootings were carried out by Eric Harris, age 18, and Dylan Klebold, age 17. 
On April 20, 1999, they entered Columbine High School in Jefferson county with semiautomatic rifles, 
pistols, and several explosives. In less than 20 minutes they killed 12 fellow students and a teacher 
and wounded 21 others. The violence came to an end when Harris and Klebold took their own lives. 
Officials later found two propane tank bombs in the cafeteria; had they detonated, the death toll 
would have been much higher.

The massacre at Columbine on April 20, 1999, during which 12 students and one teacher were killed, 
wasn’t the United States' first mass shooting at a school, nor would it be the last. But media 
experts told USA TODAY it quickly became one of the most infamous thanks in part to the advent 
of the 24-hour news cycle and the internet. In what felt like real time, the shooting sent shock 
waves through the Colorado community and the nation, shattering the belief that children were safe 
at school.

The Columbine High School massacre, commonly referred to as Columbine, was a school shooting and 
attempted bombing that occurred on April 20, 1999, at Columbine High School in Columbine, Colorado, 
United States.[b] The perpetrators, twelfth-grade students Eric Harris and Dylan Klebold, murdered 
twelve students and one teacher.

The Columbine massacre is more than merely notorious. In the decades since teenagers 
Eric Harris and Dylan Klebold shot 13 people dead at their school, the event has inspired reams 
of journalism, endless internet debates, starry-eyed fan fiction, and even a tasteless computer game.
To cut through the noise, myths, and misinformation, it’s useful to probe some key questions about 
what happened at Columbine High School on 20th April 1999.
"""

# input_text3 = """
# Le 15 octobre, comme tous les matins, à 7 heures précises, Pierre a ouvert les yeux. A côté de lui, Claire, sa femme, dort encore. Il a fait les gestes
# de tous les matins : il s'est levé, il a ouvert les rideaux, il a regardé par la fenêtre et il a préparé le café, puis il s'est dirigé vers la salle de bains. Sa toilette
# achevée1
# , il a jeté un dernier coup d'oeil au miroir qui lui a renvoyé l'image de
# Pierre Roulin2, caissier à la banque de Paris, 46 ans, marié, père de deux enfants. Tout à coup, il s'approche du miroir. Là, à gauche, au-dessus du front,
# des cheveux blancs. Pierre ne bouge plus. Vieux, il est vieux.
# — Tu n'écoutes pas les nouvelles?
# Claire est là, debout derrière lui et le regarde.
# """

# input_text4 = """ 
# """



#Определение языка, вывод языка, , обработка текста, вывод текста обработанного на Russian
language = detect_language(input_text1)
print(language,end="\n")
output_text1 = preprocess_text(input_text1, language)
print(output_text1,end="\n")

#Определение языка, вывод языка, , обработка текста, вывод текста обработанного на English
language = detect_language(input_text2)
print(language,end="\n")
output_text2 = preprocess_text(input_text2, language)
print(output_text2,end="\n)")

#Определение языка, вывод языка, , обработка текста, вывод текста обработанного на неподдерживаемом Языке
# language = detect_language(input_text3)
# print(language,end="\n")
# output_text3 = preprocess_text(input_text3, language)
# print(output_text3,end="\n)")

#Определение языка, вывод языка, , обработка текста, вывод текста обработанного но пустого
# language = detect_language(input_text4)
# print(language,end="\n")
# output_text4 = preprocess_text(input_text4, language)
# print(output_text4,end="\n)")


#ФУнкция трансформации
#output_text = preprocess_text(input_text3, language='english')

english
Natural language processing NLP subfield linguistics computer science artificial intelligence concerned interaction computer human language particular program computer process analyze large amount natural language data
english
Columbine High School shooting massacre occurred April 20 1999 Columbine High School Littleton Colorado leaving 15 dead including two student responsible attack wa one deadliest school shooting incident American history shooting carried Eric Harris age 18 Dylan Klebold age 17 April 20 1999 entered Columbine High School Jefferson county semiautomatic rifle pistol several explosive le 20 minute killed 12 fellow student teacher wounded 21 others violence came end Harris Klebold took life Officials later found two propane tank bomb cafeteria detonated death toll would much higher massacre Columbine April 20 1999 12 student one teacher killed United States first mass shooting school would last medium expert told USA TODAY quickly became one infamous thanks par

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Artem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
