In [66]:
import re
import string
import numpy as np
import pandas as pd
import pickle
import nltk
nltk.download('popular', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output

def preprocess_text(text):
    """Normalize raw text before tokenization.

    The input is lower-cased and then stripped, in this order, of markdown
    links, @-mentions, URLs, HTML tags, punctuation characters, newline
    characters and every word that contains a digit. Each removed span is
    replaced by the empty string, so the spaces around it are kept and the
    text on both sides of a newline is joined directly.

    Returns the cleaned text without leading or trailing whitespace.
    """
    # Order matters: links and tags must be removed while their punctuation
    # is still present, i.e. before the punctuation pass.
    removal_patterns = (
        r'\[.*?\]\(.*?\)',  # markdown links: [label](target)
        r'@\w+',  # @handle mentions
        r'https?://\S+|www\.\S+',  # bare URLs
        r'<.*?>+',  # HTML tags
        r'[%s]' % re.escape(string.punctuation),  # punctuation characters
        r'\n',  # newline characters
        r'\w*\d\w*',  # words containing at least one digit
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of the tag is considered: ``J`` gives
    ``wordnet.ADJ``, ``V`` gives ``wordnet.VERB``, ``N`` gives
    ``wordnet.NOUN`` and ``R`` gives ``wordnet.ADV``. Any other tag,
    including an empty one, yields ``None``.
    """
    constant_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    constant_name = constant_by_prefix.get(nltk_tag[:1])
    if constant_name is None:
        return None
    return getattr(wordnet, constant_name)

def lemmatize(tokens):
    """Lemmatize a sequence of tokens and join the result into one string.

    Every token is POS-tagged with ``nltk.pos_tag`` and its tag is mapped
    through ``pos_tagger``. A token whose tag has no WordNet counterpart is
    kept unchanged; any other token is passed, together with that POS, to
    the module-level ``lemmatizer``.

    Returns the resulting words joined by single spaces.
    """
    lemmas = []
    for token, nltk_tag in nltk.pos_tag(tokens):
        wordnet_pos = pos_tagger(nltk_tag)
        lemmas.append(
            token if wordnet_pos is None
            else lemmatizer.lemmatize(token, wordnet_pos)
        )
    return ' '.join(lemmas)

# Inference script: read one statement from stdin, turn it into the feature
# row expected by the pickled model, and print the predicted label.

# NOTE: unpickling can execute arbitrary code; only load model/vectorizer
# files that come from a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

statement = input()
lemmatizer = WordNetLemmatizer()  # used as a global by lemmatize()

# Derive the tokens from the statement that was just entered. Without this
# the cell either fails with NameError or silently reuses a `tokens` value
# left over from an earlier notebook cell.
tokens = word_tokenize(preprocess_text(statement))
lemmatized_tokens = lemmatize(tokens)  # a single space-joined string

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
tfidf_tokens = vectorizer.transform([lemmatized_tokens])

# Count first-person words. `lemmatized_tokens` is a string, so it has to be
# split into words first; iterating the string itself walks over characters
# and would only ever count the letter "i".
# NOTE(review): confirm the training pipeline computed this feature over
# words as well, otherwise inference and training features will differ.
first_person_words = ['i', 'ive', 'im', 'i`m', 'i`ve']
first_person_count = sum(
    word in first_person_words for word in lemmatized_tokens.split()
)
additional_features = np.array([len(statement), first_person_count])

# Stack the extra features as an explicit (1, 2) row next to the (1, n)
# TF-IDF row instead of relying on how hstack treats a 1-D array.
X = hstack([tfidf_tokens, additional_features.reshape(1, -1)])

# The model is expected to return an integer class index into `labels`.
labels = ['Anxiety', 'Bipolar', 'Depression', 'Normal', 'Personality disorder', 'Stress', 'Suicidal']
print('Предсказанный диагноз - ' + labels[model.predict(X)[0]])

he trip began with a long flight from the mainland. Despite the length of the journey, I was excited to finally arrive in Hawaii. As soon as I stepped off the plane, I was greeted with a warm and humid climate that was a welcome change from the cold winter weather back home.  One of the first things we did upon arriving was visit the famous Waikiki Beach. The sand was soft and white, and the water was crystal clear. We spent hours swimming and soaking up the sun. We also tried our hand at surfing, which was a new and thrilling experience for me.  In addition to beach activities, we also took the time to explore the island. We hiked through the lush forests, visited the stunning waterfalls, and even drove up to the top of the island to see the breathtaking views.
Предсказанный диагноз - Stress
