<a href="https://colab.research.google.com/github/CodeHunterOfficial/ABC_DataMining/blob/main/NV/%D0%90%D0%BD%D0%B0%D0%BB%D0%B8%D0%B7_%D0%B8_%D0%BE%D0%B1%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D0%BA%D0%B0_%D1%81%D0%BE%D1%86%D0%B8%D0%B0%D0%BB%D1%8C%D0%BD%D1%8B%D1%85_%D0%BC%D0%B5%D0%B4%D0%B8%D0%B0_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2_%D0%B4%D0%BB%D1%8F_%D0%B2%D1%8B%D1%8F%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F_%D1%82%D0%B5%D0%BD%D0%B4%D0%B5%D0%BD%D1%86%D0%B8%D0%B9_%D0%B8_%D0%BD%D0%B0%D1%81%D1%82%D1%80%D0%BE%D0%B5%D0%BD%D0%B8%D0%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Импортируем необходимые библиотеки
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from transformers import pipeline

# Скачиваем необходимые ресурсы NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Загрузка данных https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment/data
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
airline_tweets = pd.read_csv(data_source_url)

# Предобработка текста
def preprocess_text(text):
    # Удаление ссылок, хэштегов, упоминаний и специальных символов
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Оставляем только буквы
    text = text.lower()  # Преобразуем в нижний регистр

    # Токенизация, удаление стоп-слов и лемматизация
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(tokens)

# Применяем предобработку к столбцу "text"
airline_tweets['clean_lemmatized_text'] = airline_tweets['text'].apply(preprocess_text)

# Векторизация текста с помощью TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(airline_tweets['clean_lemmatized_text']).toarray()

# Кодирование меток классов
y = airline_tweets['airline_sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Массив моделей
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', random_state=42))
]

# Функция для оценки модели
def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Визуализация матрицы ошибок
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
                xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix ({name})')
    plt.show()

# Функция для построения графиков обучения
def plot_learning_curve(model, X, y, name):
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10))

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, label='Training Accuracy', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15, color='blue')

    plt.plot(train_sizes, test_mean, label='Validation Accuracy', color='green', marker='s')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.15, color='green')

    plt.title(f'Learning Curve ({name})')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

# Тематическое моделирование (LDA)
def topic_modeling():
    tfidf_vectorizer = TfidfVectorizer(max_features=3000)
    X_tfidf = tfidf_vectorizer.fit_transform(airline_tweets['clean_lemmatized_text'])

    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(X_tfidf)

    def display_topics(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print(f"Topic #{topic_idx}:")
            print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    display_topics(lda, tfidf_vectorizer.get_feature_names_out(), 10)

# Анализ трендов во времени
def sentiment_trends_over_time():
    airline_tweets['tweet_created'] = pd.to_datetime(airline_tweets['tweet_created'])
    sentiment_over_time = airline_tweets.groupby([pd.Grouper(key='tweet_created', freq='D'), 'airline_sentiment']).size().unstack(fill_value=0)

    plt.figure(figsize=(12, 6))
    for sentiment in ['positive', 'neutral', 'negative']:
        plt.plot(sentiment_over_time.index, sentiment_over_time[sentiment], label=sentiment.capitalize())

    plt.title('Sentiment Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Tweets')
    plt.legend()
    plt.show()

# Сравнение тональности между авиакомпаниями
def sentiment_by_airline():
    sentiment_by_airline = airline_tweets.groupby(['airline', 'airline_sentiment']).size().unstack(fill_value=0)
    sentiment_by_airline = sentiment_by_airline.div(sentiment_by_airline.sum(axis=1), axis=0)

    sentiment_by_airline.plot(kind='bar', stacked=True, figsize=(12, 6))
    plt.title('Sentiment Distribution by Airline')
    plt.xlabel('Airline')
    plt.ylabel('Proportion of Sentiment')
    plt.show()

# Интерактивная визуализация
def interactive_visualization():
    fig = px.bar(sentiment_by_airline.reset_index(), x='airline', y=['positive', 'neutral', 'negative'],
                 title='Sentiment Distribution by Airline', labels={'value': 'Proportion'})
    fig.show()

# Тестирование на новых данных
def predict_sentiment(new_texts, vectorizer, model):
    new_texts_preprocessed = [preprocess_text(text) for text in new_texts]
    new_texts_vectorized = vectorizer.transform(new_texts_preprocessed).toarray()
    predictions = model.predict(new_texts_vectorized)
    sentiment_map = {2: 'positive', 1: 'neutral', 0: 'negative'}
    return [sentiment_map[pred] for pred in predictions]

# Основной блок кода
if __name__ == "__main__":
    # Оценка каждой модели
    for name, model in models:
        print(f"Evaluating {name}...")
        evaluate_model(model, name, X_train, X_test, y_train, y_test)
        plot_learning_curve(model, X_train, y_train, name)

    # Тематическое моделирование
    print("\nTopic Modeling (LDA):")
    topic_modeling()

    # Анализ трендов во времени
    print("\nSentiment Trends Over Time:")
    sentiment_trends_over_time()

    # Сравнение тональности между авиакомпаниями
    print("\nSentiment Distribution by Airline:")
    sentiment_by_airline()

    # Интерактивная визуализация
    print("\nInteractive Visualization:")
    interactive_visualization()

    # Тестирование на новых данных
    new_tweets = [
        "I love the service provided by Virgin America!",
        "The flight was delayed and the staff was rude.",
        "The seats were comfortable but the food was terrible."
    ]
    svm_model = models[3][1]  # SVM модель
    predicted_sentiments = predict_sentiment(new_tweets, vectorizer, svm_model)
    print("\nPredicted Sentiments for New Tweets:")
    for tweet, sentiment in zip(new_tweets, predicted_sentiments):
        print(f"Tweet: \"{tweet}\" → Sentiment: {sentiment}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


HTTPError: HTTP Error 404: Not Found