In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

# Load the prepared dataset and derive the binary target:
# 1 for rows with more than 200 views, 0 otherwise.
df = pd.read_csv("../data/prepared_data_for_classification.csv")
y = df['views'].gt(200).astype(int)

# Print how many rows fall into each target class.
print(y.value_counts())
# Columns used as model inputs, grouped by kind and concatenated below in the
# same order as they are passed to the model.
_base_features = [
    'duration',
    'frame rate',
    'likes',
    'comments',
    'pixel_count',
    'title_len',
    'uppercase_ratio',
    'emoji_count',
    'has_exclamation',
]

# Binary category indicator features.
_category_features = [
    'category_Howto & Style',
    'category_Music',
    'category_News & Politics',
    'category_Nonprofits & Activis',
    'category_People & Blogs',
    'category_Pets & Animals',
    'category_Science & Technology',
    'category_Shows',
    'category_Sports',
    'category_Travel & Events',
]

# Binary features marking the presence of text.
_text_presence_features = [
    'description',
    'hashtags',
]

feature_columns = _base_features + _category_features + _text_presence_features

# Feature matrix: only the columns listed in feature_columns.
X = df[feature_columns]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit a 5-nearest-neighbours classifier and predict the held-out rows.
# NOTE(review): the features are passed to KNN without any scaling step here;
# confirm that the prepared CSV is already normalised, since KNN is
# distance-based.
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model_knn.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred))
# Confusion matrix on the held-out rows. scikit-learn puts the true classes on
# the rows and the predicted classes on the columns; passing labels=[0, 1]
# pins the class order and guarantees a 2x2 matrix even if one class is
# missing from y_test / y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
labels = ['unpopular', 'popular']  # class 0 = unpopular, class 1 = popular
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Visualisation
plt.figure(figsize=(6,4))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', linewidths=0.5, linecolor='black')
plt.title('KNN — Confusion Matrix')
# The heatmap draws DataFrame columns along x and rows along y, so x is the
# predicted class and y is the actual class.
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')
plt.show()
# ROC-AUC is computed from scores rather than hard predictions: take column
# index 1 of predict_proba (presumably class 1, i.e. the second entry of
# model_knn.classes_ — verify both classes were present in training).
class_probabilities = model_knn.predict_proba(X_test)
y_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", roc_auc)