<a href="https://colab.research.google.com/github/Altaieb-Mohammed/lab_2corse/blob/master/lab10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# 1. Load the dataset: a CSV fetched over HTTP from the GitHub repository
#    named in the URL below (network access is required to run this step).
url = "https://raw.githubusercontent.com/Altaieb-Mohammed/lab_2corse/master/clean_synthetic_dataset.csv"
df = pd.read_csv(filepath_or_buffer=url)

# 2. Text cleaning (keep only lowercase letters, digits and whitespace)
def preprocess_text(text):
    """Lowercase ``text`` and drop every character except a-z, 0-9 and whitespace.

    Removed characters are deleted, not replaced by a space, so the pieces on
    either side of a punctuation mark are joined (``"a-b"`` becomes ``"ab"``).

    Args:
        text: The raw value to clean. Normally a ``str``. ``None`` and float
            NaN (e.g. a missing cell in a pandas column) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text; ``''`` for missing input.
    """
    # `text != text` is only true for NaN; checked without importing math/pandas.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Add a cleaned copy of the raw text column; the original 'text' column is kept.
df['text_clean'] = df['text'].apply(preprocess_text)

# 3. Inputs and labels: cleaned text, four numeric columns, and the target.
# NOTE(review): 'bequests' is used as a feature while the report below names the
# classes 'Low Bequest' / 'High Bequest'. If 'label' was derived from the
# 'bequests' column this is target leakage and would explain the near-perfect
# accuracy recorded in the cell output — confirm how the dataset was built.
X_text = df['text_clean']
X_num = df[['estate_worth', 'debts', 'funeral_expenses', 'bequests']]
y = df['label']

# 4. Stratified 80/20 train/test split; text, numeric features and labels are
#    split together so their rows stay aligned.
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_num, y, test_size=0.2, random_state=42, stratify=y)

# 5. TF-IDF vectorization of the text (unigrams + bigrams). The vocabulary is
#    fitted on the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.85, min_df=2)
X_text_train_tfidf = vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = vectorizer.transform(X_text_test)

# 6. Scale the numeric features; statistics come from the training split only.
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# 7. Concatenate text and numeric features column-wise.
#    hstack returns COO by default; asking for CSR gives the same values in a
#    row-sliceable format and avoids a re-conversion inside fit()/predict().
X_train_combined = hstack([X_text_train_tfidf, X_num_train_scaled], format="csr")
X_test_combined = hstack([X_text_test_tfidf, X_num_test_scaled], format="csr")

# 8. Train a linear SVM with class balancing.
model = LinearSVC(class_weight='balanced', random_state=42, max_iter=5000)
model.fit(X_train_combined, y_train)

# 9. Predict on the held-out split and report the metrics.
y_pred = model.predict(X_test_combined)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# NOTE(review): target_names are hard-coded; this presumes 'label' has exactly
# two classes and that the "low" class comes first in the order used by
# classification_report — confirm against the dataset.
# sep="" stops print() from inserting a space after the newline, which would
# shift the report's header row one column to the right of its values.
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['Low Bequest', 'High Bequest']), sep="")


Accuracy: 0.9950
Classification Report:
               precision    recall  f1-score   support

 Low Bequest       0.99      1.00      1.00       100
High Bequest       1.00      0.99      0.99       100

    accuracy                           0.99       200
   macro avg       1.00      0.99      0.99       200
weighted avg       1.00      0.99      0.99       200

