# Model Test Notebook - Orijinal Pipeline ile Test
## Halil Melih AKÇA 221104091

Bu notebook, eğitilmiş ensemble modelini orijinal pipeline'ı kullanarak test eder.

## Gerekli Kütüphaneleri Import Et

In [6]:
# ORIJINAL NOTEBOOK'TAKİ AYNI IMPORT'LAR
import pandas as pd
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
import re
import string
import nltk
import spacy
import pickle
import warnings
warnings.filterwarnings('ignore')

# spaCy model yükleme
try:
    nlp = spacy.load("en_core_web_sm")
    print(" spaCy model yüklendi")
except:
    print(" spaCy model bulunamadı")
    nlp = None

# Diğer gerekli kütüphaneler
from textstat import flesch_reading_ease, automated_readability_index
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

print("\n Tüm kütüphaneler yüklendi!")

 spaCy model yüklendi

 Tüm kütüphaneler yüklendi!


## Veri Yükleme ve Model Yükleme

In [7]:

try:
    news_df = pd.read_csv("../stockMarket_predict/Combined_News_DJIA.csv")
    print(f" Combined_News_DJIA.csv yüklendi: {news_df.shape}")
    data_source = "Combined_News_DJIA.csv"
except FileNotFoundError:
    try:
        news_df = pd.read_csv("../stockMarket_predict/upload_DJIA_table.csv")
        print(f" upload_DJIA_table.csv yüklendi: {news_df.shape}")
        data_source = "upload_DJIA_table.csv"
    except FileNotFoundError:
        print(" Hiçbir veri dosyası bulunamadı!")
        exit()

print(f"\nVeri kaynağı: {data_source}")
print(f"Veri boyutu: {news_df.shape}")
print(f"Sütunlar: {list(news_df.columns)[:10]}...")  

if 'Label' in news_df.columns:
    print(f"\n Label Dağılımı:")
    print(news_df['Label'].value_counts())
else:
    print(" Label sütunu bulunamadı!")

models_loaded = {}
try:
    with open("ensemble_model.pkl", "rb") as f:
        ensemble_model = pickle.load(f)
    models_loaded['ensemble'] = True
    print("\n Ensemble model yüklendi")
except FileNotFoundError:
    ensemble_model = None
    models_loaded['ensemble'] = False
    print("\n Ensemble model bulunamadı!")

preprocessors = ['scaler', 'poly', 'pca', 'tfidf']
loaded_preprocessors = {}

for prep in preprocessors:
    try:
        with open(f"{prep}.pkl", "rb") as f:
            loaded_preprocessors[prep] = pickle.load(f)
        models_loaded[prep] = True
        print(f" {prep} yüklendi")
    except FileNotFoundError:
        loaded_preprocessors[prep] = None
        models_loaded[prep] = False
        print(f" {prep} bulunamadı")

print(f"\n Yüklenen bileşenler: {sum(models_loaded.values())}/{len(models_loaded)}")

 Combined_News_DJIA.csv yüklendi: (1989, 27)

Veri kaynağı: Combined_News_DJIA.csv
Veri boyutu: (1989, 27)
Sütunlar: ['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8']...

 Label Dağılımı:
Label
1    1065
0     924
Name: count, dtype: int64

 Ensemble model yüklendi
 scaler yüklendi
 poly yüklendi
 pca yüklendi
 tfidf yüklendi

 Yüklenen bileşenler: 5/5


## Orijinal Feature Engineering Fonksiyonları

In [8]:

def pos_features_spacy(text):
    """spaCy ile POS tag özellikleri"""
    if nlp is None:
        return [0.25, 0.25, 0.25, 0.25]  
    doc = nlp(text)
    total = len(doc)
    if total == 0:
        return [0, 0, 0, 0]
    
    noun_ratio = len([token for token in doc if token.pos_ == "NOUN"]) / total
    verb_ratio = len([token for token in doc if token.pos_ == "VERB"]) / total
    adj_ratio = len([token for token in doc if token.pos_ == "ADJ"]) / total
    adv_ratio = len([token for token in doc if token.pos_ == "ADV"]) / total
    return [noun_ratio, verb_ratio, adj_ratio, adv_ratio]

def clean_text(text):
    """Orijinal metin temizleme fonksiyonu"""
    if pd.isna(text):
        return ""
    
    text = str(text).lower()
    text = re.sub(r"b['\"]|['\"]", "", text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\d+', 'NUMBER', text)
    text = re.sub(r'[^\w\s.,!?;:]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def linguistic_features(text):
    """Linguistik özellikler çıkarma"""
    words = text.split()
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    punct_count = sum([1 for c in text if c in string.punctuation])
    cap_ratio = sum([1 for c in text if c.isupper()]) / (len(text) + 1e-9)
    digit_ratio = sum([1 for c in text if c.isdigit()]) / (len(text) + 1e-9)
    flesch = flesch_reading_ease(text)
    ari = automated_readability_index(text)
    return [len(words), avg_word_len, punct_count, cap_ratio, digit_ratio, flesch, ari]

financial_keywords = ["bull", "bear", "gain", "loss", "stock", "market"]

def financial_keyword_density(text):
    """Finansal anahtar kelime yoğunluğu"""
    tokens = text.lower().split()
    return [tokens.count(word)/len(tokens) if len(tokens) > 0 else 0 for word in financial_keywords]

def ner_features(text):
    """Named Entity Recognition özellikleri"""
    if nlp is None:
        return [0, 0, 0, 0]
    
    doc = nlp(text)
    counts = {"PERSON":0, "ORG":0, "GPE":0, "MONEY":0}
    for ent in doc.ents:
        if ent.label_ in counts:
            counts[ent.label_] += 1
    return list(counts.values())

def pos_features(text):
    """NLTK ile POS tag özellikleri (orijinal implementation)"""
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    total = len(tags)
    pos_counts = {"NN":0, "VB":0, "JJ":0, "RB":0}
    for _, tag in tags:
        if tag.startswith("NN"): pos_counts["NN"] += 1
        elif tag.startswith("VB"): pos_counts["VB"] += 1
        elif tag.startswith("JJ"): pos_counts["JJ"] += 1
        elif tag.startswith("RB"): pos_counts["RB"] += 1
    return [pos_counts["NN"]/total if total else 0, pos_counts["VB"]/total if total else 0, pos_counts["JJ"]/total if total else 0, pos_counts["RB"]/total if total else 0]

print(" Tüm feature engineering fonksiyonları tanımlandı!")

 Tüm feature engineering fonksiyonları tanımlandı!


## Veri Ön İşleme - Orijinal Pipeline

In [9]:
print("=== VERİ ÖN İŞLEME - ORİJİNAL PİPELİNE ===")

news_df['Combined'] = news_df.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
news_df['Cleaned'] = news_df['Combined'].apply(clean_text)

print(f"✓ {len(news_df)} örnek işlendi")
print(f"✓ Ortalama metin uzunluğu: {news_df['Cleaned'].str.len().mean():.1f} karakter")

print("\n İlk 3 temizlenmiş metin örneği:")
for i in range(min(3, len(news_df))):
    text = news_df['Cleaned'].iloc[i][:100] + "..." if len(news_df['Cleaned'].iloc[i]) > 100 else news_df['Cleaned'].iloc[i]
    label = news_df['Label'].iloc[i] if 'Label' in news_df.columns else "?"
    print(f"  {i+1}. Label: {label} | Text: {text}")

=== VERİ ÖN İŞLEME - ORİJİNAL PİPELİNE ===
✓ 1989 örnek işlendi
✓ Ortalama metin uzunluğu: 2764.4 karakter

 İlk 3 temizlenmiş metin örneği:
  1. Label: 0 | Text: georgia downs two russian warplanes as countries move to brink of war breaking: musharraf to be impe...
  2. Label: 1 | Text: why wont america and nato help us? if they wont help us now, why did we help them in iraq? bush puts...
  3. Label: 0 | Text: remember that adorable NUMBERyearold who sang at the opening ceremonies? that was fake, too. russia ...


## Feature Extraction - Orijinal Sıralama

In [10]:
print("=== FEATURE EXTRACTION - ORİJİNAL SIRALAMA ===")

print(" Adım 2a: Linguistik özellikler çıkarılıyor...")
ling_df = pd.DataFrame(news_df['Cleaned'].apply(linguistic_features).tolist(), 
                      columns=["word_count", "avg_word_len", "punct_count", 
                              "cap_ratio", "digit_ratio", "flesch", "ari"])
print(f"   {ling_df.shape[1]} linguistik özellik çıkarıldı")

print(" Adım 2b: Semantik özellikler çıkarılıyor...")
sia = SentimentIntensityAnalyzer()
sentiment_df = news_df['Cleaned'].apply(lambda x: pd.Series(sia.polarity_scores(x)))
print(f"   {sentiment_df.shape[1]} sentiment özellik çıkarıldı")

fin_kw_df = pd.DataFrame(news_df['Cleaned'].apply(financial_keyword_density).tolist(), 
                        columns=[f'kw_{k}' for k in financial_keywords])
print(f"   {fin_kw_df.shape[1]} finansal anahtar kelime özellik çıkarıldı")

ner_df = pd.DataFrame(news_df['Cleaned'].apply(ner_features).tolist(), 
                     columns=["PERSON", "ORG", "GPE", "MONEY"])
print(f"   {ner_df.shape[1]} NER özellik çıkarıldı")

print(" Adım 2c: POS tag özellikleri çıkarılıyor...")
pos_df = pd.DataFrame(news_df['Cleaned'].apply(pos_features_spacy).tolist(), 
                     columns=["noun_ratio", "verb_ratio", "adj_ratio", "adv_ratio"])
print(f"  ✓ {pos_df.shape[1]} POS özellik çıkarıldı")

print(" Adım 2d: TF-IDF özellikleri çıkarılıyor...")
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), min_df=2, max_df=0.95, stop_words='english')
tfidf_matrix = tfidf.fit_transform(news_df['Cleaned'])
pca = PCA(n_components=50)
tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())
tfidf_df = pd.DataFrame(tfidf_pca, columns=[f'pca_{i}' for i in range(tfidf_pca.shape[1])])
print(f"   {tfidf_df.shape[1]} TF-IDF+PCA özellik çıkarıldı")

features = pd.concat([ling_df, sentiment_df, pos_df, tfidf_df], axis=1)
labels = news_df['Label']

print(f"\n Toplam {features.shape[1]} özellik çıkarıldı (orijinal sıralama ile)!")
print(f" Feature matrix boyutu: {features.shape}")
print(f" Labels boyutu: {labels.shape}")

print("\n Feature İstatistikleri:")
print(f"  - Linguistik: {ling_df.shape[1]} özellik")
print(f"  - Sentiment: {sentiment_df.shape[1]} özellik")
print(f"  - POS Tags: {pos_df.shape[1]} özellik")
print(f"  - TF-IDF+PCA: {tfidf_df.shape[1]} özellik")
print(f"  - Toplam: {features.shape[1]} özellik")






=== FEATURE EXTRACTION - ORİJİNAL SIRALAMA ===
 Adım 2a: Linguistik özellikler çıkarılıyor...
   7 linguistik özellik çıkarıldı
 Adım 2b: Semantik özellikler çıkarılıyor...
   4 sentiment özellik çıkarıldı
   6 finansal anahtar kelime özellik çıkarıldı
   4 NER özellik çıkarıldı
 Adım 2c: POS tag özellikleri çıkarılıyor...
  ✓ 4 POS özellik çıkarıldı
 Adım 2d: TF-IDF özellikleri çıkarılıyor...
   50 TF-IDF+PCA özellik çıkarıldı

 Toplam 65 özellik çıkarıldı (orijinal sıralama ile)!
 Feature matrix boyutu: (1989, 65)
 Labels boyutu: (1989,)

 Feature İstatistikleri:
  - Linguistik: 7 özellik
  - Sentiment: 4 özellik
  - POS Tags: 4 özellik
  - TF-IDF+PCA: 50 özellik
  - Toplam: 65 özellik


## Feature Scaling ve Polynomial Features - Orijinal Pipeline

In [11]:
print("=== FEATURE SCALING VE POLYNOMIAL FEATURES ===")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features.values)
print(f" Feature scaling tamamlandı: {X_scaled.shape}")

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_scaled)
print(f" Polynomial features oluşturuldu: {X_poly.shape}")

memory_usage_mb = X_poly.nbytes / (1024 * 1024)
print(f" Memory kullanımı: {memory_usage_mb:.1f} MB")

if memory_usage_mb > 1000:
    print(" Yüksek memory kullanımı tespit edildi!")
else:
    print(" Memory kullanımı kabul edilebilir seviyede")

print(f"\n Final feature pipeline:")
print(f"  Original features: {features.shape}")
print(f"  After scaling: {X_scaled.shape}")
print(f"  After polynomial: {X_poly.shape}")

=== FEATURE SCALING VE POLYNOMIAL FEATURES ===
 Feature scaling tamamlandı: (1989, 65)
 Polynomial features oluşturuldu: (1989, 2146)
 Memory kullanımı: 32.6 MB
 Memory kullanımı kabul edilebilir seviyede

 Final feature pipeline:
  Original features: (1989, 65)
  After scaling: (1989, 65)
  After polynomial: (1989, 2146)


## Model Test ve Tahmin

In [None]:
print("=== MODEL TEST VE TAHMİN ===")

if ensemble_model is None:
    print(" Model yüklenmediği için tahmin yapılamıyor!")
    print("Önce training notebook'unu çalıştırarak modeli eğitin.")
else:
    try:
        X_train, X_test, y_train, y_test = train_test_split(X_poly, labels, test_size=0.2, random_state=42)
        print(f" Train-test split yapıldı:")
        print(f"  Train: {X_train.shape}")
        print(f"  Test: {X_test.shape}")
        
        print("\n Ensemble model ile tahminler yapılıyor...")
        y_pred = ensemble_model.predict(X_test)
        
        print(f"\n MODEL BİLGİLERİ:")
        print(f"  Model tipi: {type(ensemble_model).__name__}")
        if hasattr(ensemble_model, 'estimators'):
            print(f"  Base estimators: {len(ensemble_model.estimators)}")
            for name, estimator in ensemble_model.estimators:
                print(f"    - {name}: {type(estimator).__name__}")
        
        def evaluate_model(model, X_test, y_test, y_pred):
            """Kapsamlı model değerlendirmesi"""
            
            print("\n=== MODEL PERFORMANSI ===")
            print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
            print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
            print("\nDetaylı Rapor:")
            print(classification_report(y_test, y_pred))
            
            cm = confusion_matrix(y_test, y_pred)
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
            plt.title('Confusion Matrix')
            plt.ylabel('Gerçek')
            plt.xlabel('Tahmin')
            plt.show()
            
            return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
        
        accuracy, f1 = evaluate_model(ensemble_model, X_test, y_test, y_pred)
        
        print(f"\n SONUÇ ÖZETİ:")
        print(f"   Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"   Test F1 Score: {f1:.4f}")
        
        baseline_accuracy = max(pd.Series(y_test).value_counts(normalize=True))
        improvement = accuracy - baseline_accuracy
        print(f"  Baseline Accuracy: {baseline_accuracy:.4f}")
        print(f"  Model İyileştirmesi: +{improvement:.4f} ({improvement*100:.2f} pp)")
        
        if accuracy > 0.55:
            print(f"   Çok iyi performans! (>55%)")
        elif accuracy > 0.50:
            print(f"   Kabul edilebilir performans (50-55%)")
        else:
            print(f"   Düşük performans (<50%)")
            
        if improvement > 0.05:
            print(f"   Baseline'dan anlamlı iyileştirme sağlandı!")
        else:
            print(f"   Baseline'dan sınırlı iyileştirme")
        
    except Exception as e:
        print(f" Tahmin sırasında hata oluştu: {e}")
        import traceback
        traceback.print_exc()

=== MODEL TEST VE TAHMİN ===
✓ Train-test split yapıldı:
  Train: (1591, 2146)
  Test: (398, 2146)

 Ensemble model ile tahminler yapılıyor...
 Tahmin sırasında hata oluştu: X has 2146 features, but MLPClassifier is expecting 2851 features as input.


Traceback (most recent call last):
  File "C:\Users\halil melih\AppData\Local\Temp\ipykernel_18404\102267486.py", line 14, in <module>
    y_pred = ensemble_model.predict(X_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\halil melih\AppData\Roaming\Python\Python312\site-packages\sklearn\ensemble\_voting.py", line 422, in predict
    maj = np.argmax(self.predict_proba(X), axis=1)
                    ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\halil melih\AppData\Roaming\Python\Python312\site-packages\sklearn\ensemble\_voting.py", line 463, in predict_proba
    self._collect_probas(X), axis=0, weights=self._weights_not_none
    ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\halil melih\AppData\Roaming\Python\Python312\site-packages\sklearn\ensemble\_voting.py", line 438, in _collect_probas
    return np.asarray([clf.predict_proba(X) for clf in self.estimators_])
                       ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\halil melih\AppData\Roaming\Python\Python312\site-packages