In [None]:
# 06_Validation_Prediction.ipynb
import pandas as pd
from config import *
from utils.init import *
import joblib

# Validation prediction script: load the processed validation set, the fitted
# vectorizers and the best model, predict a label for every row, and write the
# original validation file back out with the predicted 'label' column.
from scipy.sparse import hstack

# Load validation data and best model
print("Loading validation data and best model...")
val_df = pd.read_csv(PROCESSED_VAL_PATH)
original_val_df = pd.read_csv(VALIDATION_DATA_PATH)
best_model = load_model(MODELS_DIR / 'best_model.pkl')

# Load vectorizers
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# produced by this project.
title_vectorizer = joblib.load(MODELS_DIR / 'title_vectorizer.pkl')
text_vectorizer = joblib.load(MODELS_DIR / 'text_vectorizer.pkl')

print(f"Validation data shape: {val_df.shape}")

# Predictions are attached to the original file by position, so the processed
# and original frames must have the same number of rows. Check this up front:
# comparing the submission frame with the frame it was copied from can never
# fail, and an `assert` would be stripped under `python -O`.
if len(val_df) != len(original_val_df):
    raise ValueError(
        "Prediction length mismatch! Processed validation data has "
        f"{len(val_df)} rows but the original validation data has "
        f"{len(original_val_df)} rows."
    )

# Prepare validation features
print("Preparing validation features...")
# Empty cells are read back from CSV as NaN, which the vectorizers reject;
# treat them as empty documents instead.
val_title_tfidf = title_vectorizer.transform(val_df['title_clean'].fillna(''))
val_text_tfidf = text_vectorizer.transform(val_df['text_clean'].fillna(''))

# Title features first, then text features.
# NOTE(review): assumes the model was trained on features stacked in this same
# order - confirm against the training notebook.
val_features = hstack([val_title_tfidf, val_text_tfidf])

print(f"Validation features shape: {val_features.shape}")

# Make predictions
print("Making predictions...")
predictions = best_model.predict(val_features)

# Create submission file: original columns with 'label' set to the predictions
submission_df = original_val_df.copy()
submission_df['label'] = predictions

# Save predictions
submission_path = PREDICTIONS_DIR / 'validation_predictions.csv'
# Make sure the output directory exists so a fresh checkout does not fail here.
# NOTE(review): assumes PREDICTIONS_DIR is a pathlib.Path - confirm in config.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"Predictions saved to: {submission_path}")
print("\nPrediction distribution:")
print(submission_df['label'].value_counts())

# Accuracy estimation (based on test performance)
# Hard-coded figure, not computed in this cell.
# NOTE(review): presumably copied from the test-set evaluation of the best
# model - update it whenever the model is retrained.
test_accuracy = 0.996569
print(f"\nEstimated accuracy on validation set: {test_accuracy:.4%}")
print("Note: This is an estimate based on test set performance.")

# Sample of predictions
print("\nSample predictions:")
sample = submission_df.head(10)[['title', 'label']]
print(sample)

Loading validation data and best model...
Validation data shape: (4956, 7)
Preparing validation features...
Validation features shape: (4956, 7500)
Making predictions...
Predictions saved to: c:\Users\Amin\Documents\Ironhack_projects\project-nlp-challenge\predictions\validation_predictions.csv

Prediction distribution:
label
0    3492
1    1464
Name: count, dtype: int64

Estimated accuracy on validation set: 99.6569%
Note: This is an estimate based on test set performance.

Sample predictions:
                                               title  label
0  UK's May 'receiving regular updates' on London...      1
1  UK transport police leading investigation of L...      1
2  Pacific nations crack down on North Korean shi...      1
3  Three suspected al Qaeda militants killed in Y...      1
4  Chinese academics prod Beijing to consider Nor...      1
5  Flames raced along train at west London statio...      1
6  London police advise people to avoid area near...      1
7  London ambulance s