In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
import json

# Make sure the NLTK stop-word corpus is available locally, then keep the
# English entries in a set so they can be removed with set arithmetic.
nltk.download('stopwords', quiet=True)
stop_words = {*stopwords.words('english')}

# Step 1: Load the JSONL file
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8 and every
            non-blank line must contain one complete JSON document.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and line number.
    """
    records = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            payload = line.strip()
            if not payload:
                continue
            try:
                records.append(json.loads(payload))
            except json.JSONDecodeError as exc:
                # Same exception type as before, but now it says where.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records

# Location of the raw lead records (JSON Lines); point this at your own file.
file_path = '/content/b2b_lead_data_india.jsonl'
print("Loading data from JSONL file...")
data = load_jsonl(file_path)
print(f"Loaded {len(data)} records from {file_path}.")

# One DataFrame row per loaded record.
print("Converting data to DataFrame...")
df = pd.DataFrame(data)
print(f"DataFrame created with shape: {df.shape}")

# Step 2: Preprocess the data
print("Preprocessing data...")
# The organisation summary is the only company text used further down.
df['company_text'] = df['org_summary']

# Contact-presence flags: 1 when contact_info is a dict holding a non-None
# value for the field, 0 in every other case (missing dict, missing key, None).
for flag_column, contact_key in (
    ('has_contact_title', 'contact_title'),
    ('has_phone', 'phone'),
    ('has_email', 'email'),
):
    df[flag_column] = df['contact_info'].apply(
        lambda info, key=contact_key: int(isinstance(info, dict) and info.get(key) is not None)
    )

# Add keyword overlap feature
def keyword_overlap(query, summary):
    """Return the share of query keywords that also appear in the summary.

    Both texts are lower-cased and split on whitespace; stop words (the
    module-level ``stop_words`` set) are removed from the query before the
    comparison. Tokens are compared verbatim, so punctuation attached to a
    word is part of the token.

    Args:
        query: The user query text.
        summary: The organisation summary text.

    Returns:
        float: ``|query keywords found in summary| / |query keywords|`` in
        the range 0.0-1.0. Returns 0.0 when the query has no keywords left
        after stop-word removal, or when either argument is not a string
        (e.g. ``None`` or the ``NaN`` pandas uses for a missing field).
    """
    # Missing text carries no keywords; report "no overlap" instead of
    # failing on .lower().
    if not isinstance(query, str) or not isinstance(summary, str):
        return 0.0

    query_words = set(query.lower().split()) - stop_words
    if not query_words:
        return 0.0

    # query_words is already free of stop words, so intersecting with the raw
    # summary tokens gives the same result as filtering the summary first.
    summary_words = set(summary.lower().split())
    return len(query_words & summary_words) / len(query_words)

# Keyword-overlap score for every row, computed from the query / summary pair.
df['keyword_overlap'] = [
    keyword_overlap(query_text, summary_text)
    for query_text, summary_text in zip(df['original_user_query'], df['org_summary'])
]

# Binary target: "Good Fit" feedback becomes 1, every other value becomes 0.
df['label'] = df['user_feedback'].eq("Good Fit").astype(int)
print("Preprocessing completed. Features extracted and labels mapped.")

# Step 3: Generate embeddings using Hugging Face Sentence Transformers
print("Initializing SentenceTransformer model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Queries and company texts (org_summary only) are encoded separately, in
# DataFrame row order, so row i of both results describes the same lead.
print("Generating embeddings for queries...")
query_texts = df['original_user_query'].tolist()
query_embeddings = embedder.encode(query_texts, convert_to_tensor=False)
print("Generating embeddings for company texts...")
company_texts = df['company_text'].tolist()
company_embeddings = embedder.encode(company_texts, convert_to_tensor=False)

# Compute cosine similarity with zero norm handling
print("Computing cosine similarities...")
def safe_cosine_sim(q, c):
    """Cosine similarity of two vectors, with 0.0 for zero-norm input.

    Args:
        q: First vector.
        c: Second vector.

    Returns:
        ``dot(q, c) / (|q| * |c|)``, or ``0.0`` when either norm is zero.
    """
    q_norm, c_norm = np.linalg.norm(q), np.linalg.norm(c)
    if q_norm != 0 and c_norm != 0:
        return np.dot(q, c) / (q_norm * c_norm)
    # A zero vector has no direction; avoid the division by zero.
    return 0.0

# One similarity value per lead: row i of the query embeddings against row i
# of the company embeddings.
cosine_sim = np.array([
    safe_cosine_sim(query_vec, company_vec)
    for query_vec, company_vec in zip(query_embeddings, company_embeddings)
])
print("Cosine similarities computed.")

# Combine embeddings and additional features
print("Combining features...")
X_embeddings = np.concatenate((query_embeddings, company_embeddings), axis=1)
X_additional = df[['has_contact_title', 'has_phone', 'has_email']].to_numpy()
# Scalar features become single-column matrices so they can be joined column-wise.
X_sim = cosine_sim[:, np.newaxis]
X_overlap = df['keyword_overlap'].to_numpy()[:, np.newaxis]

# Column layout: query embedding | company embedding | three contact flags |
# cosine similarity | keyword overlap.
X = np.concatenate((X_embeddings, X_additional, X_sim, X_overlap), axis=1)
print(f"Feature matrix X created with shape: {X.shape}")

y = df['label'].to_numpy()
print(f"Labels y created with shape: {y.shape}")

# Step 4: Split data
# The split happens before scaling so the scaler can be fitted on the training
# rows alone. Fitting it on every row would leak held-out statistics into the
# preprocessing and into the scaler that is later saved for inference.
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train set: {X_train.shape[0]} samples, Test set: {X_test.shape[0]} samples")

# Step 5: Scale features (statistics learned from the training split only)
print("Scaling features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep the full matrix in the same scaled space for the code below that still
# works on X.
X = scaler.transform(X)
print("Features scaled.")

# Step 6: Handle imbalance with SMOTE
# Only the training split is oversampled; the test split keeps its original
# class distribution.
print("Applying SMOTE oversampling...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled train shapes: X={X_train_resampled.shape}, y={np.bincount(y_train_resampled)}")

# Step 7: Hyperparameter tuning with GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline

print("Running GridSearchCV for hyperparameter tuning...")
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1],
    'scale_pos_weight': [1, 2, 3]
}

# SMOTE is a pipeline step, so inside the search it is fitted on each fold's
# training part only and every validation fold contains real samples.
# Oversampling before the search would place synthetic neighbours of
# validation rows into the training folds and inflate the CV score.
tuning_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', XGBClassifier(random_state=42, eval_metric='logloss')),
])

grid_search = GridSearchCV(
    tuning_pipeline,
    # Route every hyperparameter to the classifier step of the pipeline.
    {f'clf__{name}': values for name, values in param_grid.items()},
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# The best pipeline is refitted on the whole training split (oversampling,
# then classifier). Only the classifier is exposed, so `model` stays a plain
# XGBClassifier for the evaluation, saving and prediction code.
model = grid_search.best_estimator_.named_steps['clf']
best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
print(f"Best params: {best_params}")
print(f"Best CV F1-macro: {grid_search.best_score_:.3f}")

# Step 8: Cross-validation on full dataset for robust evaluation
# Five shuffled, stratified folds over all rows of X / y, scored by accuracy.
print("Running cross-validation on full dataset...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
mean_accuracy, accuracy_spread = cv_scores.mean(), cv_scores.std() * 2
print(f"Cross-Validation Accuracy: {mean_accuracy:.3f} (+/- {accuracy_spread:.3f})")

# Step 9: Evaluate the tuned model on test set
print("Evaluating tuned model on test set...")
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of label 1.
y_pred_proba = model.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, zero_division=0)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)
print("Predicted Probabilities (Fit Scores) for Test Set:\n", y_pred_proba)

# Step 10: Save the model and scaler
# Both objects are needed together at inference time: the scaler transforms
# the raw feature vector before the model scores it.
import joblib
for artifact, artifact_path in (
    (model, 'xgboost_lead_scorer_optimized.pkl'),
    (scaler, 'feature_scaler_optimized.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("Optimized model and scaler saved successfully!")

# Step 11: Function for prediction on new leads
def predict_fit_score(new_query, new_company_data):
    """Score how well a company matches a lead query, as a percentage.

    Rebuilds the feature layout used for training (query embedding, company
    embedding, three contact-presence flags, cosine similarity, keyword
    overlap), scales it with the module-level ``scaler`` and asks the
    module-level ``model`` for the probability of label 1.

    Args:
        new_query: The lead query text.
        new_company_data: Mapping with an ``'org_summary'`` text and,
            optionally, a ``'contact_info'`` dict that may hold
            ``'contact_title'``, ``'phone'`` and ``'email'``.

    Returns:
        float: Probability of label 1 multiplied by 100.

    Raises:
        KeyError: If ``new_company_data`` has no ``'org_summary'`` entry.
    """
    print("Predicting fit score for new lead...")
    new_company_text = new_company_data['org_summary']

    # A missing or non-dict contact_info means "no contact details", which is
    # how the training rows are flagged as well.
    contact_info = new_company_data.get('contact_info')
    if not isinstance(contact_info, dict):
        contact_info = {}
    # Order matters: has_contact_title, has_phone, has_email.
    contact_flags = [
        int(contact_info.get(field) is not None)
        for field in ('contact_title', 'phone', 'email')
    ]

    print("Encoding new query and company text...")
    new_query_emb = embedder.encode([new_query])[0]
    new_company_emb = embedder.encode([new_company_text])[0]

    print("Computing cosine similarity for new lead...")
    new_cosine_sim = safe_cosine_sim(new_query_emb, new_company_emb)

    new_overlap = keyword_overlap(new_query, new_company_text)

    print("Combining features for new lead...")
    # Single row with the same column order as the training matrix.
    new_X = np.hstack((
        new_query_emb,
        new_company_emb,
        contact_flags,
        [new_cosine_sim, new_overlap],
    )).reshape(1, -1)
    new_X = scaler.transform(new_X)

    # Convert to a built-in float so callers get a plain number rather than a
    # NumPy scalar.
    fit_score = float(model.predict_proba(new_X)[0][1] * 100)
    print(f"Fit score calculated: {fit_score:.2f}%")
    return fit_score

# Example usage: score one hand-written lead with the model trained above.
print("Running example prediction...")
new_query = "Find textile manufacturing companies in Bangalore looking for sustainable dyeing solutions."
new_company_data = dict(
    org_summary="EcoFabrics is a Bangalore-based textile manufacturer focusing on sustainable dyeing and organic cotton fabrics.",
    contact_info=dict(
        email="sales@ecofabrics.com",
        phone="+91-80-1234-5678",
        contact_title="Sustainability Manager",
    ),
)
score = predict_fit_score(new_query, new_company_data)
print(f"Fit Score for new lead: {score:.2f}%")