In [19]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
import joblib
import warnings
warnings.filterwarnings('ignore')

In [20]:
def load_data():
    questions = pd.read_csv("Questions.csv", encoding='latin1')
    tags = pd.read_csv("Tags.csv", encoding='latin1')
    
    # Convert tags to list per question
    tags_grouped = tags.groupby('Id')['Tag'].apply(list).reset_index()
    return pd.merge(questions, tags_grouped, on='Id', how='inner')

df = load_data()

In [21]:
def clean_text(text):
    if pd.isna(text):
        return ""
    
    text = str(text)
    # Preserve code blocks
    text = re.sub(r'<code>(.*?)</code>', r' CODEBLOCK \1 CODEBLOCK ', text, flags=re.DOTALL)
    # Remove other HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep useful punctuation
    text = re.sub(r'[^\w\s.,!?]', ' ', text)
    # Normalize
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

print("Cleaning text...")
df['Text'] = df['Title'].apply(clean_text) + ' ' + df['Body'].apply(clean_text)

Cleaning text...


In [38]:
# Add this right after your text cleaning step
df['Text'] = df['Text'].apply(lambda x: re.sub(r'(&lt;|&gt;)', ' ', x))  # Clean HTML entities

In [35]:
tag_counts = Counter(tag for tags in df['Tag'] for tag in tags)
top_tags = [tag for tag, _ in tag_counts.most_common(10)]
print(f"Top 10 tags: {top_tags}")

df['FilteredTags'] = df['Tag'].apply(lambda tags: [tag for tag in tags if tag in top_tags])
df = df[df['FilteredTags'].map(len) > 0]

Top 10 tags: ['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios']


In [36]:
print("Adding features...")
def add_features(df):
    # Existing features
    df = df.copy()
    df['CodeBlockCount'] = df['Body'].astype(str).str.count('CODEBLOCK')
    df['CodeLength'] = df['Body'].astype(str).str.extract(r'CODEBLOCK (.*?) CODEBLOCK').str.len().fillna(0)
    df['JsFeatures'] = df['Text'].str.count('function|=>|var |let ')
    
    # Enhanced HTML features (NEW)
    df['HtmlTags'] = df['Text'].str.count(r'<\w+')  # Count HTML tag openings
    df['CssProps'] = df['Text'].str.count(r'\{[^}]+\}')  # Count CSS blocks
    df['InlineStyles'] = df['Text'].str.count('style="')  # Count inline styles
    df['HtmlFeatures'] = df['HtmlTags'] + df['CssProps'] + df['InlineStyles']  # Combined feature
    
    return df

# Safely apply feature engineering
try:
    df = add_features(df)
    print("Features added successfully!")
except Exception as e:
    print(f"Error adding features: {str(e)}")
    print("Available columns:", df.columns.tolist())

Adding features...
Error adding features: 'DataFrame' object has no attribute 'str'
Available columns: ['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body', 'Tag', 'Text', 'FilteredTags', 'CodeBlockCount', 'CodeLength', 'HtmlFeatures', 'JsFeatures']


In [39]:
mlb = MultiLabelBinarizer(classes=top_tags)
y = mlb.fit_transform(df['FilteredTags'])
X_text = df['Text']
X_features = df[['CodeBlockCount', 'CodeLength', 'HtmlFeatures', 'JsFeatures']]

# Train-test split
X_train_t, X_test_t, X_train_f, X_test_f, y_train, y_test = train_test_split(
    X_text, X_features, y, test_size=0.2, random_state=42
)

In [40]:
import scipy.sparse as sp

print("Vectorizing text with memory optimization...")

# 1. Vectorize text (keep sparse format)
text_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,  # Reduced from 25k
        ngram_range=(1, 2),
        stop_words='english',
        sublinear_tf=True
    ))
])

X_train_text = text_pipe.fit_transform(X_train_t)
X_test_text = text_pipe.transform(X_test_t)

# 2. Convert feature columns to sparse format
X_train_f_sparse = sp.csr_matrix(X_train_f)
X_test_f_sparse = sp.csr_matrix(X_test_f)

# 3. Combine using sparse hstack (memory efficient)
X_train = sp.hstack([X_train_text, X_train_f_sparse], format='csr')
X_test = sp.hstack([X_test_text, X_test_f_sparse], format='csr')

# 4. Use sparse-compatible classifier
model = OneVsRestClassifier(
    LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        penalty='l2'
    ),
    n_jobs=-1
)

# 5. Train directly on sparse data
print("Training on sparse matrices...")
model.fit(X_train, y_train)

# Save memory-efficient artifacts
joblib.dump({
    'model': model,
    'text_pipe': text_pipe,
    'mlb': mlb,
    'sparse': True  # Flag for prediction-time handling
}, 'sparse_model.joblib', compress=3)

Vectorizing text with memory optimization...
Training on sparse matrices...


['sparse_model.joblib']

In [41]:
print("Training model...")
model = OneVsRestClassifier(
    LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
        C=0.9,
        penalty='l2'
    ),
    n_jobs=-1
)
model.fit(X_train, y_train)

Training model...


In [42]:
print("Optimizing thresholds...")
y_proba = model.predict_proba(X_test)
optimal_thresholds = {}
for i, tag in enumerate(mlb.classes_):
    precision, recall, thresholds = precision_recall_curve(y_test[:,i], y_proba[:,i])
    f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
    optimal_thresholds[tag] = thresholds[np.argmax(f1)]

Optimizing thresholds...


In [46]:
y_pred = np.zeros_like(y_proba)
for i, tag in enumerate(mlb.classes_):
    y_pred[:,i] = (y_proba[:,i] >= optimal_thresholds[tag]).astype(int)

In [47]:
print("\nEnhanced Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


Enhanced Classification Report:
              precision    recall  f1-score   support

  javascript       0.76      0.86      0.81     24832
        java       0.88      0.83      0.85     23106
          c#       0.87      0.88      0.88     20226
         php       0.91      0.92      0.92     19637
     android       0.96      0.93      0.94     18274
      jquery       0.83      0.81      0.82     15703
      python       0.97      0.94      0.95     12824
        html       0.59      0.70      0.64     11966
         c++       0.88      0.85      0.86      9483
         ios       0.93      0.92      0.93      9466

   micro avg       0.85      0.87      0.86    165517
   macro avg       0.86      0.86      0.86    165517
weighted avg       0.86      0.87      0.86    165517
 samples avg       0.86      0.89      0.86    165517



In [48]:
print("\nSaving model artifacts...")
joblib.dump(model, 'model.joblib')
joblib.dump(text_pipe, 'text_pipe.joblib')
joblib.dump(mlb, 'mlb.joblib')
joblib.dump(optimal_thresholds, 'thresholds.joblib')

print("\nOptimized model training complete!")


Saving model artifacts...

Optimized model training complete!
