In [None]:
from datasets import load_dataset
ds = load_dataset("open-r1/codeforces", "default")

In [None]:
import pandas as pd
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()
train_df

In [None]:
test_df

In [None]:
# Stack the train and test splits into a single frame.
# ignore_index=True gives every row a unique label; without it each split keeps
# its own 0..n-1 labels and the combined index contains duplicates, which makes
# later label-based alignment ambiguous.
data=pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(data.shape)
data

In [None]:
print(data.shape,data.columns)

In [None]:
# Keep only the rows whose 'interaction_format' is missing
# (presumably the non-interactive problems — confirm against the dataset card).
data = data[data['interaction_format'].isna()]
print(data.shape)

In [None]:
# Columns carried forward into feature engineering; everything else is dropped.
selected_columns = [
    'id',
    'index',
    'time_limit',
    'memory_limit',
    'description',
    'input_format',
    'output_format',
    'examples',
    'rating',
    'tags',
    'testset_size',
    'official_tests'
]

# .copy() detaches the selection from the filtered parent frame. Later cells add
# columns and drop rows on `data`; doing that on a slice triggers pandas'
# SettingWithCopyWarning and may act on a temporary object.
data = data[selected_columns].copy()
print(data.shape)
print(data.columns)

In [None]:
data.isna().sum()

In [None]:
# Drop every row that has a missing value in any remaining column.
# Reassigning (instead of inplace=True) yields a fresh frame, so the drop never
# operates on a slice of an earlier frame.
data = data.dropna()
print(data.shape)

- Even after preprocessing, the dataset still contains 9000+ rows, which is sufficient for training the model.
- Since the data is collected from real-world sources, the missing values do not follow a clear pattern and are difficult to fill reliably using any imputation technique.

- Therefore, rows with missing values are safely removed to avoid introducing incorrect or misleading information into the model, while still retaining enough data for effective training.

### Based on CP domain knowledge
- i) **Rating ≤ 1200 → Easy**
These problems typically require basic data structures, simple logic, or straightforward implementation and are solvable by beginners.

- ii) **1200 < Rating ≤ 1800 → Medium**
These problems usually involve intermediate algorithms, careful observations, or standard problem-solving techniques that require practice and experience.

- iii) **Rating > 1800 → Hard**
These problems often demand advanced algorithms, mathematical insights, or non-trivial optimizations, and are generally targeted at experienced participants.

In [None]:
def get_problem_class(rating):
    """Bucket a numeric rating into "Easy", "Medium" or "Hard".

    Ratings up to and including 1200 are "Easy", ratings up to and including
    1800 are "Medium", and anything else is "Hard".
    """
    if rating <= 1200:
        return "Easy"
    if rating <= 1800:
        return "Medium"
    return "Hard"
data['problem_class'] = data['rating'].apply(get_problem_class)
data['problem_class']

In [None]:
data['problem_class'].value_counts()

In [None]:
def rating_to_score(rating, min_rating=800, max_rating=3500):
    """Linearly map a rating from [min_rating, max_rating] onto a 0-10 score.

    The rating is first clipped to the given bounds so the score never leaves
    [0, 10]; the result is rounded to two decimals.
    """
    # Clip from above, then from below.
    clipped = min(rating, max_rating)
    clipped = max(min_rating, clipped)

    span = max_rating - min_rating
    return round(10 * (clipped - min_rating) / span, 2)
data['problem_score']=data['rating'].apply(rating_to_score)
data

In [None]:
print(data['index'].unique(),len(data['index'].unique()))

In [None]:
import re
import string
import pandas as pd

# Position of each problem letter in the alphabet (A=1, B=2, ...).
LETTER_SCORE = {ch: i + 1 for i, ch in enumerate(string.ascii_uppercase)}
# 'Z' doubles as the "no letter" sentinel and scores 0, e.g. for numeric-only
# indices like 01, 02. NOTE: a real problem labelled 'Z' therefore also scores 0.
LETTER_SCORE['Z'] = 0

# At most ONE uppercase letter followed by an optional number. The letter group
# is limited to a single character because LETTER_SCORE only has single-letter
# keys; a multi-letter prefix (e.g. "AA") used to be captured and then raised
# KeyError in index_to_score.
_INDEX_PATTERN = re.compile(r'^([A-Z])?(\d+)?$')

def parse_problem_index(idx):
    """Split a problem index such as "B1" into ``(letter, number)``.

    Parameters
    ----------
    idx : any
        Problem index; converted with ``str`` and stripped of whitespace.

    Returns
    -------
    tuple[str, int]
        The letter ('Z' when absent) and the numeric suffix (0 when absent).
        Anything that does not match "<letter?><digits?>" falls back to
        ``('Z', 0)``.
    """
    match = _INDEX_PATTERN.match(str(idx).strip())
    if match is None:
        return 'Z', 0

    letter = match.group(1) or 'Z'
    number = int(match.group(2)) if match.group(2) else 0
    return letter, number

def index_to_score(idx):
    """Turn a problem index into an ordinal score: letter position * 10 + number."""
    letter, number = parse_problem_index(idx)
    return LETTER_SCORE[letter] * 10 + number

data['index_score'] = data['index'].apply(index_to_score)

print("Unique index scores:", data['index_score'].nunique(),data['index_score'].unique())

In [None]:
data.drop(['index'],axis=1,inplace=True)
print(data.shape,data.columns)

In [None]:
# Merge the three text fields into one statement. A newline is inserted between
# the parts; plain `+` glued the last word of one field to the first word of the
# next, which distorts word counts and produces fused tokens.
data['problem_statement'] = (
    data['description'] + '\n' + data['input_format'] + '\n' + data['output_format']
)
data['problem_statement']

In [None]:
def text_length(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

def avg_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the non-blank pieces obtained by splitting on '.', '!' or '?'.
    Returns 0 when there is no non-blank sentence.
    """
    word_counts = []
    for piece in re.split(r'[.!?]', text):
        piece = piece.strip()
        if piece:
            word_counts.append(len(piece.split()))

    if not word_counts:
        return 0

    return np.mean(word_counts)

# Character class of the operator and parenthesis symbols counted as "formula" symbols.
FORMULA_SYMBOLS = r'[=<>+\-*/%^()]'
def formula_symbol_count(text):
    """Count how many characters of ``text`` match FORMULA_SYMBOLS."""
    return sum(1 for _ in re.finditer(FORMULA_SYMBOLS, text))

def extract_text_features(text):
    """Bundle the three text statistics of ``text`` into a labelled pandas Series."""
    stats = {}
    stats["text_length"] = text_length(text)
    stats["avg_sentence_length"] = avg_sentence_length(text)
    stats["formula_symbol_count"] = formula_symbol_count(text)
    return pd.Series(stats)

In [None]:
import numpy as np
# Compute the text statistics for every problem statement; because the applied
# function returns a Series, the result is a frame with one column per statistic.
features = data["problem_statement"].apply(extract_text_features)
print(features.shape)
# Append the new feature columns next to the existing ones.
data = pd.concat([data, features], axis=1)
print(data.shape)
data

In [None]:
import pandas as pd
from collections import Counter
# Count how often each tag occurs across all problems.
# The 'tags' column is addressed by name: the previous positional lookup
# (column 8) silently breaks as soon as a column is added or removed upstream.
tag_counter=Counter()
for tags in data['tags']:
    tag_counter.update(tags)
for tag,freq in tag_counter.items():
    print(tag,freq)
print(len(tag_counter))

In [None]:
# Renumber rows 0..n-1 so the one-hot frame built below lines up with `data`.
data = data.reset_index(drop=True)
# explode() -> one row per (problem, tag); get_dummies() -> one indicator column
# per tag; groupby(level=0).sum() collapses the rows back to one per problem.
tags_df = data['tags'].explode().str.get_dummies().groupby(level=0).sum()

# Append the tag indicator columns to the main frame.
data = pd.concat([data, tags_df], axis=1)
print(tags_df.shape)

print(f"New data shape: {data.shape}")

In [None]:
import re
def parse_multi_test_example(example_list):
    """Parse the first example of a multi-test problem.

    Parameters
    ----------
    example_list : sequence of dict
        Examples; only the first one's ``'input'`` string is read.

    Returns
    -------
    tuple
        ``(lines, t_cases, max_val)``: the non-blank input lines, the integer on
        the first line, and the largest run of digits found on the remaining
        lines (0 if there is none). ``([], 0, 0)`` is returned when there is no
        example or the input has no non-blank line.

    Raises
    ------
    ValueError
        If the first non-blank line is not a single integer.
    """
    # len() rather than truthiness so array-like containers work as well.
    if len(example_list) == 0:
        return [], 0, 0

    raw_input = example_list[0]['input']
    lines = [l for l in raw_input.split('\n') if l.strip()]
    if not lines:
        return [], 0, 0

    t_cases = int(lines[0])
    # \d+ ignores minus signs, so max_val is based on digit runs only.
    all_nums = [int(n) for n in re.findall(r'\d+', " ".join(lines[1:]))]
    max_val = max(all_nums) if all_nums else 0
    return lines, t_cases, max_val


In [None]:
import numpy as np

def extract_structural_features(example_list):
    """Summarise the shape of a problem's sample inputs and outputs.

    Returns a 4-tuple:
    mean non-whitespace characters per input line, mean non-whitespace
    characters per output line, mean number of input lines, and the number of
    examples. The three means are taken across the examples.
    """
    def _side_stats(example, key):
        # Stripped text of one side -> (non-whitespace chars per line, line count).
        rows = str(example.get(key, '')).strip().split('\n')
        total_chars = sum(len(token) for row in rows for token in row.split())
        per_line = total_chars / len(rows) if rows else 0
        return per_line, len(rows)

    in_per_line, out_per_line, in_line_counts = [], [], []

    for ex in example_list:
        in_avg, in_rows = _side_stats(ex, 'input')
        out_avg, _ = _side_stats(ex, 'output')
        in_per_line.append(in_avg)
        out_per_line.append(out_avg)
        in_line_counts.append(in_rows)

    return (
        np.mean(in_per_line),     # Avg chars per line (Input)
        np.mean(out_per_line),    # Avg chars per line (Output)
        np.mean(in_line_counts),  # Avg number of lines
        len(example_list)         # Number of samples
    )

In [None]:
new_cols = ['avg_in_char_per_line', 'avg_out_char_per_line', 'avg_line_count', 'sample_count']
data[new_cols] = data['examples'].apply(lambda x: pd.Series(extract_structural_features(x)))

In [None]:
# Drop rows that picked up a NaN during feature engineering
# (presumably problems with an empty example list, whose structural means
# come out as NaN — confirm).
data.dropna(inplace=True)
print(data.shape,data.columns)
data

In [None]:
!pip install gensim

In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Lower-case and tokenize every problem statement (one token list per problem).
tokenized_statements = [word_tokenize(text.lower()) for text in data['problem_statement']]

# Train Word2Vec on this corpus itself: 300-dimensional vectors, a context
# window of 5 tokens, min_count=2 and 4 worker threads.
w2v_model = Word2Vec(sentences=tokenized_statements,
                     vector_size=300,
                     window=5,
                     min_count=2,
                     workers=4)

def get_mean_vector(tokens, model):
    """Average the word vectors of the tokens that ``model`` knows.

    Tokens missing from ``model.wv`` are skipped. If none are known, a zero
    vector of length ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

embeddings = np.array([get_mean_vector(tokens, w2v_model) for tokens in tokenized_statements])

In [None]:
print(embeddings.shape)
embeddings

### The Word2Vec model must be saved so that the web application can create embeddings for new problems

In [None]:
w2v_model.save("word2vec_problem_solver.model")

In [None]:
print(data.columns)

- i) ID column was removed because its information is already captured using index_score. Keeping both would be repetitive.

- ii) Description, input format, and output format were merged into one column called problem_statement so that all text related to the problem is present in a single place.

- iii) The problem_statement column was converted into numbers using Word2Vec embeddings. After converting it to vectors, the original text column was dropped and only the embeddings were used.

- iv) problem_score and problem_class are the values we want the model to predict, so they were removed from the input features.

- v) testset_size and official_tests are usually not available when a new problem is given. Even though they exist in the dataset, they were not used so that the model works properly during real testing.

In [None]:
# Feature matrix: everything except identifiers, raw text/list columns, the
# targets (and the rating they are derived from), and the two test-set columns.
X = data.drop(['id','description','input_format','output_format',
    'examples','rating','tags','problem_score','problem_class','testset_size',
    'official_tests','problem_statement'
], axis=1)

# Regression target (0-10 score) and classification target (Easy/Medium/Hard).
y_reg = data['problem_score']
y_class= data['problem_class']


In [None]:
y_reg

In [None]:
y_class.value_counts()

In [None]:
print(X.columns)
print(X.shape)

In [None]:
# Wrap the embedding matrix in a DataFrame that reuses X's index, so that a
# column-wise concat with X pairs the rows up by label.
embeddings=pd.DataFrame(embeddings,index=X.index)
# Name the columns emb_0, emb_1, ... so they can be identified as embeddings.
embeddings.columns = [f"emb_{i}" for i in range(embeddings.shape[1])]
print(embeddings.shape)
embeddings

In [None]:
X = pd.concat([X, embeddings], axis=1)
print(X.shape)
X

In [None]:
data['id'] = data['id'].str.replace('/', '', regex=False)

In [None]:
import pandas as pd

# Updated Domain Logic (The hierarchy of knowledge)
#This novelty is explained in detail in the report
# Hand-written domain knowledge as (head, relation, tail) triples.
# Tag names are written in the SAME normalised form that is used for problem tags
# when the "Solves" triples are built (lower-case, spaces and hyphens removed).
# Previously '2-sat' and 'meet-in-the-middle' kept their hyphens, so those nodes
# never matched the normalised tags '2sat' / 'meetinthemiddle' and stayed
# disconnected from every problem.
logic_triples = [
    # --- TOP-LEVEL CATEGORIES ---
    ['math', 'Contains', 'numbertheory'],
    ['math', 'Contains', 'combinatorics'],
    ['math', 'Contains', 'probabilities'],
    ['math', 'Contains', 'geometry'],
    ['math', 'Contains', 'matrices'],
    ['math', 'Contains', 'fft'],

    ['graphs', 'Contains', 'trees'],
    ['graphs', 'Contains', 'shortestpaths'],
    ['graphs', 'Contains', 'flows'],
    ['graphs', 'Contains', 'graphmatchings'],
    ['graphs', 'Contains', 'dfsandsimilar'],
    ['graphs', 'Contains', '2sat'],
    ['graphs', 'Contains', 'dsu'],

    ['strings', 'Contains', 'hashing'],
    ['strings', 'Contains', 'stringsuffixstructures'],
    ['strings', 'Contains', 'expressionparsing'],

    # --- ALGORITHMIC PARADIGMS ---
    ['bruteforce', 'Prerequisite', 'implementation'],
    ['sortings', 'Prerequisite', 'greedy'],
    ['greedy', 'Prerequisite', 'constructivealgorithms'],
    ['divideandconquer', 'Prerequisite', 'fft'],
    ['binarysearch', 'Prerequisite', 'ternarysearch'],
    ['bitmasks', 'Prerequisite', 'dp'],
    ['dp', 'Prerequisite', 'games'],

    # --- ADVANCED TECHNIQUES & SPECIALS ---
    ['twopointers', 'RelatesTo', 'sortings'],
    ['meetinthemiddle', 'RelatesTo', 'bruteforce'],
    ['interactive', 'RelatesTo', 'binarysearch'], # Common CP pattern
    ['chineseremaindertheorem', 'SubTopicOf', 'numbertheory'],
    ['schedules', 'SubTopicOf', 'greedy'],
    ['*special', 'Category', 'miscellaneous'],

    # --- IMPLEMENTS (Data Structures) ---
    ['datastructures', 'Implements', 'dp'],
    ['datastructures', 'Implements', 'graphs'],
    ['dsu', 'Implements', 'graphs']
]

# --- Dynamic Mapping (Solves Relation) ---
# One (tag, 'Solves', problem_id) triple per tag of every problem.
# Tag names are normalised (lower-case, spaces and hyphens removed) to match
# the spelling used in logic_triples.
problem_triples = [
    [raw_tag.lower().replace(" ", "").replace("-", ""), 'Solves', str(problem_id)]
    for problem_id, problem_tags in zip(data['id'], data['tags'])
    for raw_tag in problem_tags
]

# Knowledge graph = hand-written logic triples followed by the per-problem triples.
kg_df = pd.DataFrame(logic_triples + problem_triples, columns=['head', 'relation', 'tail'])
print(f"Created KG with {len(kg_df)} triples.")

In [None]:
!pip install pykeen

from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import torch

# Load the (head, relation, tail) rows into PyKEEN's triples format
tf = TriplesFactory.from_labeled_triples(
    kg_df.values
)

# 80/20 split of the triples for training and evaluation.
# random_state is fixed so the split (and everything trained on it) is
# reproducible, matching the seed used elsewhere in this notebook.
training, testing = tf.split([0.8, 0.2], random_state=42)

In [None]:
import torch
# Train a TransE knowledge-graph embedding model on the triples.
# random_seed pins PyKEEN's initialisation and sampling; without it the pipeline
# picks a random seed on every run, so the KG features change between runs.
result = pipeline(
    training=training,
    testing=testing,
    model='TransE',
    model_kwargs=dict(embedding_dim=128), # 128 dimensions to balance detail vs memory
    training_kwargs=dict(num_epochs=100, use_tqdm=True),
    random_seed=42,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

In [None]:
print(result)

In [None]:
transE_model = result.model
entity_to_id = tf.entity_to_id

def get_kg_embedding(entity_name, dim=128):
    """Return the trained entity embedding for ``entity_name`` as a NumPy vector.

    Parameters
    ----------
    entity_name : str
        Entity label as used in the knowledge graph (a tag or a problem id).
    dim : int, default 128
        Length of the zero vector returned for unknown entities; it should equal
        the ``embedding_dim`` the model was trained with.

    Returns
    -------
    numpy.ndarray
        The entity's embedding, or a float32 zero vector of length ``dim`` when
        the entity is not in ``entity_to_id``. The fallback is an array (not a
        Python list of ints) so every caller gets the same kind of object.
    """
    if entity_name not in entity_to_id:
        # Unknown entity: neutral all-zero embedding.
        return torch.zeros(dim).numpy()

    idx = entity_to_id[entity_name]
    entity_repr = transE_model.entity_representations[0]
    return entity_repr(indices=torch.tensor([idx])).detach().cpu().numpy()[0]

import numpy as np
import pandas as pd

# Extracting features using the renamed model
kg_features = np.array([get_kg_embedding(str(pid)) for pid in data['id']])
kg_df_features = pd.DataFrame(kg_features, columns=[f'kg_{i}' for i in range(128)])

In [None]:
print(transE_model)

In [None]:
import torch
torch.save(transE_model, "transe_model.pt")

In [None]:
import json
with open("entity_to_id.json", "w") as f:
    json.dump(entity_to_id, f)

In [None]:
print(kg_df_features.shape)
kg_df_features

In [None]:
print("The shape of X is ",X.shape)
print("The shape of kg_df_features is ",kg_df_features.shape)

In [None]:
# Put both frames on a fresh 0..n-1 index so they can be joined by position.
X_clean = X.reset_index(drop=True)
kg_clean = kg_df_features.reset_index(drop=True)

# Fail loudly on a row-count mismatch. Merely printing a message left
# X_combined undefined (or stale from an earlier run), so the error surfaced
# later as a confusing NameError or as silently misaligned features.
if len(X_clean) != len(kg_clean):
    raise ValueError(f"Mismatch Error! X: {len(X_clean)}, KG: {len(kg_clean)}")

X_combined = pd.concat([X_clean, kg_clean], axis=1)
print(f"New Shape: {X_combined.shape}")


In [None]:
X=X_combined
print(X.shape)
print(X.columns)
X

In [None]:
X = X.reset_index(drop=True)
y_reg = y_reg.reset_index(drop=True)
y_class = y_class.reset_index(drop=True)


In [None]:
print(X.shape,X.columns)

In [None]:
# Single 80/20 split shared by both tasks: the features and both targets are
# split together so their rows stay aligned. The split is stratified on the
# class label and seeded with random_state=42.
X_train, X_test, y_reg_train, y_reg_test, y_class_train, y_class_test = train_test_split(
    X,
    y_reg,
    y_class,
    test_size=0.2,
    random_state=42,
    stratify=y_class
)

In [None]:
X_train

In [None]:
y_class_train

In [None]:
y_reg_train

In [None]:
X_test

In [None]:
y_class_test

In [None]:
y_reg_test

In [None]:
print(y_class_train.value_counts(),y_class_test.value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_class_train_enc = le.fit_transform(y_class_train)
y_class_test_enc  = le.transform(y_class_test)
#will be required for XGBoost and LightGBM models


In [None]:
!pip install catboost

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Candidate classifiers, all seeded with 42; class weighting is enabled on the
# estimators that are given a class_weight / auto_class_weights argument.
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=42),
    "Decision Tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "SVM": SVC(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        auto_class_weights='Balanced',
        verbose=0,
        random_seed=42
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",
        num_class=len(le.classes_),
        eval_metric="mlogloss",
        random_state=42
    ),
    "LightGBM": LGBMClassifier(class_weight="balanced", random_state=42),
}

# Per-model classification report (as a dict), keyed by model name.
models_results = {}

for model_name, model in models.items():
    uses_encoded_labels = model_name in ["XGBoost", "LightGBM"]

    if uses_encoded_labels:
        # These two are trained on integer-encoded labels; predictions are
        # decoded back to the string classes before scoring.
        model.fit(X_train, y_class_train_enc)
        y_pred = le.inverse_transform(model.predict(X_test))
    else:
        model.fit(X_train, y_class_train)
        y_pred = model.predict(X_test)

    y_true = y_class_test

    report = classification_report(y_true, y_pred, output_dict=True)
    models_results[model_name] = report

    print(f"Model: {model_name}")
    print(classification_report(y_true, y_pred))
    print("-" * 50)


In [None]:
classification_models_results=models_results
print(classification_models_results)

In [None]:
classification_results_df = pd.DataFrame(classification_models_results)
classification_results_df

In [None]:
classification_results_df.to_csv("classification_models_results.csv", index=True)

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Display order of the classes, used for the confusion matrix AND the report.
class_order = ['Easy', 'Medium', 'Hard']

lgbm_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    class_weight='balanced',
    random_state=42
)

lgbm_model.fit(X_train, y_class_train)

y_pred = lgbm_model.predict(X_test)

accuracy = accuracy_score(y_class_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

macro_f1 = f1_score(y_class_test, y_pred, average='macro')
print(f"Macro F1: {macro_f1:.4f}")

cm = confusion_matrix(y_class_test, y_pred, labels=class_order)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_order, yticklabels=class_order)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print("\nClassification Report:")
# labels= must accompany target_names: without it scikit-learn applies the names
# to the alphabetically sorted labels (Easy, Hard, Medium), which printed the
# Hard metrics under "Medium" and the Medium metrics under "Hard".
print(classification_report(y_class_test, y_pred, labels=class_order, target_names=class_order))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Importance of every input column according to the LightGBM classifier.
importances = lgbm_model.feature_importances_
feature_names = X.columns

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

def feature_type(f):
    """Classify a column by name prefix: 'emb_' -> Embedding, 'kg_' -> KG, else Dense."""
    for prefix, label in (('emb_', 'Embedding'), ('kg_', 'KG')):
        if f.startswith(prefix):
            return label
    return 'Dense'

feature_importance_df['type'] = feature_importance_df['feature'].apply(feature_type)

# Print every feature from least to most important.
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=True)
for feat_name, feat_importance, feat_type in feature_importance_df[
        ['feature', 'importance', 'type']].itertuples(index=False):
    print(f"{feat_name} ({feat_type}): {feat_importance}")

# Plotting top 20 features according to feature_importance as seen by lgbm_model
top_20 = feature_importance_df.sort_values(by='importance', ascending=False).head(20)

plt.figure(figsize=(12, 8))
colors = {'Embedding': 'skyblue', 'KG': 'lightgreen', 'Dense': 'salmon'}
bar_colors = [colors[t] for t in top_20['type']]

bars = plt.barh(top_20['feature'], top_20['importance'], color=bar_colors)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Feature Importances by Type')
plt.gca().invert_yaxis()  # most important feature at the top

from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=shade, label=kind) for kind, shade in colors.items()]
plt.legend(handles=legend_elements, title='Feature Type')

plt.show()

In [None]:
import catboost
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Display order of the classes, used for the confusion matrix AND the report.
class_order = ['Easy', 'Medium', 'Hard']

cat_model = catboost.CatBoostClassifier(
    random_state=42,
    verbose=0,
    class_weights=[1,1,1]
)

cat_model.fit(X_train, y_class_train)

y_pred = cat_model.predict(X_test)

accuracy = accuracy_score(y_class_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

macro_f1 = f1_score(y_class_test, y_pred, average='macro')
print(f"Macro F1: {macro_f1:.4f}")

cm = confusion_matrix(y_class_test, y_pred, labels=class_order)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_order,
            yticklabels=class_order)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('CatBoost Confusion Matrix')
plt.show()

print("\nClassification Report:")
# labels= must accompany target_names: without it scikit-learn applies the names
# to the alphabetically sorted labels (Easy, Hard, Medium), which printed the
# Hard metrics under "Medium" and the Medium metrics under "Hard".
print(classification_report(y_class_test, y_pred, labels=class_order, target_names=class_order))


In [None]:
import joblib
import json
joblib.dump(lgbm_model, "classification_lgbm_model.pkl")
joblib.dump(cat_model, "classification_catboost_model.pkl")

In [None]:
### Regression Task

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Candidate regressors, all seeded with 42 where the estimator takes a seed.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'AdaBoost Regressor': AdaBoostRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM Regressor': LGBMRegressor(random_state=42),
    'CatBoost Regressor': CatBoostRegressor(random_state=42, verbose=0)
}

# Targets as column vectors, the shape the scaler works with.
y_reg_train_np = y_reg_train.to_numpy().reshape(-1, 1)
y_reg_test_np  = y_reg_test.to_numpy().reshape(-1, 1)

# Fit the [0, 1] target scaler on the training targets only.
scaler = MinMaxScaler(feature_range=(0, 1))
y_reg_train_scaled = scaler.fit_transform(y_reg_train_np)

regression_models_results = {}

for model_name, model in models.items():

    # Fit on the scaled target.
    model.fit(X_train, y_reg_train_scaled.ravel())

    # Predict in scaled space and keep predictions inside [0, 1].
    y_pred_scaled = np.clip(model.predict(X_test), 0, 1)

    # Map predictions back to the original target range.
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # Errors are measured against the unscaled test targets.
    mae = mean_absolute_error(y_reg_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
    r2 = r2_score(y_reg_test, y_pred)

    print(f"{model_name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2 Score: {r2:.4f}")

    regression_models_results[model_name] = dict(zip(
        ('MAE', 'RMSE', 'R2 Score'),
        (mae, rmse, r2),
    ))


In [None]:
reg_results_df = pd.DataFrame(regression_models_results)
reg_results_df

In [None]:
reg_results_df.to_csv("regression_models_results.csv", index=True)

In [None]:
import lightgbm as lgb
import joblib
import pandas as pd
import numpy as np

# Define the model with default settings
# Standalone LightGBM regressor (default hyper-parameters, seeded) kept as its
# own object so it can be inspected and saved later.
lgbm_regressor = lgb.LGBMRegressor(random_state=42)

# Train on the scaled target
lgbm_regressor.fit(X_train, y_reg_train_scaled.ravel())

# Predict in scaled space and clip to [0, 1]
y_pred_scaled = lgbm_regressor.predict(X_test)
y_pred_scaled = np.clip(y_pred_scaled, 0, 1)

# Inverse transform back to the original target scale
y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Final metrics, computed against the unscaled test targets
mae = mean_absolute_error(y_reg_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
r2 = r2_score(y_reg_test, y_pred)

print(f"LGBM Regressor (Default) - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2 Score: {r2:.4f}")

# Overwrite the 'LightGBM Regressor' entry of the results dictionary with these metrics
regression_models_results['LightGBM Regressor'] = {
    'MAE': mae,
    'RMSE': rmse,
    'R2 Score': r2
}

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Importance of every input column according to the LightGBM regressor.
importances = lgbm_regressor.feature_importances_
feature_names = X.columns

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

def feature_type(f):
    """Classify a column by name prefix: 'emb_' -> Embedding, 'kg_' -> KG, else Dense."""
    if f.startswith('emb_'):
        return 'Embedding'
    elif f.startswith('kg_'):
        return 'KG'
    else:
        return 'Dense'

feature_importance_df['type'] = feature_importance_df['feature'].apply(feature_type)

# Print every feature from least to most important
# (positional columns: 0 = feature, 1 = importance, 2 = type).
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=True)
for i in range(feature_importance_df.shape[0]):
    print(f"{feature_importance_df.iloc[i, 0]} ({feature_importance_df.iloc[i, 2]}): {feature_importance_df.iloc[i, 1]}")

# Plotting top 20 features according to feature_importance as seen by lgbm_regressor
top_20 = feature_importance_df.sort_values(by='importance', ascending=False).head(20)

plt.figure(figsize=(12, 8))
# One bar colour per feature family.
colors = {'Embedding': 'skyblue', 'KG': 'lightgreen', 'Dense': 'salmon'}
bar_colors = [colors[t] for t in top_20['type']]

bars = plt.barh(top_20['feature'], top_20['importance'], color=bar_colors)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Feature Importances by Type')
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()

from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=c, label=l) for l, c in colors.items()]
plt.legend(handles=legend_elements, title='Feature Type')

plt.show()

In [None]:
joblib.dump(lgbm_regressor, "regression_lgbm_model.pkl")

In [None]:
import catboost
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the CatBoost regressor with default settings
# Standalone CatBoost regressor (seeded, silent) kept as its own object so it
# can be saved later.
cat_regressor = catboost.CatBoostRegressor(random_state=42, verbose=0)

# Train on the scaled target
cat_regressor.fit(X_train, y_reg_train_scaled.ravel())

# Predict on the test set (scaled space)
y_pred_scaled = cat_regressor.predict(X_test)

# Clip predictions to the scaled range [0, 1]
y_pred_scaled = np.clip(y_pred_scaled, 0, 1)

# Inverse transform back to the original target scale
y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Compute final metrics against the unscaled test targets
mae = mean_absolute_error(y_reg_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
r2 = r2_score(y_reg_test, y_pred)

print(f"CatBoost Regressor (Default) - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2 Score: {r2:.4f}")



In [None]:
joblib.dump(cat_regressor, "regression_catboost_model.pkl")