In [2]:
!pip install catboost xgboost scikit-learn
!pip install openpyxl

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.0 MB/

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# --- Data loading ---
# The workbook path lives in its own variable so it can be swapped without
# touching the read call.
file_path = '/content/chunked_code_embeddings_with_marks.xlsx'
df = pd.read_excel(file_path)

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
print(f"Loaded dataset → X shape: {X.shape}, y shape: {y.shape}")

# --- Hold-out split: 20% of the rows are kept for testing, fixed seed ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- CatBoost baseline (silent training, seeded) ---
cat_model = CatBoostRegressor(verbose=0, random_seed=42)
y_pred_cat = cat_model.fit(X_train, y_train).predict(X_test)
mse_cat, r2_cat = mean_squared_error(y_test, y_pred_cat), r2_score(y_test, y_pred_cat)
print(f"CatBoost → MSE: {mse_cat:.4f}, R2: {r2_cat:.4f}")

# --- XGBoost baseline (100 trees, learning rate 0.1, seeded) ---
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)
print(f"XGBoost → MSE: {mse_xgb:.4f}, R2: {r2_xgb:.4f}")

# --- Stacked ensemble ---
# Level-0 learners as (name, estimator) pairs. They are written as a mapping
# for readability and then flattened into the list form StackingRegressor takes.
base_models = list(
    {
        'ridge': Ridge(alpha=1.0),
        'svr': SVR(kernel='rbf', C=1.0, epsilon=0.1),
        'mlp': MLPRegressor(
            hidden_layer_sizes=(128, 64),
            activation='relu',
            max_iter=500,
            random_state=42,
        ),
        'xgb': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        'cat': CatBoostRegressor(verbose=0, random_seed=42),
    }.items()
)

# The meta-learner is a Ridge model (LinearRegression or another regressor
# could be swapped in). passthrough=True feeds it the original features
# alongside the base-model predictions; n_jobs=-1 fits base models in parallel.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(),
    passthrough=True,
    n_jobs=-1,
)

y_pred_stack = stacking_model.fit(X_train, y_train).predict(X_test)
mse_stack, r2_stack = mean_squared_error(y_test, y_pred_stack), r2_score(y_test, y_pred_stack)
print(f"Stacking Regressor → MSE: {mse_stack:.4f}, R2: {r2_stack:.4f}")


Loaded dataset → X shape: (1200, 3072), y shape: (1200,)
CatBoost → MSE: 1.7222, R2: 0.5565
XGBoost → MSE: 1.7618, R2: 0.5463
Stacking Regressor → MSE: 1.8017, R2: 0.5360


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# --- Data loading ---
file_path = '/content/code_with_error_embeddings_with_marks.xlsx'
df = pd.read_excel(file_path)

# Non-numeric columns are dropped so only numeric columns reach the models.
df_numeric = df.select_dtypes(include=[np.number])

# Last numeric column is the target; all preceding numeric columns are features.
X, y = df_numeric.iloc[:, :-1].values, df_numeric.iloc[:, -1].values
print(f"Loaded dataset → X shape: {X.shape}, y shape: {y.shape}")

# --- Hold-out split: 20% of the rows are kept for testing, fixed seed ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- CatBoost baseline (silent training, seeded) ---
cat_model = CatBoostRegressor(verbose=0, random_seed=42)
y_pred_cat = cat_model.fit(X_train, y_train).predict(X_test)
print(f"CatBoost → MSE: {mean_squared_error(y_test, y_pred_cat):.4f}, R2: {r2_score(y_test, y_pred_cat):.4f}")

# --- XGBoost baseline (100 trees, learning rate 0.1, seeded) ---
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)
print(f"XGBoost → MSE: {mean_squared_error(y_test, y_pred_xgb):.4f}, R2: {r2_score(y_test, y_pred_xgb):.4f}")

# --- Stacked ensemble ---
# Level-0 learners as (name, estimator) pairs, flattened from a mapping into
# the list form StackingRegressor takes.
base_models = list(
    {
        'ridge': Ridge(alpha=1.0),
        'svr': SVR(kernel='rbf', C=1.0, epsilon=0.1),
        'mlp': MLPRegressor(
            hidden_layer_sizes=(128, 64),
            activation='relu',
            max_iter=500,
            random_state=42,
        ),
        'xgb': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
        'cat': CatBoostRegressor(verbose=0, random_seed=42),
    }.items()
)

# Ridge meta-learner; passthrough=True also hands it the original features,
# and n_jobs=-1 fits the base models in parallel.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(),
    passthrough=True,
    n_jobs=-1,
)

y_pred_stack = stacking_model.fit(X_train, y_train).predict(X_test)
print(f"Stacking Regressor → MSE: {mean_squared_error(y_test, y_pred_stack):.4f}, R2: {r2_score(y_test, y_pred_stack):.4f}")


Loaded dataset → X shape: (821, 768), y shape: (821,)
CatBoost → MSE: 4.8054, R2: -0.2263
XGBoost → MSE: 5.0890, R2: -0.2987
Stacking Regressor → MSE: 4.2086, R2: -0.0740


In [6]:
from zipfile import BadZipFile

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# === Load Excel file ===
file_path = '/content/Chunked_Embedding_Python_code(logic).xlsx'

# Try the workbook reader first; if that fails, fall back to parsing the same
# path as CSV text. BadZipFile is zipfile.BadZipFile and must be imported:
# an undefined name in the except tuple turns *any* read_excel failure into a
# NameError and the fallback below never runs.
try:
    df = pd.read_excel(file_path, engine='openpyxl')
    print("Excel file loaded successfully.")
except (ValueError, FileNotFoundError, BadZipFile) as excel_err:
    print(f"Error loading file: {excel_err}")
    print("Attempting to load as CSV...")
    try:
        # ISO-8859-1 is tried as an alternative encoding for the CSV attempt.
        df = pd.read_csv(file_path, encoding='ISO-8859-1')
        print("CSV file loaded successfully.")
    except Exception as csv_err:
        # Report the failure, then re-raise so the cell stops instead of
        # continuing without a DataFrame.
        print(f"Error loading CSV file: {csv_err}")
        raise

# === Keep only numeric columns ===
df_numeric = df.select_dtypes(include=[np.number])

# At least one feature column plus one target column is required.
if df_numeric.shape[1] < 2:
    raise ValueError("Not enough numeric columns for model training.")

# === Feature and target split ===
# The last numeric column is used as the target, everything before it as features.
# NOTE(review): the recorded run of this cell shows 3071 features and
# R2 ≈ 0.999 with MSE 0.0000 for every model. That is the pattern you would see
# if the last numeric column were another embedding dimension rather than the
# marks — confirm what df_numeric.columns[-1] is before trusting these scores.
X = df_numeric.iloc[:, :-1].values
y = df_numeric.iloc[:, -1].values
print(f"Loaded dataset → X shape: {X.shape}, y shape: {y.shape}")

# === Train-test split ===
# 20% of the rows are held out; the seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === CatBoost Regressor ===
cat_model = CatBoostRegressor(verbose=0, random_seed=42)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
cat_mse = mean_squared_error(y_test, y_pred_cat)
cat_r2 = r2_score(y_test, y_pred_cat)
print(f"CatBoost → MSE: {cat_mse:.4f}, R2: {cat_r2:.4f}")

# === XGBoost Regressor ===
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)
print(f"XGBoost → MSE: {xgb_mse:.4f}, R2: {xgb_r2:.4f}")

# === Stacking Regressor ===
# Level-0 learners as (name, estimator) pairs.
base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('svr', SVR(kernel='rbf', C=1.0, epsilon=0.1)),
    ('mlp', MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', max_iter=500, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_seed=42))
]

# Ridge meta-learner; passthrough=True also hands it the original features,
# and n_jobs=-1 fits the base models in parallel.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(),
    passthrough=True,
    n_jobs=-1
)

stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
stack_mse = mean_squared_error(y_test, y_pred_stack)
stack_r2 = r2_score(y_test, y_pred_stack)
print(f"Stacking Regressor → MSE: {stack_mse:.4f}, R2: {stack_r2:.4f}")


Excel file loaded successfully.
Loaded dataset → X shape: (1200, 3071), y shape: (1200,)
CatBoost → MSE: 0.0000, R2: 0.9992
XGBoost → MSE: 0.0000, R2: 0.9973
Stacking Regressor → MSE: 0.0000, R2: 0.9997


In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

# Step 1: Load your data from Excel
df = pd.read_excel('/content/chunked_code_embeddings_with_marks.xlsx')  # Replace with your actual Excel file path

# Show the first few rows to inspect the data
print(df.head())

# The last column is treated as the target and every other column as a feature.
X = df.iloc[:, :-1].values  # Features (all rows, all columns except last)
y = df.iloc[:, -1].values   # Target (last column)

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Step 2: Pick the train/test rows BEFORE fitting any preprocessing.
# Fitting the scaler and PCA on all rows lets held-out rows shape the features
# the model is trained on, which inflates the test score. Row indices are split
# with the same test_size and random_state as the original feature split.
row_ids = np.arange(X.shape[0])
train_idx, test_idx, y_train, y_test = train_test_split(
    row_ids, y, test_size=0.2, random_state=42
)

# Step 3: Normalize features using statistics from the training rows only
scaler = StandardScaler()
scaler.fit(X[train_idx])
X_scaled = scaler.transform(X)

# Step 4: Apply PCA, fitted on the training rows only
# PCA cannot keep more components than min(n_training_rows, n_features), so the
# requested 300 is clamped to what the training split supports.
n_components = min(300, len(train_idx), X.shape[1])  # Adjust as needed
pca = PCA(n_components=n_components, random_state=42)
pca.fit(X_scaled[train_idx])
X_pca = pca.transform(X_scaled)  # all rows projected with the train-fitted PCA

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_pca.shape}")

# Step 5: Save PCA-reduced features and target

# Save as .npy files
np.save("chunked_code_embeddings_with_marks_X_pca.npy", X_pca)
np.save("chunked_code_embeddings_with_marks_y.npy", y)

# Optionally, save as CSV too
# NOTE(review): the code_with_error PCA cell in this notebook writes the same
# "X_pca.csv" / "y.csv" names, so these two files are overwritten when it runs.
pd.DataFrame(X_pca).to_csv("X_pca.csv", index=False)
pd.DataFrame(y, columns=["target"]).to_csv("y.csv", index=False)

print("✅ Saved PCA-reduced data to disk.")

# Step 6: Build the train/test matrices from the rows chosen in Step 2
X_train, X_test = X_pca[train_idx], X_pca[test_idx]

# Step 7: Train CatBoost
model = CatBoostRegressor(verbose=0, random_state=42)
model.fit(X_train, y_train)

# Step 8: Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n📊 Evaluation after PCA:")
print(f"  ➤ MSE: {mse:.4f}")
print(f"  ➤ R²: {r2:.4f}")


      Dim_1     Dim_2     Dim_3     Dim_4     Dim_5     Dim_6     Dim_7  \
0 -0.324344  0.153111  0.255822 -0.275432  0.320159 -0.646471  0.181652   
1 -0.324344  0.153111  0.255822 -0.275432  0.320159 -0.646471  0.181652   
2 -0.324344  0.153111  0.255822 -0.275432  0.320159 -0.646471  0.181652   
3 -0.324344  0.153111  0.255822 -0.275432  0.320159 -0.646471  0.181652   
4 -0.264481  0.141414  0.268043 -0.253449  0.377265 -0.636789  0.225974   

      Dim_8     Dim_9    Dim_10  ...  Dim_3064  Dim_3065  Dim_3066  Dim_3067  \
0  0.240211  0.184584  0.310478  ... -0.013935 -0.324156  0.747561 -0.074969   
1  0.240211  0.184584  0.310478  ... -0.013935 -0.324156  0.747561 -0.074969   
2  0.240211  0.184584  0.310478  ... -0.013935 -0.324156  0.747561 -0.074969   
3  0.240211  0.184584  0.310478  ... -0.015530  0.114970  0.505327 -0.071974   
4  0.175914  0.135474  0.343280  ... -0.013935 -0.324156  0.747561 -0.074969   

   Dim_3068  Dim_3069  Dim_3070  Dim_3071  Dim_3072  Total Marks.1  

In [7]:
# 📦 Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

# Step 1: Load your data from Excel
df = pd.read_excel('/content/code_with_error_embeddings_with_marks.xlsx')  # Use your correct path

# Show the first few rows to inspect the data
print(df.head())

# Step 1.5: Drop non-numeric columns (e.g., code)
# You can specify the exact column name (e.g., 'code') or use df.select_dtypes
df = df.select_dtypes(include=[np.number])  # keep only numeric columns

# Step 2: Extract features and target
X = df.iloc[:, :-1].values  # all numeric columns except last as features
y = df.iloc[:, -1].values   # last numeric column as target

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Step 3: Pick the train/test rows BEFORE fitting any preprocessing.
# Fitting the scaler and PCA on all rows lets held-out rows shape the features
# the model is trained on, which inflates the test score. Row indices are split
# with the same test_size and random_state as the original feature split.
row_ids = np.arange(X.shape[0])
train_idx, test_idx, y_train, y_test = train_test_split(
    row_ids, y, test_size=0.2, random_state=42
)

# Step 4: Normalize features using statistics from the training rows only
scaler = StandardScaler()
scaler.fit(X[train_idx])
X_scaled = scaler.transform(X)

# Step 5: Apply PCA, fitted on the training rows only
# PCA cannot keep more components than min(n_training_rows, n_features), so the
# requested 300 is clamped to what the training split supports.
n_components = min(300, len(train_idx), X.shape[1])  # Adjust depending on input dimension
pca = PCA(n_components=n_components, random_state=42)
pca.fit(X_scaled[train_idx])
X_pca = pca.transform(X_scaled)  # all rows projected with the train-fitted PCA

print(f"Original shape: {X.shape}")
print(f"Reduced shape: {X_pca.shape}")

# Step 6: Save PCA-reduced features and target
np.save("code_with_error_embeddings_with_marks_X_pca.npy", X_pca)
np.save("code_with_error_embeddings_with_marks_y.npy", y)

# NOTE(review): the chunked_code_embeddings PCA cell in this notebook writes the
# same "X_pca.csv" / "y.csv" names, so running this cell overwrites its CSV output.
pd.DataFrame(X_pca).to_csv("X_pca.csv", index=False)
pd.DataFrame(y, columns=["target"]).to_csv("y.csv", index=False)

print("✅ Saved PCA-reduced data to disk.")

# Step 7: Build the train/test matrices from the rows chosen in Step 3
X_train, X_test = X_pca[train_idx], X_pca[test_idx]

# Step 8: Train CatBoost
model = CatBoostRegressor(verbose=0, random_state=42)
model.fit(X_train, y_train)

# Step 9: Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n📊 Evaluation after PCA:")
print(f"  ➤ MSE: {mse:.4f}")
print(f"  ➤ R²: {r2:.4f}")


                                       Original_Code         0         1  \
0  \ndef prime_factors(n):\n    factors = []\n   ... -0.121157  0.031700   
1  \ndef prime_factors(n):\n    factors = []\n   ... -0.115394  0.033662   
2  \ndef prime_factors(n):\n    factors = []\n   ... -0.112037  0.028163   
3  \ndef prime_factors(n):\n    factors = []\n   ... -0.146683 -0.003961   
4  \ndef primefactor(n):\n    factors = []\n    d... -0.125924  0.000076   

          2         3         4         5         6         7         8  ...  \
0 -0.080030  0.225850  0.295968 -0.366642  0.319231 -0.068398  0.395103  ...   
1 -0.078795  0.224390  0.293685 -0.363658  0.315770 -0.069983  0.392129  ...   
2 -0.077336  0.221762  0.303193 -0.370837  0.322908 -0.070988  0.399568  ...   
3 -0.084269  0.264436  0.295507 -0.395898  0.367624 -0.095021  0.386096  ...   
4 -0.099540  0.265308  0.308079 -0.372127  0.348484 -0.100154  0.334417  ...   

        759       760       761       762       763       764 