# Machine Learning Regression Tasks

## 1. Support Vector Regressor (SVR)

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score

# --- CONFIGURATION ---
INPUT_FILE = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\dataset_after_only_pca.xlsx"
TARGET_COL = 'domain1_score'

# --- DATA PREPARATION ---
try:
    df = pd.read_excel(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: File not found at {INPUT_FILE}")
    exit()

X = df.drop(columns=['document_number', TARGET_COL])
y = df[TARGET_COL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize Features (Crucial for SVR)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- MODEL TRAINING ---
print("Training Support Vector Regressor (SVR)...")
model = SVR(kernel='rbf', C=10.0, epsilon=0.5, gamma='scale', cache_size=1000) 
model.fit(X_train_scaled, y_train)

# --- PREDICTION & EVALUATION ---
y_pred = model.predict(X_test_scaled)

# Round predictions and clip to the valid score range [2, 12]
y_pred_rounded = np.round(y_pred).astype(int)
y_pred_clipped = np.clip(y_pred_rounded, 2, 12)

# Calculate Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
qwk = cohen_kappa_score(y_test, y_pred_clipped, weights='quadratic')

print("\n--- SVR Regression Results ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Quadratic Weighted Kappa (QWK): {qwk:.4f}")

# --- MODIFICATION: Print Test Set Predictions ---
print("\nTest Set Predictions:")
print(y_pred_clipped[:])
print(f"[Total {len(y_pred_clipped)} predictions]")

Training Support Vector Regressor (SVR)...

--- SVR Regression Results ---
Root Mean Squared Error (RMSE): 1.1008
Mean Absolute Error (MAE): 0.8890
Quadratic Weighted Kappa (QWK): 0.9327

Test Set Predictions:
[ 8  5 10  9  4  2  7  3  2 10  9 10 10  6  7  3 10  4 10  3  8  5  4  6
  9 10  9 10  5]
[Total 29 predictions]


## 2. Gradient Boosting Regressor

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
from sklearn.preprocessing import StandardScaler

# --- CONFIGURATION ---
INPUT_FILE = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\dataset_after_only_pca.xlsx"
TARGET_COL = 'domain1_score'

# --- DATA PREPARATION ---
try:
    df = pd.read_excel(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: File not found at {INPUT_FILE}")
    exit()

X = df.drop(columns=['document_number', TARGET_COL])
y = df[TARGET_COL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling is less critical but kept for consistency
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- MODEL TRAINING ---
print("Training Gradient Boosting Regressor...")
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train_scaled, y_train)

# --- PREDICTION & EVALUATION ---
y_pred = model.predict(X_test_scaled)

# Round predictions and clip to the valid score range [2, 12]
y_pred_rounded = np.round(y_pred).astype(int)
y_pred_clipped = np.clip(y_pred_rounded, 2, 12)

# Calculate Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
qwk = cohen_kappa_score(y_test, y_pred_clipped, weights='quadratic')

print("\n--- Gradient Boosting Regressor Results ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Quadratic Weighted Kappa (QWK): {qwk:.4f}")

# --- MODIFICATION: Print Test Set Predictions ---
print("\nTest Set Predictions:")
print(y_pred_clipped[:])
print(f"[Total {len(y_pred_clipped)} predictions]")

Training Gradient Boosting Regressor...

--- Gradient Boosting Regressor Results ---
Root Mean Squared Error (RMSE): 0.9264
Mean Absolute Error (MAE): 0.6134
Quadratic Weighted Kappa (QWK): 0.9595

Test Set Predictions:
[ 8  6 10 10  4  2  7  4  2 11 11 10 11  6  6  2 11  3 11  4  7  6  6  6
  8 12  8  8  6]
[Total 29 predictions]


# Machine Learning Classification Tasks

## 3. Support Vector Machine (SVM or SVC)

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

# --- CONFIGURATION ---
INPUT_FILE = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\dataset_after_only_pca.xlsx"
TARGET_COL = 'domain1_score'

# --- DATA PREPARATION ---
try:
    df = pd.read_excel(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: File not found at {INPUT_FILE}")
    exit()

X = df.drop(columns=['document_number', TARGET_COL])
y = df[TARGET_COL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 

# Standardize Features (Crucial for SVC)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- MODEL TRAINING ---
print("Training Support Vector Classifier (SVC)...")
model = SVC(kernel='rbf', C=10.0, gamma='scale', random_state=42) 
model.fit(X_train_scaled, y_train)

# --- PREDICTION & EVALUATION ---
y_pred = model.predict(X_test_scaled)

# Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted') 
qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')

print("\n--- SVC Classification Results ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print(f"Quadratic Weighted Kappa (QWK): {qwk:.4f}")

# --- MODIFICATION: Print Test Set Predictions ---
print("\nTest Set Predictions:")
print(y_pred[:])
print(f"[Total {len(y_pred)} predictions]")

Training Support Vector Classifier (SVC)...

--- SVC Classification Results ---
Accuracy: 0.5517
F1 Score (Weighted): 0.5123
Quadratic Weighted Kappa (QWK): 0.8537

Test Set Predictions:
[10  8  6  6  4  8 10  6  6  6  2 10  8  6 10  6  6  8  4  6 10  6 10 10
  6  4  6  2  6]
[Total 29 predictions]


## 4. K-Nearest Neighbors (KNN)

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

# --- CONFIGURATION ---
INPUT_FILE = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\dataset_after_only_pca.xlsx"
TARGET_COL = 'domain1_score'
K_NEIGHBORS = 5

# --- DATA PREPARATION ---
try:
    df = pd.read_excel(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: File not found at {INPUT_FILE}")
    exit()

X = df.drop(columns=['document_number', TARGET_COL])
y = df[TARGET_COL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 

# Standardize Features (Crucial for distance-based models like KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- MODEL TRAINING ---
print(f"Training K-Nearest Neighbors (KNN) Classifier with k={K_NEIGHBORS}...")
model = KNeighborsClassifier(n_neighbors=K_NEIGHBORS) 
model.fit(X_train_scaled, y_train)

# --- PREDICTION & EVALUATION ---
y_pred = model.predict(X_test_scaled)

# Calculate Metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted') 
qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')

print("\n--- KNN Classification Results ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print(f"Quadratic Weighted Kappa (QWK): {qwk:.4f}")

# --- MODIFICATION: Print Test Set Predictions ---
print("\nTest Set Predictions:")
print(y_pred[:])
print(f"[Total {len(y_pred)} predictions]")

Training K-Nearest Neighbors (KNN) Classifier with k=5...

--- KNN Classification Results ---
Accuracy: 0.2414
F1 Score (Weighted): 0.2007
Quadratic Weighted Kappa (QWK): 0.6266

Test Set Predictions:
[ 6  6  6  6  2  6 10  6  6  4  2  6  6  6 12  6  6  8  4  6  8  4  6  8
  6  4  4  2  4]
[Total 29 predictions]
