In [26]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import xgboost as xgb

In [None]:
# Read the dataset from a CSV file; the relative path is resolved against the
# current working directory.
csv_path = "TASK-ML-INTERN.csv"
df = pd.read_csv(csv_path)

In [None]:
# Remove the 'hsi_id' column when it exists; errors='ignore' makes this a
# no-op (instead of a KeyError) when the column is absent.
df = df.drop(columns=['hsi_id'], errors='ignore')

In [None]:
target_col = 'vomitoxin_ppb'

# Replace the target with log(1 + target).
# NOTE: the column is overwritten in place, so executing this cell a second
# time without reloading `df` applies the transform again.
df[target_col] = np.log1p(df[target_col])

# Identify spectral feature columns: every column other than the target.
feature_columns = list(df.columns.drop(target_col))

In [None]:
# Limit each feature column to its own 1st-99th percentile range: values
# outside that range are replaced by the nearest bound (the rows are kept).
for name in feature_columns:
    p01, p99 = df[name].quantile([0.01, 0.99])
    df[name] = df[name].clip(lower=p01, upper=p99)

In [None]:
# Standardise every feature column and pull the (already log1p-transformed)
# target out as a NumPy array.
# NOTE: the scaler is fitted on all rows; the train/test split only happens
# in a later cell.
features_frame = df[feature_columns]
scaler = StandardScaler()
X_scaled = scaler.fit(features_frame).transform(features_frame)
y = df['vomitoxin_ppb'].to_numpy()

In [None]:
# Fit a 4-component PCA on the standardised feature matrix and keep the
# projected data in X_pca.
# random_state is pinned because svd_solver='auto' may pick a randomized
# solver depending on the data shape; without a seed the loadings (and the
# feature selection derived from them) could differ between runs. The value
# matches the seed used for train_test_split and XGBRegressor in this
# notebook. It has no effect when a deterministic solver is chosen.
pca = PCA(n_components=4, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Absolute PCA loadings; row i holds the loadings of component i.
loading_matrix = np.abs(pca.components_)

# For each of the first four components, keep the positions of the 150
# features with the largest absolute loading. argsort is ascending, so the
# largest values sit at the tail of each sorted index array.
top_k = 150
(top_features_pc1,
 top_features_pc2,
 top_features_pc3,
 top_features_pc4) = (np.argsort(row)[-top_k:] for row in loading_matrix[:4])

In [None]:
# Merge the per-component index sets; np.unique removes duplicates and
# returns the positions in ascending order.
per_component_indices = (
    top_features_pc1,
    top_features_pc2,
    top_features_pc3,
    top_features_pc4,
)
selected_feature_indices = np.unique(np.concatenate(per_component_indices))

# Map the positions back to column names.
selected_features = [feature_columns[pos] for pos in selected_feature_indices]


In [None]:
# Extract the selected columns as a plain array and standardise them.
# NOTE: this re-fits the existing `scaler` object, so afterwards it holds
# statistics for the selected columns only, not for the full feature set.
X_selected = df[selected_features].to_numpy()
X_selected_scaled = scaler.fit(X_selected).transform(X_selected)

In [None]:
# Split first, then standardise with statistics learned from the training
# rows only, so the held-out rows do not influence the scaling applied to the
# model inputs (avoids train/test leakage through the scaler).
# test_size and random_state are unchanged, so the same rows should land in
# each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# A dedicated scaler is used so the shared `scaler` object is left untouched.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# NOTE(review): the percentile clipping and the PCA-based feature selection in
# the earlier cells are still computed on all rows. Moving the split ahead of
# those steps would be needed for a fully leakage-free evaluation.

In [None]:
# Hyperparameters for the gradient-boosted regressor, gathered in one place.
xgb_params = {
    "n_estimators": 900,
    "learning_rate": 0.01,
    "max_depth": 16,
    "random_state": 42,
}

# Build the model and fit it on the training split. The fit call is kept as
# the last expression so the notebook still displays the fitted estimator.
regressor = xgb.XGBRegressor(**xgb_params)
regressor.fit(X_train, y_train)

In [None]:
# Predict the held-out rows. The model was trained on the log1p-transformed
# target, so these predictions are on that same log1p scale.
y_pred = regressor.predict(X_test)


In [None]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Hold-out metrics. Both y_test and y_pred are on the log1p scale of the
# target, so RMSE and MAE are expressed in log1p units, not in the original
# units of the target column.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")
print(f"MAE: {mae}")

RMSE: 2.634929838060914
R² Score: 0.14479577030870883
MAE: 1.9526275463211038
