In [168]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [169]:
# Load the full dataset; the path is relative to the notebook's working directory.
DATA_PATH = "TASK-ML-INTERN.csv"
df = pd.read_csv(DATA_PATH)

In [188]:
# Preview the rows whose target lies inside [-2206.25, 4043.75] (both ends inclusive).
# NOTE(review): the filtered frame is only displayed, never assigned, so every
# later cell still works on all rows of `df` — confirm whether the out-of-range
# rows were meant to be removed.
# NOTE(review): the bounds look like 1.5*IQR fences on the target — confirm.
df[df['vomitoxin_ppb'].between(-2206.25, 4043.75)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,439,440,441,442,443,444,445,446,447,vomitoxin_ppb
0,0.416181,0.396844,0.408985,0.372865,0.385293,0.365390,0.355226,0.343350,0.344837,0.361567,...,0.710280,0.717482,0.715078,0.705379,0.696691,0.692793,0.711369,0.697679,0.704520,1100.0
1,0.415797,0.402956,0.402564,0.396014,0.397192,0.389634,0.375671,0.363689,0.373883,0.359674,...,0.684011,0.697271,0.701995,0.696077,0.701012,0.677418,0.696921,0.696544,0.689054,1000.0
2,0.389023,0.371206,0.373098,0.373872,0.361056,0.349709,0.333882,0.330841,0.328925,0.323854,...,0.683054,0.669286,0.663179,0.676165,0.676591,0.655951,0.658945,0.670989,0.665176,1300.0
3,0.468837,0.473255,0.462949,0.459335,0.461672,0.459824,0.458194,0.427737,0.415360,0.413106,...,0.742782,0.730801,0.736787,0.730044,0.751437,0.738497,0.742446,0.754657,0.733474,1300.0
4,0.483352,0.487274,0.469153,0.487648,0.464026,0.451152,0.458229,0.440782,0.426193,0.430482,...,0.770227,0.773013,0.761431,0.763488,0.762473,0.744012,0.775486,0.760431,0.751988,220.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.478140,0.444033,0.442120,0.437473,0.428672,0.413238,0.417758,0.420388,0.413290,0.402912,...,0.747858,0.730535,0.716969,0.739297,0.724827,0.720484,0.740626,0.740116,0.721839,1200.0
496,0.409367,0.394941,0.380236,0.375340,0.346122,0.354650,0.361170,0.342974,0.352137,0.327545,...,0.670232,0.659045,0.661587,0.658422,0.644254,0.646479,0.656779,0.646700,0.646733,0.0
497,0.486526,0.501372,0.500175,0.508139,0.489411,0.457311,0.462321,0.462927,0.442647,0.437731,...,0.787532,0.780347,0.768362,0.771411,0.770919,0.761464,0.770314,0.763324,0.797187,0.0
498,0.464595,0.498822,0.489077,0.453381,0.487636,0.461950,0.461671,0.447362,0.451952,0.427200,...,0.739432,0.759722,0.752118,0.761910,0.761111,0.730431,0.753545,0.749619,0.756383,0.0


In [189]:
# Remove the 'hsi_id' column when it exists; errors='ignore' turns the drop into
# a no-op when the column is already gone, so the cell can be re-run safely.
df = df.drop(columns=['hsi_id'], errors='ignore')


In [206]:
# Replace the target with log1p(target) = log(1 + target); a value of 0 stays 0.
# NOTE(review): the column is overwritten, so running this cell a second time
# without reloading `df` applies the transform twice.
df = df.assign(vomitoxin_ppb=np.log1p(df['vomitoxin_ppb']))

In [207]:
# Display the (now log-transformed) target column.
df.loc[:, 'vomitoxin_ppb']

0      7.003974
1      6.908755
2      7.170888
3      7.170888
4      5.398163
         ...   
495    7.090910
496    0.000000
497    0.000000
498    0.000000
499    7.244942
Name: vomitoxin_ppb, Length: 500, dtype: float64

In [208]:
# Every column except the regression target is treated as a model input.
is_feature = df.columns != 'vomitoxin_ppb'
feature_columns = df.columns[is_feature].tolist()

In [209]:
# Target vector (already log1p-transformed in `df`).
y = df['vomitoxin_ppb'].to_numpy()

# Standardise each feature column to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so held-out rows influence anything derived from X_scaled —
# confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(df[feature_columns])
X_scaled = scaler.transform(df[feature_columns])

In [210]:
# Project the standardised features onto the leading principal components.
N_COMPONENTS = 4
pca = PCA(n_components=N_COMPONENTS)
X_pca = pca.fit_transform(X_scaled)

In [211]:
# Magnitude of each feature's weight in each component; the sign is discarded
# because only the size of the contribution is used for ranking.
# Per scikit-learn's docs, components_ has one row per component and one column
# per input feature.
loading_matrix = np.absolute(pca.components_)

In [212]:
# For each of the four components keep the indices of the TOP_K features with
# the largest absolute weight. np.argsort sorts ascending, so the last TOP_K
# positions hold the largest values.
TOP_K = 150
top_features_pc1, top_features_pc2, top_features_pc3, top_features_pc4 = (
    np.argsort(component_weights)[-TOP_K:] for component_weights in loading_matrix
)


In [213]:
# Pool the per-component picks; np.unique removes duplicates and returns the
# indices in ascending order, so the names below follow the original column order.
pooled_indices = np.hstack(
    (top_features_pc1, top_features_pc2, top_features_pc3, top_features_pc4)
)
selected_feature_indices = np.unique(pooled_indices)
selected_features = [feature_columns[idx] for idx in selected_feature_indices]

In [214]:
# Display the names of the columns that survived the PCA-based selection.
selected_features

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [215]:
# Number of columns kept by the selection.
# NOTE(review): the recorded result is 423 while the feature labels run from
# '0' to '447', so the union of the per-component picks removes few columns —
# confirm whether a tighter cut was intended.
len(selected_features)

423

In [216]:
# Build the model input from the selected columns of `df`. These are the
# original, unscaled values: the standardised copy lives only in X_scaled.
X_selected = df[selected_features].to_numpy()

In [217]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

In [218]:
# Display the selected-feature matrix (numpy abbreviates large arrays).
X_selected

array([[0.41618118, 0.39684365, 0.40898487, ..., 0.71136944, 0.69767889,
        0.7045196 ],
       [0.41579684, 0.40295639, 0.40256357, ..., 0.69692056, 0.69654401,
        0.68905392],
       [0.38902334, 0.37120555, 0.37309768, ..., 0.65894515, 0.6709893 ,
        0.66517639],
       ...,
       [0.48652606, 0.50137243, 0.50017548, ..., 0.77031393, 0.76332389,
        0.79718723],
       [0.46459495, 0.49882201, 0.4890769 , ..., 0.75354539, 0.7496192 ,
        0.75638268],
       [0.46084036, 0.45765625, 0.43463187, ..., 0.70761068, 0.72948366,
        0.71870559]])

In [224]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [239]:
# Random forest hyperparameters: 900 trees, each limited to depth 6, with a
# fixed seed so repeated fits give the same model.
rf_params = dict(n_estimators=900, max_depth=6, random_state=42)
regressor = RandomForestRegressor(**rf_params)
regressor.fit(X_train, y_train)

In [240]:
# Predict the held-out rows. The model was trained on the log1p-transformed
# target, so these predictions are on that log scale as well.
y_pred = regressor.predict(X_test)

In [241]:
# Score the predictions against the held-out target. Both arrays are on the
# log1p scale, so the RMSE is in log units rather than in the target's raw units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}", f"R² Score: {r2}", sep="\n")

RMSE: 2.665965428341797
R² Score: 0.12453103373042618
