# POLYNOMIAL REGRESSION MODEL

## Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

## Import Dataset

In [2]:
# Load the training table from disk.
train_dataset = pd.read_csv('train.csv')

# The last column is the regression target; the first column is left out of
# the features (presumably a row id -- confirm against train.csv).
feature_frame = train_dataset.iloc[:, 1:-1]
target_column = train_dataset.iloc[:, -1]
X_train = feature_frame.values
y_train = target_column.values

print(X_train)
print(y_train)

[[25.          0.5         0.25       ...  0.4250109   0.41754541
  32.46088718]
 [25.          0.5         0.25       ...  0.44490828  0.42205139
  33.85831713]
 [12.5         0.25        0.25       ...  0.55292683  0.47085288
  38.34178123]
 ...
 [25.          0.5         0.25       ...  0.4225477   0.41678557
  32.29905902]
 [25.          0.5         0.25       ...  0.54217024  0.4341325
  36.6742427 ]
 [25.          0.5         0.25       ...  0.49207686  0.44657579
  35.09473308]]
[4476.81146 5548.12201 6869.7776  ... 4696.44394 6772.93347 5867.99722]


## Splitting Dataset

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation; the fixed seed keeps the
# split reproducible between runs.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=0,
)
X_train_split, X_val, y_train_split, y_val = split_parts

## Model Training

In [4]:
from sklearn.pipeline import Pipeline

# Steps are applied in order: standardize the inputs, expand them with
# PolynomialFeatures, then fit a ridge regressor on the result.
model_steps = [
    ('scaler', StandardScaler()),
    # degree=1 keeps the model linear (the simpler variant being tried here).
    ('poly_features', PolynomialFeatures(degree=1)),
    # alpha is the regularization strength and is a candidate for tuning.
    ('ridge_regressor', Ridge(alpha=1.0)),
]
pipeline = Pipeline(steps=model_steps)

pipeline.fit(X_train_split, y_train_split)

## Predict Validation Values

In [5]:
y_val_pred = pipeline.predict(X_val)  # predictions for the held-out validation split

## Evaluation

In [6]:
from sklearn.metrics import mean_squared_error, r2_score

# Compare the validation predictions with the true validation targets.
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

# One print call, one metric per line.
print(
    f"Validation Mean Squared Error: {mse}",
    f"Validation R-squared: {r2}",
    sep="\n",
)


Validation Mean Squared Error: 325773.6837568638
Validation R-squared: 0.8142680547904189


# Predict using Test Set

## Load Dataset

In [7]:
# Load the test table; it is sliced from column 1 onward with no trailing
# target column removed.
test_dataset = pd.read_csv('test.csv')
test_feature_frame = test_dataset.iloc[:, 1:]
X_test = test_feature_frame.values

## Predict Test Values

In [8]:
# Run the fitted pipeline on the test features.
y_test_pred = pipeline.predict(X_test)

# Pair the first column of the test table (used as the id) with its prediction.
test_ids = test_dataset.iloc[:, 0]
result_columns = {
    'id': test_ids,
    'PredictedYield': y_test_pred,
}
predicted_yield_with_id = pd.DataFrame(result_columns)

## Display results

In [9]:
print(predicted_yield_with_id)  # show the id / PredictedYield table

          id  PredictedYield
0      15289     4319.144901
1      15290     6087.877028
2      15291     7258.158571
3      15292     4983.521880
4      15293     3676.815141
...      ...             ...
10189  25478     5406.551016
10190  25479     5781.252989
10191  25480     6956.065629
10192  25481     4441.697472
10193  25482     7332.698211

[10194 rows x 2 columns]


## Save Results