In [3]:
#Libraries for Numeric analysis
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts

#Libraries for Visual representation of data
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Library to suppress unnecessary warning messages.
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
# Load the preprocessed dataset and separate predictors from the target.
data = pd.read_csv('processed.csv')
# "C40" is dropped up front (noted by the original author as the null column).
data_cleaned = data.drop(columns="C40")
target_column = "PPPF"
X = data_cleaned.drop(columns=target_column)
y = data_cleaned[target_column]

# Modelling pipeline, applied in this order on every fit:
#   1. keep the k best features ranked by the univariate f_regression score,
#   2. expand the kept features into polynomial terms (no bias column),
#   3. standardize the expanded features,
#   4. fit an ordinary least-squares regression.
pipeline_steps = [
    ('feature_selection', SelectKBest(score_func=f_regression)),
    ('poly_features', PolynomialFeatures(include_bias=False)),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Hyper-parameters explored by the grid search; the keyword names follow the
# "<step name>__<parameter>" convention used to address pipeline steps.
param_grid = dict(
    feature_selection__k=[25, 30, 35],   # number of top features to keep
    poly_features__degree=[2, 3],        # polynomial degrees to try
)

# Mean squared error as the selection metric. greater_is_better=False makes
# the scorer return the negated MSE, so a higher score means a lower error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Exhaustive search over param_grid using 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    verbose=1,
)
grid_search.fit(X, y)

# Collect the winning configuration, the refitted pipeline and its score.
best_params, best_model, best_score = (
    grid_search.best_params_,
    grid_search.best_estimator_,
    grid_search.best_score_,
)

print("Best Parameters:", best_params)
# best_score is the negated MSE, so flip the sign back before reporting it.
print("Best Cross-Validated MSE:", -best_score)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Parameters: {'feature_selection__k': 30, 'poly_features__degree': 3}
Best Cross-Validated MSE: 0.004113415557785522
