In [2]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
warnings.filterwarnings('ignore')



In [3]:
# Directory that holds the input CSVs. Change this single constant to run the
# notebook somewhere other than the Kaggle input mount.
DATA_DIR = '/kaggle/input/fsajhd'

# Training table (includes the `price` target), test table, and the
# submission template whose `price` column is filled in later.
train = pd.read_csv(f'{DATA_DIR}/df_train_clean.csv')
test = pd.read_csv(f'{DATA_DIR}/df_test_clean.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [4]:
# Preview the first rows of the training table (head() defaults to 5 rows).
train.head()

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Displacement,Cylinder Count,model_age
0,31,213000,2,0,312,71,2,1,4200,172.0,1.6,4.0,18
1,28,143250,2,0,263,10,0,1,4999,252.0,3.9,8.0,23
2,9,136731,1,0,38,71,2,1,13900,320.0,5.3,8.0,23
3,16,19500,2,2,29,14,2,1,45000,420.0,5.0,8.0,8
4,36,7388,2,0,29,10,2,1,97500,208.0,2.0,4.0,4


In [5]:
# Print the training table's column names, non-null counts, dtypes and
# memory usage to confirm there are no missing values before modelling.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [6]:
# Show five randomly chosen test rows. The seed is fixed so the preview is
# identical on every Restart & Run All (same seed as the KFold split below).
test.sample(5, random_state=42)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Horsepower,Displacement,Cylinder Count,model_age
61984,36,43000,2,0,29,14,2,1,503.0,4.0,8.0,8
22308,9,8400,2,0,128,14,2,1,455.0,6.2,8.0,2
11599,14,38020,2,0,128,57,2,0,297.584798,3.0,6.374268,5
23780,27,7652,2,0,12,122,2,0,387.738425,4.7,8.0,4
88332,11,245000,2,0,38,71,0,1,210.0,3.7,6.0,19


In [7]:
# Print the training table's schema summary (columns, non-null counts, dtypes).
# NOTE(review): this repeats the earlier train.info() cell and directly follows
# a preview of `test`; test.info() may have been intended here -- confirm.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [9]:
import numpy as np
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer


# ---------------------------------------------------------------------------
# Ridge regression baseline
#
# 1. Outer 5-fold CV produces out-of-fold (OOF) predictions and per-fold RMSE.
# 2. Inside every outer fold, GridSearchCV (inner 5-fold CV) tunes `alpha`.
# 3. The final model is refit on the full training set with the alpha that
#    was selected most often across the outer folds, then used to predict
#    the test set and write the submission file.
# ---------------------------------------------------------------------------

# Define features and target
X_train = train.drop(columns=["price"])
y_train = train["price"]
X_test = test.copy()

# Identify numerical columns
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns

# Preprocessing pipeline: mean-impute, then standardise every numeric column
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features)
])

# Ridge hyperparameter grid
param_grid = {
    "model__alpha": [0.01, 0.1, 1, 10, 100]
}

# Initialize KFold (outer loop)
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X_train))
fold_frames = []       # one OOF DataFrame per fold, concatenated once after the loop
fold_best_alphas = []  # alpha chosen by the inner grid search in each outer fold

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    print(f"\n Training Fold {fold}/{n_folds}...")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", Ridge())
    ])

    # GridSearchCV clones `pipeline` for every fit, so sharing the
    # `preprocessor` object between folds does not leak fitted state.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_mean_squared_error")
    grid_search.fit(X_tr, y_tr)
    best_model = grid_search.best_estimator_
    fold_best_alphas.append(grid_search.best_params_["model__alpha"])

    y_val_pred = best_model.predict(X_val)
    oof_preds[val_idx] = y_val_pred

    # RMSE = sqrt(MSE). The `squared=False` shortcut was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f" Fold {fold} RMSE: {fold_rmse:.4f}")

    fold_frames.append(pd.DataFrame({
        "ID": X_train.index[val_idx],
        "Actual": y_val.values,
        "OOF_Pred_Ridge": y_val_pred,
        "Fold": fold
    }))

# Build the OOF table in one concat instead of growing it inside the loop
oof_df = pd.concat(fold_frames, ignore_index=True)

# Compute overall OOF RMSE
oof_rmse = np.sqrt(mean_squared_error(y_train, oof_preds))
print(f"\n Overall OOF RMSE: {oof_rmse:.4f}")

oof_df.to_csv("oof_predictions_ridge.csv", index=False)
print(" OOF predictions saved to 'oof_predictions_ridge.csv'.")

# Pick the final alpha from ALL outer folds rather than whatever the last
# fold's grid search happened to choose. `mode()` returns its values sorted,
# so a tie resolves to the smallest alpha.
best_alpha = float(pd.Series(fold_best_alphas).mode().iloc[0])

# Train final model on full training set
final_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge(alpha=best_alpha))
])

final_model.fit(X_train, y_train)
joblib.dump(final_model, "ridge_model.pkl")
print(" Model saved as 'ridge_model.pkl'")

# Make predictions on test set
test_predictions = final_model.predict(X_test)
sub["price"] = test_predictions
sub.to_csv("ridge_submission.csv", index=False)
print(" Submission saved as 'ridge_submission.csv'")



 Training Fold 1/5...
 Fold 1 RMSE: 70272.1555

 Training Fold 2/5...
 Fold 2 RMSE: 70578.1957

 Training Fold 3/5...
 Fold 3 RMSE: 75664.7562

 Training Fold 4/5...
 Fold 4 RMSE: 78497.1164

 Training Fold 5/5...
 Fold 5 RMSE: 78294.3942

 Overall OOF RMSE: 74748.1006
 OOF predictions saved to 'oof_predictions_ridge.csv'.
 Model saved as 'ridge_model.pkl'
 Submission saved as 'ridge_submission.csv'
