In [1]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
warnings.filterwarnings('ignore')



In [2]:
# Read the pre-cleaned train / test CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/fsajhd/df_train_clean.csv')
test = pd.read_csv('/kaggle/input/fsajhd/df_test_clean.csv')

In [3]:
# Raw training CSV, loaded for side-by-side inspection with the cleaned frame.
# NOTE(review): `tarain` looks like a typo of `train`; the name is kept because
# a later cell calls `tarain.head()`.
tarain = pd.read_csv('/kaggle/input/fsajhd/train.csv')

In [4]:
# Schema check on the cleaned training frame: column names, dtypes, non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [5]:
# Preview the first rows of the raw training CSV loaded into `tarain`.
tarain.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [6]:
# Preview the first five rows of the cleaned training frame (head() defaults to n=5).
train.head()

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Displacement,Cylinder Count,model_age
0,31,213000,2,0,312,71,2,1,4200,172.0,1.6,4.0,18
1,28,143250,2,0,263,10,0,1,4999,252.0,3.9,8.0,23
2,9,136731,1,0,38,71,2,1,13900,320.0,5.3,8.0,23
3,16,19500,2,2,29,14,2,1,45000,420.0,5.0,8.0,8
4,36,7388,2,0,29,10,2,1,97500,208.0,2.0,4.0,4


In [7]:
# Spot-check five random rows of the cleaned test frame.
# A fixed random_state keeps the displayed rows identical across
# Restart & Run All, so the saved output stays reproducible.
test.sample(5, random_state=42)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Horsepower,Displacement,Cylinder Count,model_age
37234,50,12700,2,0,263,14,2,1,260.0,2.4,4.0,3
43345,9,73846,2,0,290,79,2,1,419.557352,5.3,6.374268,6
78713,28,53649,2,0,304,10,2,1,375.0,3.5,6.0,7
86683,26,72200,2,2,29,14,2,1,340.0,3.0,6.0,7
7219,34,57550,2,0,271,14,2,1,271.069025,2.5,6.374268,5


In [8]:
# Print the cleaned training frame's schema (columns, dtypes, non-null counts).
# NOTE(review): an earlier cell already runs train.info(); this repeat can likely be dropped.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [9]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Baseline: Lasso on the raw price target, alpha tuned by 5-fold grid search.
# Expects the cleaned `train` DataFrame to be in scope.
y = train['price']
X = train.drop(columns=['price'])

# Columns listed here are standardised; all remaining columns are passed
# through to the model unchanged (remainder='passthrough').
numeric_cols = ['milage', 'Horsepower', 'Displacement', 'Cylinder Count', 'model_age']
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[('num_pipeline', num_pipeline, numeric_cols)],
    remainder='passthrough',
)

# Preprocessing + estimator chained so scaling is re-fit inside every CV split.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', Lasso())])

# Candidate regularisation strengths for the 'model' step.
param_grid = {'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Scoring is negative RMSE (higher is better), so the sign is flipped when reporting.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X, y)  # fit() returns the fitted search object itself

# Report the winning alpha and its mean cross-validated RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)


Best parameters: {'model__alpha': 10}
Best RMSE: 74656.33777515459


In [10]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Lasso on a log-transformed price target:
#   1. tune alpha with 5-fold grid search,
#   2. produce out-of-fold (OOF) predictions with the tuned pipeline,
#   3. persist the OOF table and the full-data model.
# Expects the cleaned `train` DataFrame to be in scope.
X = train.drop(columns=['price'])
y = train['price']

# Columns listed here are standardised; all remaining columns are passed
# through to the model unchanged (remainder='passthrough').
numeric_cols = ['milage', 'Horsepower', 'Displacement', 'Cylinder Count', 'model_age']
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numeric_cols)
    ],
    remainder='passthrough'
)

# TransformedTargetRegressor fits Lasso on np.log(y) and applies np.exp to its
# predictions, so everything downstream (scoring, OOF, submission) is on the
# original price scale.
base_model = TransformedTargetRegressor(
    regressor=Lasso(),  # Lasso model whose alpha will be tuned
    func=np.log,
    inverse_func=np.exp
)

# Create the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', base_model)
])

# Candidate alphas for the wrapped Lasso.
# NOTE(review): the recorded run selected 0.001, the smallest value in this
# grid; consider extending the grid downwards before trusting the optimum.
param_grid = {
    'model__regressor__alpha': [0.001, 0.01, 0.1, 1, 10]
}

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # negative RMSE: higher is better
    cv=5,
    n_jobs=-1
)

# Fit GridSearchCV on the full data. With the default refit=True the best
# configuration is refitted on all of (X, y) and exposed as best_estimator_.
grid_search.fit(X, y)
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

# Tuned pipeline, already fitted on the full dataset by the grid search.
best_pipeline = grid_search.best_estimator_

# Shuffled 5-fold split used to generate out-of-fold predictions.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
fold_frames = []  # one DataFrame per fold, concatenated once after the loop

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nTraining Fold {fold}/{n_folds} with best parameters...")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit a fresh, unfitted copy carrying the tuned hyperparameters so the
    # full-data model held by grid_search.best_estimator_ is not overwritten.
    fold_model = clone(best_pipeline)
    fold_model.fit(X_tr, y_tr)

    # Predict on the validation fold (predictions are automatically converted back to original scale)
    y_val_pred = fold_model.predict(X_val)
    oof_preds[val_idx] = y_val_pred

    # Fold RMSE on the original scale. np.sqrt(MSE) is used instead of
    # mean_squared_error(..., squared=False), whose `squared` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    fold_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"Fold {fold} RMSE: {fold_rmse:.4f}")

    # 'ID' is the row label of X, not necessarily a competition id.
    fold_frames.append(pd.DataFrame({
        'ID': X.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_Lasso': y_val_pred,
        'Fold': fold
    }))

# Single concat avoids quadratic copying and the object-dtype columns that
# come from repeatedly appending to an initially empty DataFrame.
oof_df = pd.concat(fold_frames, ignore_index=True)

# Compute overall OOF RMSE on the original scale
overall_rmse = np.sqrt(mean_squared_error(y, oof_preds))
print(f"\nOverall OOF RMSE: {overall_rmse:.4f}")

# Save OOF predictions to CSV
oof_df.to_csv('oof_predictions_lasso.csv', index=False)
print("OOF predictions saved to 'oof_predictions_lasso.csv'.")

# The fold loop worked on clones, so best_pipeline is still the model that
# GridSearchCV refitted on the full dataset; no additional fit is needed.
final_pipeline = best_pipeline
joblib.dump(final_pipeline, 'lasso_model.pkl')
print("Final Lasso model (with log-transform and tuned alpha) trained and saved as 'lasso_model.pkl'.")


Best parameters: {'model__regressor__alpha': 0.001}
Best RMSE: 74972.75939573557

Training Fold 1/5 with best parameters...
Fold 1 RMSE: 70514.1814

Training Fold 2/5 with best parameters...
Fold 2 RMSE: 70817.4735

Training Fold 3/5 with best parameters...
Fold 3 RMSE: 75979.2169

Training Fold 4/5 with best parameters...
Fold 4 RMSE: 78851.0176

Training Fold 5/5 with best parameters...
Fold 5 RMSE: 78710.0635

Overall OOF RMSE: 75063.8828
OOF predictions saved to 'oof_predictions_lasso.csv'.
Final Lasso model (with log-transform and tuned alpha) trained and saved as 'lasso_model.pkl'.


In [11]:
# Score the test frame with the full-data model held in `final_pipeline`.
final_test_predictions = final_pipeline.predict(test)

# Start from the sample submission and overwrite its price column with the
# model's predictions.
sub = pd.read_csv('/kaggle/input/fsajhd/sample_submission.csv').assign(
    price=final_test_predictions
)

# Write the submission to disk, then preview the first rows.
sub.to_csv('lasso_submission.csv', index=False)
print("Submission file saved as 'lasso_submission.csv'.")
sub.head()


Submission file saved as 'lasso_submission.csv'.


Unnamed: 0,id,price
0,188533,18864.552693
1,188534,55035.234093
2,188535,52694.780475
3,188536,27492.097247
4,188537,27782.663088
