In [11]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
warnings.filterwarnings('ignore')



In [12]:
train=pd.read_csv('/kaggle/input/fsajhd/df_train_clean.csv')
test=pd.read_csv('/kaggle/input/fsajhd/df_test_clean.csv')

In [13]:
tarain=pd.read_csv('/kaggle/input/fsajhd/train.csv')

In [14]:
tarain.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [15]:
train.head(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Displacement,Cylinder Count,model_age
0,31,213000,2,0,312,71,2,1,4200,172.0,1.6,4.0,18
1,28,143250,2,0,263,10,0,1,4999,252.0,3.9,8.0,23
2,9,136731,1,0,38,71,2,1,13900,320.0,5.3,8.0,23
3,16,19500,2,2,29,14,2,1,45000,420.0,5.0,8.0,8
4,36,7388,2,0,29,10,2,1,97500,208.0,2.0,4.0,4


In [16]:
test.sample(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Horsepower,Displacement,Cylinder Count,model_age
83149,25,5858,2,0,128,14,2,1,729.0,6.5,12.0,12
43519,26,15107,2,0,252,57,2,0,403.647889,5.0,6.374268,2
110291,4,37000,2,2,234,14,2,1,600.0,4.4,8.0,7
106090,36,91900,2,0,29,14,0,1,268.0,3.5,6.0,15
57778,27,51335,2,0,38,14,0,1,386.0,4.6,8.0,6


In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [19]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load dataset
X = train.drop(columns=['price'])
y = train['price']

# Number of folds for cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
oof_df = pd.DataFrame(columns=['ID', 'Actual', 'OOF_Pred_LinearRegression', 'Fold'])

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nTraining Fold {fold}/{n_folds}...")
    
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Define preprocessing pipeline
    num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num_pipeline', num_pipeline, ['milage', 'Horsepower', 'Displacement', 'Cylinder Count', 'model_age'])
        ],
        remainder='passthrough'
    )
    
    # Create pipeline with preprocessor and Linear Regression model wrapped in TransformedTargetRegressor.
    # The model applies np.log to the target during training and np.exp during prediction.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', TransformedTargetRegressor(
            regressor=LinearRegression(),
            func=np.log,          # Transform target using log
            inverse_func=np.exp   # Inverse transformation to get back to original scale
        ))
    ])
    
    # Train model
    pipeline.fit(X_tr, y_tr)
    
    # Predict on validation set (predictions are automatically converted back to original scale)
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred
    
    # Compute fold RMSE on the original scale
    fold_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"Fold {fold} RMSE: {fold_rmse:.4f}")
    
    # Store fold results in DataFrame
    fold_df = pd.DataFrame({
        'ID': X.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_LinearRegression': y_val_pred,
        'Fold': fold
    })
    
    oof_df = pd.concat([oof_df, fold_df], ignore_index=True)

# Compute overall RMSE on the original scale
oof_rmse = mean_squared_error(y, oof_preds, squared=False)
print(f"\nOverall OOF RMSE: {oof_rmse:.4f}")

# Save OOF predictions
oof_df.to_csv('oof_predictions_linear_regression.csv', index=False)
print("OOF predictions saved to 'oof_predictions_linear_regression.csv'.")

# 🔥 Train final Linear Regression model on full dataset with target log-transform
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', TransformedTargetRegressor(
        regressor=LinearRegression(),
        func=np.log,
        inverse_func=np.exp
    ))
])

final_pipeline.fit(X, y)  # Train on full dataset
joblib.dump(final_pipeline, 'linear_regression_model.pkl')  # Save final trained model
print("Final Linear Regression model trained and saved as 'linear_regression_model.pkl'.")



Training Fold 1/5...
Fold 1 RMSE: 70501.5076

Training Fold 2/5...
Fold 2 RMSE: 70803.6267

Training Fold 3/5...
Fold 3 RMSE: 75961.1240

Training Fold 4/5...
Fold 4 RMSE: 78839.3581

Training Fold 5/5...
Fold 5 RMSE: 78695.8279

Overall OOF RMSE: 75049.7913
OOF predictions saved to 'oof_predictions_linear_regression.csv'.
Final Linear Regression model trained and saved as 'linear_regression_model.pkl'.


In [20]:
final_test_predictions = pipeline.predict(test)

# Save submission file
sub = pd.read_csv('/kaggle/input/fsajhd/sample_submission.csv')
sub['price'] = final_test_predictions
sub.to_csv('LinearRegression_submission.csv', index=False)
print("Submission file saved as 'LinearRegression.csv'.")
sub.head()


Submission file saved as 'LinearRegression.csv'.


Unnamed: 0,id,price
0,188533,18873.224154
1,188534,54959.146113
2,188535,53306.471053
3,188536,27865.128099
4,188537,27759.862214
