In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define categorical and numerical columns
categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
numerical_cols = ['model_year', 'milage']

# Handle missing values by imputing with most frequent for categorical and median for numerical
imputer_categorical = SimpleImputer(strategy='most_frequent')
imputer_numerical = SimpleImputer(strategy='median')

# Impute the categorical columns
train_data[categorical_cols] = imputer_categorical.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer_categorical.transform(test_data[categorical_cols])

# Impute the numerical columns
train_data[numerical_cols] = imputer_numerical.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer_numerical.transform(test_data[numerical_cols])

# Custom Label Encoder to handle unseen labels
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}
        self.classes_ = {}

    def fit(self, X):
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
            self.classes_[col] = set(le.classes_)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            X_transformed[col] = X[col].apply(lambda x: self.label_encoders[col].transform([x])[0]
                                              if x in self.classes_[col]
                                              else -1)  # Assign -1 for unseen labels
        return X_transformed

# Apply CustomLabelEncoder to categorical columns
encoder = CustomLabelEncoder()
encoder.fit(train_data[categorical_cols])

train_encoded = encoder.transform(train_data[categorical_cols])
test_encoded = encoder.transform(test_data[categorical_cols])

# Combine encoded categorical data with numerical data
X_train_final = pd.concat([train_encoded, train_data[numerical_cols]], axis=1)
X_test_final = pd.concat([test_encoded, test_data[numerical_cols]], axis=1)

# Split training data into train and validation sets
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_final, train_data['price'], train_size=0.8, random_state=0)

# Train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train_split, y_train_split)

# Validate the model
preds_valid_final = model.predict(X_valid_split)
mae_final = mean_absolute_error(y_valid_split, preds_valid_final)

print("Mean Absolute Error:", mae_final)

# Predict on the test data
predictions = model.predict(X_test_final)

# Prepare the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': predictions
})

# Save to a CSV file
submission.to_csv('submission.csv', index=False)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [7]:
# Initialize lists for categorical and numerical columns
categorical_cols = []
numerical_cols = []

# Loop through the columns and classify them
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        categorical_cols.append(col)
    elif train_data[col].dtype in ['int64', 'float64']:
        numerical_cols.append(col)

# Remove the target variable 'price' from the numerical_cols (only for train data)
if 'price' in numerical_cols:
    numerical_cols.remove('price')

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

# Define categorical and numerical columns
# categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
# numerical_cols = ['model_year', 'milage']

Categorical Columns: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Numerical Columns: ['id', 'model_year', 'milage']


In [8]:
# Handle missing values by imputing with most frequent for categorical and median for numerical
imputer_categorical = SimpleImputer(strategy='most_frequent')
imputer_numerical = SimpleImputer(strategy='median')

In [9]:
# Impute the categorical columns
train_data[categorical_cols] = imputer_categorical.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer_categorical.transform(test_data[categorical_cols])

In [10]:
# Impute the numerical columns
train_data[numerical_cols] = imputer_numerical.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer_numerical.transform(test_data[numerical_cols])

In [11]:
# Custom Label Encoder to handle unseen labels
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}
        self.classes_ = {}

    def fit(self, X):
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
            self.classes_[col] = set(le.classes_)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            X_transformed[col] = X[col].apply(lambda x: self.label_encoders[col].transform([x])[0]
                                              if x in self.classes_[col]
                                              else -1)  # Assign -1 for unseen labels
        return X_transformed

In [12]:
# Apply CustomLabelEncoder to categorical columns
encoder = CustomLabelEncoder()
encoder.fit(train_data[categorical_cols])

In [14]:
train_encoded = encoder.transform(train_data[categorical_cols])
test_encoded = encoder.transform(test_data[categorical_cols])

In [15]:
# Combine encoded categorical data with numerical data
X_train_final = pd.concat([train_encoded, train_data[numerical_cols]], axis=1)
X_test_final = pd.concat([test_encoded, test_data[numerical_cols]], axis=1)

In [16]:
# Split training data into train and validation sets
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_final, train_data['price'], train_size=0.8, random_state=0)


In [18]:
print(X_train_split.shape,X_valid_split.shape,X_train_final.shape)

(150826, 12) (37707, 12) (188533, 12)


In [17]:
# Train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train_split, y_train_split)

In [19]:
# Validate the model
preds_valid_final = model.predict(X_valid_split)
mae_final = mean_absolute_error(y_valid_split, preds_valid_final)

In [20]:
print("Mean Absolute Error:", mae_final)

Mean Absolute Error: 22115.079339645163


In [21]:
# Predict on the test data
predictions = model.predict(X_test_final)

In [22]:
# Prepare the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': predictions
})

# Save to a CSV file
submission.to_csv('submission.csv', index=False)