In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define categorical and numerical columns
categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
numerical_cols = ['model_year', 'milage']

# Handle missing values by imputing with most frequent for categorical and median for numerical
imputer_categorical = SimpleImputer(strategy='most_frequent')
imputer_numerical = SimpleImputer(strategy='median')

# Impute the categorical columns
train_data[categorical_cols] = imputer_categorical.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer_categorical.transform(test_data[categorical_cols])

# Impute the numerical columns
train_data[numerical_cols] = imputer_numerical.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer_numerical.transform(test_data[numerical_cols])

# Custom Label Encoder to handle unseen labels
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}
        self.classes_ = {}

    def fit(self, X):
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
            self.classes_[col] = set(le.classes_)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            X_transformed[col] = X[col].apply(lambda x: self.label_encoders[col].transform([x])[0]
                                              if x in self.classes_[col]
                                              else -1)  # Assign -1 for unseen labels
        return X_transformed

# Apply CustomLabelEncoder to categorical columns
encoder = CustomLabelEncoder()
encoder.fit(train_data[categorical_cols])

train_encoded = encoder.transform(train_data[categorical_cols])
test_encoded = encoder.transform(test_data[categorical_cols])

# Combine encoded categorical data with numerical data
X_train_final = pd.concat([train_encoded, train_data[numerical_cols]], axis=1)
X_test_final = pd.concat([test_encoded, test_data[numerical_cols]], axis=1)

# Split training data into train and validation sets
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_final, train_data['price'], train_size=0.8, random_state=0)

# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=0),
    # "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=0)
}

# Train and validate each model
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    preds_valid = model.predict(X_valid_split)
    mae = mean_absolute_error(y_valid_split, preds_valid)
    print(f"{name} MAE: {mae}")

# Choose the best model (let's assume Random Forest performed the best)
best_model = models["Random Forest"]

# Predict on the test data with the best model
predictions = best_model.predict(X_test_final)

# Prepare the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': predictions
})

# Save to a CSV file
submission.to_csv('submission.csv', index=False)


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-7e890db4f54a>", line 57, in <cell line: 57>
    train_encoded = encoder.transform(train_data[categorical_cols])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "<ipython-input-16-7e890db4f54a>", line 48, in transform
    X_transformed[col] = X[col].apply(lambda x: self.label_encoders[col].transform([x])[0]
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/series.py", line 4764, in apply
    ).apply()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1209, in apply
    return self.apply_standard()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1289, in apply_standard
    mapped = obj._map_values(
  Fil

TypeError: object of type 'NoneType' has no len()

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor

In [2]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Initialize lists for categorical and numerical columns
categorical_cols = []
numerical_cols = []

# Loop through the columns and classify them
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        categorical_cols.append(col)
    elif train_data[col].dtype in ['int64', 'float64']:
        numerical_cols.append(col)

# Remove the target variable 'price' from the numerical_cols (only for train data)
if 'price' in numerical_cols:
    numerical_cols.remove('price')

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

# Define categorical and numerical columns
# categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
# numerical_cols = ['model_year', 'milage']

Categorical Columns: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Numerical Columns: ['id', 'model_year', 'milage']


In [4]:
# Handle missing values by imputing with most frequent for categorical and median for numerical
imputer_categorical = SimpleImputer(strategy='most_frequent')
imputer_numerical = SimpleImputer(strategy='median')

In [5]:
# Impute the categorical columns
train_data[categorical_cols] = imputer_categorical.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer_categorical.transform(test_data[categorical_cols])

In [6]:
# Impute the numerical columns
train_data[numerical_cols] = imputer_numerical.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = imputer_numerical.transform(test_data[numerical_cols])

In [7]:
# Custom Label Encoder to handle unseen labels
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoders = {}
        self.classes_ = {}

    def fit(self, X):
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
            self.classes_[col] = set(le.classes_)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            X_transformed[col] = X[col].apply(lambda x: self.label_encoders[col].transform([x])[0]
                                              if x in self.classes_[col]
                                              else -1)  # Assign -1 for unseen labels
        return X_transformed

In [8]:
# Apply CustomLabelEncoder to categorical columns
encoder = CustomLabelEncoder()
encoder.fit(train_data[categorical_cols])

In [9]:
train_encoded = encoder.transform(train_data[categorical_cols])
test_encoded = encoder.transform(test_data[categorical_cols])

In [10]:
# Combine encoded categorical data with numerical data
X_train_final = pd.concat([train_encoded, train_data[numerical_cols]], axis=1)
X_test_final = pd.concat([test_encoded, test_data[numerical_cols]], axis=1)

In [11]:
# Split training data into train and validation sets
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_final, train_data['price'], train_size=0.8, random_state=0)


In [12]:
# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=0),
    # "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=0)
}

In [13]:
# Train and validate each model
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    preds_valid = model.predict(X_valid_split)
    mae = mean_absolute_error(y_valid_split, preds_valid)
    print(f"{name} MAE: {mae}")


Linear Regression MAE: 23354.277775810704
Decision Tree MAE: 29635.476330654787
Random Forest MAE: 22115.079339645163


In [18]:
model_1=GradientBoostingRegressor(n_estimators=100,random_state=0)

In [20]:
model_1.fit(X_train_split,y_train_split)
predict_valid=model_1.predict(X_valid_split)
mae_1=mean_absolute_error(y_valid_split,predict_valid)
print(mae_1)

19819.2263993601


In [24]:
best_model=model_1

In [25]:

# Prepare the submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'price': predictions
})

# Save to a CSV file
submission.to_csv('submission.csv', index=False)