In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Separate the target from the inputs: y is the 'price' column and
# X is a copy of the training table without it.
y = train_data['price']
X = train_data.drop(columns='price')

In [4]:
# Impute missing categorical values.
# 'fuel_type' and 'accident' are filled with the most frequent value of the
# TRAINING features; the same value is reused for test_data so both frames
# are imputed consistently (the scaler and the frequency encoding further
# down are likewise fitted on the training features only).
for mode_col in ['fuel_type', 'accident']:
    train_mode = X[mode_col].mode()[0]
    X[mode_col] = X[mode_col].fillna(train_mode)
    test_data[mode_col] = test_data[mode_col].fillna(train_mode)

# A missing 'clean_title' is kept as its own explicit category.
X['clean_title'] = X['clean_title'].fillna('Unknown')
test_data['clean_title'] = test_data['clean_title'].fillna('Unknown')

In [5]:
# Keep the 'id' columns aside; they are removed from the feature frames in
# the next cell and re-attached after encoding.
train_id, test_id = X['id'], test_data['id']

In [6]:
# Remove the 'id' column from both feature frames (copies were stored in
# train_id / test_id beforehand).
X = X.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [7]:
# Frequency encoding for the brand, model, transmission and engine columns.
# Each category is replaced by the number of times it occurs in the training
# features X; the counts are computed on X only and reused for test_data.

columns_for_freq_encod = ['brand', 'model', 'transmission', 'engine']

for col in columns_for_freq_encod:
    freq_encod = X[col].value_counts()
    # fillna(0) has to be applied to the *mapped* column: values that have no
    # entry in freq_encod (e.g. categories that only occur in test_data)
    # come out of .map() as NaN and are encoded as a frequency of 0.
    X[col] = X[col].map(freq_encod).fillna(0)
    test_data[col] = test_data[col].map(freq_encod).fillna(0).astype(float)

In [8]:
# Min-max scaler shared by the training and test frames;
# (0, 1) is MinMaxScaler's default output range, spelled out here.
scaler = MinMaxScaler(feature_range=(0, 1))

In [9]:
columns_for_scaling = [
    'brand',
    'model',
    'model_year',
    'milage',
    'transmission',
    'engine',
]

# Learn the per-column min/max from the training features only, then apply
# the same transformation to both frames.
scaler.fit(X[columns_for_scaling])
X[columns_for_scaling] = scaler.transform(X[columns_for_scaling])
test_data[columns_for_scaling] = scaler.transform(test_data[columns_for_scaling])

In [10]:
# One-hot encoding for fuel_type, ext_col, int_col, accident and clean_title.
# test_data is encoded against the category levels found in the training
# features, so that drop_first=True removes the SAME reference level in both
# frames and both frames end up with identical dummy columns.

columns_for_one_hot_encoding = ['fuel_type', 'ext_col', 'int_col', 'accident', 'clean_title']

for column in columns_for_one_hot_encoding:
    # Levels exactly as get_dummies derives them for the training column.
    train_levels = pd.Categorical(X[column]).categories
    level_dtype = pd.CategoricalDtype(categories=train_levels)

    # Values that never occur in training have no dummy column in X; blank
    # them out so their rows get all-zero dummies for this column.
    seen_in_train = test_data[column].isin(train_levels)
    test_data[column] = test_data[column].where(seen_in_train).astype(level_dtype)

    X = pd.get_dummies(X, columns=[column], drop_first=True, dtype=int)
    test_data = pd.get_dummies(test_data, columns=[column], drop_first=True, dtype=int)

In [11]:
# Give test_data exactly the columns of X, in the same order: columns that
# exist only in X are added filled with 0, columns that exist only in
# test_data are discarded.
test_data = test_data.reindex(columns=X.columns, fill_value=0)


In [12]:
# Put the stored ids back as an 'id' column on both frames; the submission
# cell reads test_data['id'].
# NOTE(review): X is split and passed to the model after this point, so 'id'
# is also one of the model's input features - confirm that this is intended.
X = X.assign(id=train_id)
test_data = test_data.assign(id=test_id)

In [13]:
# Hold out 20% of the training rows; random_state pins the shuffle so the
# split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train, X_test, y_train, y_test = split_parts

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestRegressor

In [15]:
# early_stopping = EarlyStopping(monitor='val_loss', patience=10)

In [16]:
# Random forest regressor: 100 trees, fixed seed for reproducible fits.
rf_params = {'n_estimators': 100, 'random_state': 2}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

In [17]:
# model = Sequential([
#     Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.6),
#     Dense(32, activation='relu'),
#     Dropout(0.6),
#     Dense(1)
# ])

In [18]:
# model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

In [19]:
# history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1, callbacks=[early_stopping])

In [20]:
# # y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# print(rmse)

In [21]:
# y_test_pred = model.predict(test_data)

In [26]:
# Hold-out evaluation: score the forest on the rows set aside by
# train_test_split. The metric must compare y_test with predictions for
# X_test - predictions for test_data belong to a different set of rows.
y_val_pred_rf = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_val_pred_rf)
rmse = np.sqrt(mse)
print(rmse)

# Predictions for the submission rows.
y_pred_rf = rf_model.predict(test_data)

In [None]:
# import matplotlib.pyplot as plt
# 
# plt.plot(history.history['loss'], label='Train Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

In [27]:
# Build the submission table (test id + predicted price) and write it to
# disk without the index column.
submission = test_data[['id']].assign(price=y_pred_rf.flatten())

submission.to_csv('submission.csv', index=False)