In [1]:
import numpy as np
import pandas as pd
import csv
import re

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 1st Linear regression

In [4]:
def extract_numbers(engine_str):
    numbers = re.findall(r'\d+\.\d+|\d+', engine_str)
    return list(map(float, numbers)) if len(numbers) == 3 else [0, 0, 0]

def parse_transmission(trans_desc):
    # Regex to find the first number
    speed_match = re.search(r'\d+', trans_desc)
    speeds = int(speed_match.group(0)) if speed_match else 0

    # Check if the transmission is manual
    is_manual = 1 if 'manual' in trans_desc.lower() or 'm/t' in trans_desc.lower() or 'mt' in trans_desc.lower() else 0

    return pd.Series([speeds, is_manual], index=['num_speeds', 'is_manual'])



In [5]:
def process_data_lr(df):
    columns_to_exclude = ['brand','model', 'fuel_type', 'ext_col', 'int_col']
    df = df.drop(columns = columns_to_exclude)
    df['model_year'] = df['model_year'].map(lambda x: 2023-x)
    df['accident'] = df['accident'].map(lambda x: 0 if x=="None reported" else 1)
    df['clean_title'] = df['clean_title'].map(lambda x: 1 if x=="Yes" else 0)
    df[['horsepower', 'engine_displacement', 'cylinders']] = df['engine'].apply(lambda x: pd.Series(extract_numbers(x)))
    df = df.drop(columns = ['engine'])
    df[['num_speeds', 'is_manual']] = df['transmission'].apply(lambda x:pd.Series(parse_transmission(x)))
    df = df.drop(columns = ['transmission'])
    return df

In [6]:
ltr = train
lte = test
ltr = process_data_lr(ltr)
lte = process_data_lr(lte)

In [7]:
ltr.head()

Unnamed: 0,id,model_year,milage,accident,clean_title,price,horsepower,engine_displacement,cylinders,num_speeds,is_manual
0,0,16,213000,0,1,4200,172.0,1.6,4.0,0,0
1,1,21,143250,1,1,4999,252.0,3.9,8.0,0,0
2,2,21,136731,0,1,13900,320.0,5.3,8.0,0,0
3,3,6,19500,0,1,45000,420.0,5.0,8.0,0,0
4,4,2,7388,0,1,97500,208.0,2.0,4.0,7,0


In [8]:
ltr.isna().sum()

id                     0
model_year             0
milage                 0
accident               0
clean_title            0
price                  0
horsepower             0
engine_displacement    0
cylinders              0
num_speeds             0
is_manual              0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
X = ltr.drop(columns = ['id','price'])
y = ltr['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1453)

In [11]:
model = LinearRegression()
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Model Coefficients: [-9.64796756e+02 -3.38714313e-01 -6.08575223e+03 -7.50914134e+03
  1.83474450e+01  2.76690582e+03  4.41470430e+02 -8.04636101e+01
  1.27844724e+03]
Intercept: 62253.003393589446
Mean Squared Error: 5831433008.665136
R^2 Score: 0.09025582176074276


In [18]:
y_pred = model.predict(lte.drop(columns = "id"))

sub = pd.DataFrame()
sub=sub.assign(price=y_pred)
sub=sub.assign(id=lte['id'])
sub['price'] = sub['price'].map(lambda x: 1000 if x < 1000 else x)
sub.to_csv('submission.csv', index=False)

In [27]:
model.coef_

array([-9.64796756e+02, -3.38714313e-01, -6.08575223e+03, -7.50914134e+03,
        1.83474450e+01,  2.76690582e+03,  4.41470430e+02, -8.04636101e+01,
        1.27844724e+03])

## 2nd Decision Tree

In [13]:
from sklearn.tree import DecisionTreeRegressor

In [14]:
tree_model = DecisionTreeRegressor(random_state=42)

# Step 4: Train the Model
tree_model.fit(X_train, y_train)

# Step 5: Predict and Evaluate the Model
y_pred = tree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [15]:
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 10239968986.426182
R^2 Score: -0.597506506018187


## 3nd try, keep improving data cleaning

In [32]:
avg_engine = ltr[ltr['horsepower']>0]

In [42]:
hp = avg_engine['horsepower'].mean()
ed = avg_engine['engine_displacement'].mean()
cy = avg_engine['cylinders'].mean()

In [24]:
print(ltr['horsepower'].mode())
print(ltr['engine_displacement'].mode())
print(ltr['cylinders'].mode())

0    0.0
Name: horsepower, dtype: float64
0    3.0
Name: engine_displacement, dtype: float64
0    6.0
Name: cylinders, dtype: float64


In [34]:
ltr1 = ltr
lte1 = lte

In [41]:
 avg_engine['horsepower'].mean()

293.0146526504549

In [46]:
def replace_null(df):
    df['horsepower'] = df['horsepower'].map(lambda x:hp if x==0 else x)
    df['engine_displacement'] = df['engine_displacement'].map(lambda x: ed if x==0 else x)
    df['cylinders'] = df['cylinders'].map(lambda x: cy if x==0 else x)
    return df

In [47]:
ltr1 = replace_null(ltr1)
lte1 = replace_null(lte1)

In [48]:
X = ltr1.drop(columns = ['id','price'])
y = ltr1['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1453)
lm2 = LinearRegression()
lm2.fit(X_train, y_train)
y_pred = lm2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Model Coefficients: [-9.64796756e+02 -3.38714313e-01 -6.08575223e+03 -7.50914134e+03
  1.83474450e+01  2.76690582e+03  4.41470430e+02 -8.04636101e+01
  1.27844724e+03]
Intercept: 62253.003393589446
Mean Squared Error: 5795829642.719555
R^2 Score: 0.09581019490480291


In [49]:
lm2.fit(X, y)

y_pred = lm2.predict(lte1.drop(columns = "id"))

sub = pd.DataFrame()
sub=sub.assign(price=y_pred)
sub=sub.assign(id=lte['id'])
sub['price'] = sub['price'].map(lambda x: 1000 if x < 1000 else x)
sub.to_csv('submission.csv', index=False)

## 4th XGBoost

In [50]:
import xgboost as xgb

In [51]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=100)

In [52]:
xg_reg.fit(X_train, y_train)

# Predict and Evaluate the Model
y_pred = xg_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 5559285858.059507
R^2 Score: 0.13271267336479597


In [54]:
xg_reg.fit(X, y)

y_pred = xg_reg.predict(lte1.drop(columns = "id"))

sub = pd.DataFrame()
sub=sub.assign(price=y_pred)
sub=sub.assign(id=lte['id'])
sub['price'] = sub['price'].map(lambda x: 1000 if x < 1000 else x)
sub.to_csv('submission.csv', index=False)