In [27]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [28]:
# Load datasets
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/train.csv?raw=true'
df = pd.read_csv(dataset_url)
test_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/test.csv?raw=true'
dt = pd.read_csv(test_url)
sample_submission_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/sample_submission.csv?raw=true'
sf = pd.read_csv(sample_submission_url)

In [29]:
current_year = datetime.now().year
car_brands = ['MINI', 'Lincoln', 'Chevrolet', 'Genesis', 'Mercedes-Benz', 'Audi', 'Ford', 'BMW', 'Cadillac', 'Land',
              'GMC', 'Toyota', 'Hyundai', 'Volvo', 'Volkswagen', 'Buick', 'Rivian', 'Hummer', 'Alfa', 'INFINITI',
              'Jeep', 'Porsche', 'Honda', 'Lexus', 'Dodge', 'Nissan', 'Jaguar', 'Kia', 'Mitsubishi', 'Rolls-Royce',
              'Maserati', 'Pontiac', 'Saturn', 'Bentley', 'Tesla', 'Mazda', 'Subaru', 'Ferrari', 'Aston', 'Acura',
              'Lamborghini', 'Chrysler', 'RAM', 'McLaren', 'Lucid', 'Lotus', 'Scion', 'Plymouth', 'Suzuki', 'FIAT',
              'Saab', 'Bugatti', 'Mercury', 'Karma', 'Maybach', 'Polestar', 'smart']

luxury_cars = ['Lincoln', 'Genesis', 'Mercedes-Benz', 'Audi', 'BMW', 'Cadillac', 'Volvo', 'Porsche', 'Lexus',
               'Jaguar', 'Rolls-Royce', 'Maserati', 'Bentley', 'Tesla', 'Ferrari', 'Aston', 'Acura', 'Lamborghini',
               'McLaren', 'Lucid', 'Lotus', 'Bugatti', 'Karma', 'Maybach', 'Polestar']

transmissions = ['A/T', 'Transmission w/Dual Shift Mode', '7-Speed A/T', '8-Speed A/T', '10-Speed Automatic',
                 '1-Speed A/T', '6-Speed A/T', '10-Speed A/T', '9-Speed A/T', '8-Speed Automatic', '9-Speed Automatic',
                 '5-Speed A/T', 'Automatic', '7-Speed Automatic with Auto-Shift', 'CVT Transmission', '5-Speed M/T',
                 'M/T', '6-Speed M/T', '6-Speed Automatic', '4-Speed Automatic', '7-Speed M/T', '2-Speed A/T',
                 '1-Speed Automatic', 'Automatic CVT', '4-Speed A/T', '6-Speed Manual', 'Transmission Overdrive Switch',
                 '8-Speed Automatic with Auto-Shift', '7-Speed Manual', '7-Speed Automatic', '9-Speed Automatic with Auto-Shift',
                 '6-Speed Automatic with Auto-Shift', '6-Speed Electronically Controlled Automatic with O', 'F', 'CVT-F',
                 '8-Speed Manual', 'Manual', '–', '2', '6 Speed At/Mt', '5-Speed Automatic', '2-Speed Automatic',
                 '8-SPEED A/T', '7-Speed', 'Variable', 'Single-Speed Fixed Gear', '8-SPEED AT', '10-Speed Automatic with Overdrive',
                 '7-Speed DCT Automatic', 'SCHEDULED FOR OR IN PRODUCTION', '6-Speed', '6 Speed Mt']

automatic_transmissions = [trans for trans in transmissions if 'Automatic' in trans or 'CVT' in trans]

fuel_types = ['Gasoline', 'E85 Flex Fuel', np.nan, 'Hybrid', 'Diesel', 'Plug-In Hybrid', '–', 'not supported']

fuel_dict = {
    'Gasoline': 1,
    'E85 Flex Fuel': 3,
    'Hybrid': 5,
    'Diesel': 2,
    'Plug-In Hybrid': 4,
    '–': 0,
    'not supported': 0,
    np.nan: 0
}

clean_title_dict = {
    'Yes': 1,
    np.nan: 0
}

engines = [
    '172.0 HP 1.6L 4 Cylinder Engine Gasoline Fuel',
    '252.0 HP 3.9L 8 Cylinder Engine Gasoline Fuel',
    '320.0 HP 5.3L 8 Cylinder Engine Flex Fuel Capability',
    '78.0 HP 1.2L 3 Cylinder Engine Gasoline Fuel',
    '139.0 HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas',
    '313.0 HP 2.0L 4 Cylinder Engine Plug-In Electric/Gas',
    'Electric Engine',
    'Hybrid Engine'
]

engine_dict = {
    '172.0 HP 1.6L 4 Cylinder Engine Gasoline Fuel': 2,
    '252.0 HP 3.9L 8 Cylinder Engine Gasoline Fuel': 3,
    '320.0 HP 5.3L 8 Cylinder Engine Flex Fuel Capability': 4,
    '78.0 HP 1.2L 3 Cylinder Engine Gasoline Fuel': 1,
    '139.0 HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas': 5,
    '313.0 HP 2.0L 4 Cylinder Engine Plug-In Electric/Gas': 6,
    'Electric': 7,
    'Hybrid': 8
}

# Dictionaries
transmission_dict = {trans: 1 if trans in automatic_transmissions else 0 for trans in transmissions}
car_dict = {brand: 1 if brand in luxury_cars else 0 for brand in car_brands}

# Data Preparation
df['engineRank'] = df['engine'].map(engine_dict).fillna(0).astype(int)
df['cleanTitleValue'] = df['clean_title'].map(clean_title_dict).astype(int)
df['luxury_brand'] = df['brand'].map(car_dict).astype(int)
df['fuelValue'] = df['fuel_type'].map(fuel_dict).astype(int)
df['isAutomatic'] = df['transmission'].map(transmission_dict).astype(int)
df['model_year'] = df['model_year'].astype(int)
df['car_age'] = current_year - df['model_year']

dt['engineRank'] = dt['engine'].map(engine_dict).fillna(0).astype(int)
dt['cleanTitleValue'] = dt['clean_title'].map(clean_title_dict).astype(int)
dt['luxury_brand'] = dt['brand'].map(car_dict).astype(int)
dt['fuelValue'] = dt['fuel_type'].map(fuel_dict).astype(int)
dt['isAutomatic'] = dt['transmission'].map(transmission_dict).astype(int)
dt['model_year'] = dt['model_year'].astype(int)
dt['car_age'] = current_year - dt['model_year']

In [30]:
drop_col = ['model', 'id', 'brand', 'ext_col', 'int_col', 'accident', 'engine', 'transmission', 'clean_title']
df.drop(columns=drop_col, inplace=True)
dt.drop(columns=drop_col, inplace=True)

In [31]:
y = df['price']
x = df.drop(['price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [32]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_mse = mean_squared_error(y_train, y_pred_train, squared=False)
test_mse = mean_squared_error(y_test, y_pred_test, squared=False)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R2 Score: {train_r2}")
print(f"Testing R2 Score: {test_r2}")

ValueError: could not convert string to float: 'E85 Flex Fuel'

In [None]:
id = sf.pop('id')
y_pred = model.predict(dt)

In [None]:
submission_df = pd.DataFrame({'id': id, 'class': y_pred})
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")