In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from math import nan
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Dataset File

In [None]:
# Training data: read straight from the pinned GitHub commit into `df`.
dataset_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/train.csv?raw=true'
df = pd.read_csv(filepath_or_buffer=dataset_url)

## Test File

In [None]:
# Test data: read from the same pinned commit into `dt`.
test_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/test.csv?raw=true'
dt = pd.read_csv(filepath_or_buffer=test_url)

## Sample Submission File

In [None]:
# Sample submission: read from the same pinned commit into `sf`.
sample_submission_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/sample_submission.csv?raw=true'
sf = pd.read_csv(filepath_or_buffer=sample_submission_url)

In [None]:
# Display ten randomly drawn rows of the training frame.
df.sample(10)

In [None]:
# Print the column summary (dtypes, non-null counts) of the training frame.
df.info()

In [None]:
# Print the column summary (dtypes, non-null counts) of the test frame.
dt.info()

In [None]:
# Treat every missing value as 0 in both frames, then turn `clean_title`
# into a numeric flag: 'Yes' -> 1 and the filled-in 0 -> 0. Any other
# value is not in the mapping and therefore becomes NaN.
title_codes = {'Yes': 1, 0: 0}
for frame in (df, dt):
    frame.fillna(0, inplace=True)
    frame['title_label'] = frame['clean_title'].map(title_codes)

df.sample(5)

In [None]:
# List the distinct brand values present in the training frame.
df.brand.unique()

## Feature Engineering: Luxury Flag, Car Age, Horsepower

In [None]:
# Brands counted as "luxury" for the boolean `is_luxury` feature.
luxury_brands = [
    'MINI', 'Lincoln', 'Genesis', 'Mercedes-Benz', 'Audi', 'BMW', 'Tesla',
    'Cadillac', 'Land Rover', 'Lexus', 'Porsche', 'McLaren', 'Rolls-Royce',
    'Maserati', 'Bentley', 'Ferrari', 'Aston Martin', 'Lamborghini', 'Lucid',
    'Lotus', 'Maybach',
]

# Flag each row whose brand appears in the list (exact match), in both frames.
for frame in (df, dt):
    frame['is_luxury'] = frame['brand'].isin(luxury_brands)

In [None]:
# Pull the number that directly precedes "HP" out of the free-text `engine`
# column. The pattern is a raw string so `\d` is not treated as an (invalid)
# string escape.
HP_PATTERN = r'(\d+\.?\d*)HP'

for frame in (df, dt):
    hp_text = frame['engine'].str.extract(HP_PATTERN, expand=False)
    # Convert the captured text to floats so the column is truly numeric;
    # rows without a match stay NaN here.
    frame['horsepower'] = pd.to_numeric(hp_text, errors='coerce')
    # Rows with no "HP" figure (and any other remaining NaN) become 0.
    frame.fillna(0, inplace=True)

df.sample(10)

In [None]:
# Transmission strings that count as "automatic" for the binary feature.
# NOTE(review): `isin` is an exact match, so a value such as '6-Speed A/T'
# would not be flagged -- confirm against the actual column values.
at_arr = ['Automatic', 'A/T', 'AT', 'Single-Speed Fixed Gear']

# 1 when the transmission is one of the strings above, otherwise 0.
bool_to_int = {True: 1, False: 0}
for frame in (df, dt):
    frame['trans_lbl'] = frame['transmission'].isin(at_arr).map(bool_to_int)

df.sample(5)

In [None]:
from datetime import datetime

# Age in years relative to the calendar year in which the notebook is run.
current_year = datetime.now().year
for frame in (df, dt):
    frame['car_age'] = current_year - frame['model_year']

df.sample(5)

In [None]:
# List the distinct accident values present in the training frame.
df.accident.unique()

In [None]:
# Integer codes for the fuel types; placeholders ('-', 'not supported' and
# the 0 written by the earlier fillna) all collapse to 0. Values that are
# not listed here become NaN.
fuel_codes = {
    'Gasoline': 1,
    'E85 Flex Fuel': 2,
    0: 0,
    'Hybrid': 3,
    'Diesel': 4,
    'Plug-In Hybrid': 5,
    '-': 0,
    'not supported': 0,
}
for frame in (df, dt):
    frame['fuel_type'] = frame['fuel_type'].map(fuel_codes)

In [None]:
# Binary accident flag: 1 when damage was reported, 0 when none was reported
# or the value was missing (missing values were filled with 0 earlier).
accident_codes = {
    'None reported': 0,
    'At least 1 accident or damage reported': 1,
    0: 0,
}
for frame in (df, dt):
    frame['accident'] = frame['accident'].map(accident_codes)

dt.sample(5)

In [None]:
# Training frame: integer-encode the categorical text columns.
label_encoder = LabelEncoder()
for source_col in ('brand', 'ext_col', 'int_col'):
    # One encoder object is refit for every column, so after the loop it
    # holds the classes of 'int_col' only.
    df[f'{source_col}_lbl'] = label_encoder.fit_transform(df[source_col])

# Remove the identifier and the raw columns; 'fuel_type' and 'accident' are
# dropped here as well.
raw_columns = ['brand', 'id', 'model', 'fuel_type', 'accident', 'engine',
               'transmission', 'clean_title', 'ext_col', 'int_col']
df = df.drop(columns=raw_columns)
df.sample(5)

In [None]:
# Test frame: same encoding steps as for the training frame.
# The encoder is refit on the test columns here, so the integer assigned to
# a given category comes from the test data's own set of values.
for source_col in ('brand', 'ext_col', 'int_col'):
    dt[f'{source_col}_lbl'] = label_encoder.fit_transform(dt[source_col])

# Remove the identifier and the raw columns, mirroring the training frame.
raw_columns = ['brand', 'id', 'model', 'fuel_type', 'accident', 'engine',
               'transmission', 'clean_title', 'ext_col', 'int_col']
dt = dt.drop(columns=raw_columns)
dt.sample(5)

In [None]:
# Annotated correlation matrix of every column left in the training frame,
# with the colour scale fixed to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(20, 20))
correlations = df.corr()
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, center=0, ax=ax)
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to [0, 1]. Despite the variable name, the list contains
# the target ('price') and the binary flags too. Only the training frame is
# transformed in this cell.
non_boolean_numerical_features = [
    "model_year", 'price', 'title_label', 'is_luxury', 'horsepower',
    'trans_lbl', 'car_age', 'brand_lbl', 'ext_col_lbl', 'int_col_lbl',
]

scaler = MinMaxScaler()
scaler.fit(df[non_boolean_numerical_features])
df[non_boolean_numerical_features] = scaler.transform(df[non_boolean_numerical_features])

In [None]:
# Features: everything except the target, the label-encoded columns and
# model_year. Target: the (already rescaled) price column.
excluded_columns = ['price', 'ext_col_lbl', 'int_col_lbl', 'brand_lbl', 'model_year']
X_scaled = df.drop(columns=excluded_columns)
y_scaled = df['price']

# 60/40 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Ordinary least-squares fit on the training split; `fit` returns the
# estimator itself, and the bare name at the end keeps it as the cell output.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg

In [None]:
# Predictions for both splits, used by the evaluation cell.
y_pred_train, y_pred_test = (
    linear_reg.predict(split) for split in (X_train, X_test)
)

In [None]:
# Root-mean-squared error and R^2 on both splits.
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecate and then remove.
# The target was MinMax-scaled earlier, so the RMSE is in scaled price units.
train_rmse = float(np.sqrt(mean_squared_error(y_train, y_pred_train)))
test_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred_test)))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# The labels say RMSE because that is what is actually being reported.
print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")
print(f"Training R2 Score: {train_r2}")
print(f"Testing R2 Score: {test_r2}")

In [None]:
# Read the sample submission again so `sf` is a fresh, unmodified frame
# before the submission is assembled.
sample_submission_url = 'https://github.com/robitussin/CCMACLRL_EXAM/blob/a46a4e2a001dedaefc9b431d480b508ce86c2d96/datasets/sample_submission.csv?raw=true'
sf = pd.read_csv(filepath_or_buffer=sample_submission_url)

In [None]:
# Print the column summary of the test frame after feature engineering.
dt.info()

In [None]:
# Print the column summary of the training frame after feature engineering.
df.info()

In [None]:
# Remove the test columns that were also excluded from the training features.
# Rebinding (instead of an in-place drop) together with errors='ignore'
# lets this cell be re-run without a KeyError; `columns=` already names the
# axis, so the extra `axis=1` argument is not needed.
dt = dt.drop(columns=['brand_lbl', 'ext_col_lbl', 'int_col_lbl', 'model_year'], errors='ignore')

In [None]:
# Read the ids without mutating `sf` (so the cell can be re-run) and without
# shadowing the builtin `id`.
# NOTE(review): assumes the rows of `sf` are in the same order as `dt` -- confirm.
submission_ids = sf['id']

# Put the test features in exactly the column order the model was fitted on,
# as floats so the rescaling arithmetic below is valid for every column.
features_test = dt[X_train.columns].astype(float)

# The model was trained on MinMax-scaled features, but `dt` was never passed
# through the scaler. Apply the fitted transform (x * scale_ + min_) to the
# feature columns the scaler knows about.
scaled_cols = list(non_boolean_numerical_features)
col_scale = pd.Series(scaler.scale_, index=scaled_cols)
col_offset = pd.Series(scaler.min_, index=scaled_cols)
shared_cols = [col for col in features_test.columns if col in col_scale.index]
features_test[shared_cols] = (
    features_test[shared_cols] * col_scale[shared_cols] + col_offset[shared_cols]
)

# The target was scaled as well, so the raw predictions live in [0, 1]-style
# units; invert the scaling to get prices back in their original units.
y_pred_scaled = linear_reg.predict(features_test)
price_pos = scaled_cols.index('price')
y_pred = (y_pred_scaled - scaler.min_[price_pos]) / scaler.scale_[price_pos]

# Create a submission DataFrame
# NOTE(review): the prediction column is named 'class' although the target is
# a price -- confirm the header expected by the sample submission.
submission_df = pd.DataFrame({
    'id': submission_ids,
    'class': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")