In [None]:
#Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import xticks
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.formula.api import ols
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
import statsmodels.api as sm
import warnings
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the car price data set.
# Forward slashes are used so the relative path resolves on Windows, Linux
# and macOS alike (a backslash is an ordinary filename character outside
# Windows). read_csv already returns a DataFrame, so no extra wrapping is needed.
df = pd.read_csv('../Project/CarPrice.csv')
df.head()

In [None]:
# The car_ID column is dropped before any analysis is done.
df = df.drop(columns="car_ID")

# Column overview followed by summary statistics of the numeric columns
df.info()
df.describe()

The data originally had 26 columns and 205 rows. We dropped the "car_ID" column because it is not needed for the analysis.

In [None]:
# Histogram of the target variable to inspect its distribution
sns.histplot(data=df, x='price')

From the graph, we can see that the distribution of the price is right-skewed, which suggests that a log-linear regression model might fit better than a simple linear regression.

In [None]:
# Split CarName on spaces once: the first token becomes the brand and the
# second token the model, both upper-cased so spelling variants that differ
# only in case collapse into one value.
name_parts = df['CarName'].str.split(' ')
df['Brand'] = name_parts.str.get(0).str.upper()
# .str.get(1) yields NaN for names that consist of a single token
df['Model'] = name_parts.str.get(1).str.upper()

# CarName is now redundant
df = df.drop(columns=["CarName"])

# Preview the new columns without dumping the whole frame
df.head()

In [None]:
# Distinct brand names, to spot spelling variants
df['Brand'].unique()


In [None]:
# Five brand names are spelled incorrectly (or abbreviated) in the data.
# Each key is the wrong spelling, each value the corrected brand name.
Brand_dict = {
    'TOYOUTA': 'TOYOTA',
    'MAXDA': 'MAZDA',
    # key and value were swapped before, which renamed the correctly spelled
    # PORSCHE rows to the misspelling instead of the other way round
    'PORCSHCE': 'PORSCHE',
    'VOKSWAGEN': 'VOLKSWAGEN',
    'VW': 'VOLKSWAGEN',
}

# Correct the brand names in the dataframe; values that are not keys of
# Brand_dict are left unchanged.
df['Brand'] = df['Brand'].replace(Brand_dict)

# Check again that every brand now appears under a single spelling
df.Brand.unique()

In [None]:
def _risk_label(symbol):
    """Translate a symboling score into a coarse risk bucket.

    Scores above 1 are "High Risk", scores from 0 to 1 inclusive are
    "Medium Risk", and everything else is "Low Risk".
    """
    if symbol > 1:
        return "High Risk"
    if symbol >= 0:  # here the score is known to be at most 1
        return "Medium Risk"
    return "Low Risk"


df['RiskRate'] = df['symboling'].map(_risk_label)
df

In [None]:
# Mean price per level of every categorical feature, one bar chart per feature.
charts = [
    {'title': "Brand Distribution", 'x': 'Brand', 'y': 'price'},
    {'title': "Fuel Type Distribution", 'x': 'fueltype', 'y': 'price'},
    {'title': "Aspiration Distribution", 'x': 'aspiration', 'y': 'price'},
    {'title': "Door Number Distribution", 'x': 'doornumber', 'y': 'price'},
    {'title': "Car Body Distribution", 'x': 'carbody', 'y': 'price'},
    {'title': "Drivewheel Distribution", 'x': 'drivewheel', 'y': 'price'},
    {'title': "Engine Location Distribution", 'x': 'enginelocation', 'y': 'price'},
    {'title': "Engine Type Distribution", 'x': 'enginetype', 'y': 'price'},
    {'title': "Cylinder Number Distribution", 'x': 'cylindernumber', 'y': 'price'},
    {'title': "Fuel System Distribution", 'x': 'fuelsystem', 'y': 'price'},
    {'title': "Risk Rate Distribution", 'x': 'RiskRate', 'y': 'price'}
]

# Size the grid from the number of charts. A fixed 3x3 grid has only nine
# slots, so the last two of the eleven charts were never drawn.
n_cols = 3
n_rows = -(-len(charts) // n_cols)  # ceiling division

# 20 inches of height per row keeps the proportions of the former 80x60 figure
fig, axes = plt.subplots(n_rows, n_cols, figsize=(80, 20 * n_rows))
fig.subplots_adjust(wspace=0.4, hspace=0.4)

flat_axes = axes.ravel()
for ax, chart in zip(flat_axes, charts):
    sns.barplot(x=chart['x'], y=chart['y'], data=df, ax=ax)
    ax.set_title(chart['title'], fontsize=40)
    ax.set_xlabel(chart['x'], fontsize=35)
    ax.set_ylabel(chart['y'], fontsize=35)
    ax.tick_params(axis='x', labelsize=40, labelrotation=90)

# Hide the grid slots that are left over after the last chart
for ax in flat_axes[len(charts):]:
    ax.axis('off')

# The figure size is set on this figure only; the global rcParams default is
# no longer overwritten, so other figures in the notebook are unaffected.
plt.show()


In [None]:

# Scatter plot of price against every numeric feature, three panels per row.
columns = ['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize',
           'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm',
           'citympg', 'highwaympg']

num_cols = 3
num_plots = len(columns)
num_rows = (num_plots - 1) // num_cols + 1  # rows needed to hold all panels

fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 12))
plt.subplots_adjust(hspace=0.8, wspace=0.8)

for position, feature in enumerate(columns):
    grid_row, grid_col = divmod(position, num_cols)
    panel = axes[grid_row, grid_col]
    panel.set_title(f"{feature.capitalize()} Distribution")
    sns.scatterplot(x=df[feature], y='price', data=df, ax=panel)

# Remove the unused panels at the end of the last row, if there are any
used_in_last_row = num_plots % num_cols
if used_in_last_row:
    for grid_col in range(used_in_last_row, num_cols):
        fig.delaxes(axes[-1, grid_col])

plt.show()


In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds text columns at this point; a correlation is only
# defined for numbers, and pandas 2.x raises instead of silently skipping
# non-numeric columns, so the numeric columns are selected explicitly.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize = (16, 10))
sns.heatmap(numeric_df.corr(), annot = True, cmap="YlGnBu")
plt.show()

As the correlation matrix shows, some of the independent variables are significantly correlated with each other, so using all of them in the regression model would distort the results.


In [None]:
# Split the columns by dtype: object/category columns are treated as
# categorical, every other column as continuous.
non_numeric_kinds = ['object', 'category']
categorical_columns = df.select_dtypes(include=non_numeric_kinds).columns
continues_columns = df.select_dtypes(exclude=non_numeric_kinds).columns

# Print each list under a bold (ANSI escape) heading
for heading, names in (("\033[1mCategorical\033[0m", categorical_columns),
                       ("\033[1mContinues\033[0m", continues_columns)):
    print(heading)
    print(names)


In [None]:
# Outlier treatment: cap every column in continues_columns at its Tukey
# fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) instead of removing rows.
# NOTE(review): the loop covers all of continues_columns, which appears to
# include the target `price` as well — confirm that capping it is intended.
for column in continues_columns:
    # Both quartiles are taken in one call, before the column is modified
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb = q1 - 1.5 * iqr
    ub = q3 + 1.5 * iqr

    # clip() replaces values below lb with lb and values above ub with ub.
    # It returns a new Series, so an integer column can be upcast to float
    # when a fence is fractional, rather than writing floats into an integer
    # column in place.
    df[column] = df[column].clip(lower=lb, upper=ub)

In [None]:
# Count missing values in price after the outlier treatment;
# an output of 0 means no NaN was introduced.
df['price'].isna().sum()

In [None]:
# NOTE(review): both transformers below are fitted on every row of df, while
# the train/test split is only made afterwards from df_fit, so statistics of
# the test rows influence the training features. Consider splitting first.

# Standardise the continuous columns
scaler = StandardScaler()
Sdata = scaler.fit_transform(df[continues_columns])
S_df = pd.DataFrame(Sdata, columns=continues_columns)

# One-hot encode the categorical columns into dense indicator columns
encoder = OneHotEncoder(sparse_output=False)
Edata = encoder.fit_transform(df[categorical_columns])
encoded_names = encoder.get_feature_names_out(categorical_columns)
E_df = pd.DataFrame(Edata, columns=encoded_names)

# Scaled numeric columns first, indicator columns after them
df_fit = pd.concat([S_df, E_df], axis=1)
print(df_fit)

In [None]:
# Shuffle all rows with a fixed seed, then cut the shuffled frame into the
# first 70% (training) and the remaining 30% (testing).
df_random = df_fit.sample(frac=1, random_state=42)
Div_index = int(0.7 * df_random.shape[0])
train_df = df_random.iloc[:Div_index]
test_df = df_random.iloc[Div_index:]

In [None]:
# Linear regression, model 1: price regressed on every available feature.
# NOTE(review): sm.OLS adds no intercept by itself and none is added here.
X = train_df.drop(columns='price')
y = train_df['price']

model_1 = sm.OLS(endog=y, exog=X)
result = model_1.fit()
result.summary()

As we can see, the model's R² is 1, which would mean that the model explains price perfectly, while the adjusted R² is NaN. This is the result of a lack of degrees of freedom: we have 143 observations and 217 different independent variables. We should eliminate every variable for which we cannot reject the hypothesis that its coefficient is 0.
For that purpose we use the stepwise_regression library. (Jupyter was unable to install and call the functions of stepwise_regression, so we added them here manually.)

In [None]:
# library taken from https://github.com/AakkashVijayakumar/stepwise-regression/blob/master/stepwise_regression/step_reg.py
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Select features by forward stepwise regression on OLS p-values.

    Starting from no features, every remaining column of ``X`` is tried in
    turn: an OLS model (with an added constant) is fitted on the features
    included so far plus the candidate, and the candidate's p-value is
    recorded. The candidate with the smallest p-value is added when that
    p-value is below ``threshold_in``; the search stops as soon as no
    candidate qualifies.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one column per feature.
    y : array-like
        Target values, passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float
        A feature is added only if its p-value is strictly below this value.
    verbose : bool, default False
        Print each feature and its p-value as it is added.

    Returns
    -------
    list
        Column labels of the selected features, in the order they were added.
    """
    included = []
    while True:
        # Walk the candidates in X's own column order (not via a set) so that
        # ties between equal p-values are resolved the same way on every run.
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break

        # An explicit float dtype keeps min()/idxmin() numeric; without it
        # the Series may be created with object dtype.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        best_pval = new_pval.min()
        # Written as "not <" so that a NaN p-value also ends the search
        if not best_pval < threshold_in:
            break

        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return included

In [None]:
# Stepwise (forward) selection of the significant features on the training set
dfc = train_df

# Target and feature matrix
y = dfc['price']
X = dfc.drop(columns='price')

# NOTE(review): this rebinds the name `forward_regression` from the function
# to its result (a list), so the cell cannot be run a second time without
# re-running the cell that defines the function. The name is kept because the
# Model_2 cell reads the selected features through it.
forward_regression = forward_regression(X, y, threshold_in=0.025)
print("Selected Features (forward_regression):", forward_regression, sep="\n")


The following features were selected by the stepwise selection:
['enginesize', 'curbweight', 'Brand_PORCSHCE', 'Brand_BMW', 'cylindernumber_four', 'Brand_BUICK', 'horsepower', 'Model_RX-7', 'carbody_sedan', 'Model_DAYZ', 'carbody_convertible', 'wheelbase', 'highwaympg', 'Brand_SAAB', 'Model_504(SW)', 'Model_304', 'Model_CENTURY']

In [None]:
# Model 2: only the features chosen by the stepwise selection
X2 = train_df[forward_regression]
y = train_df['price']

# NOTE(review): fitted without an intercept (no constant column is added)
model_2 = sm.OLS(endog=y, exog=X2)
result2 = model_2.fit()
result2.summary()

The R² of the model is 95%, which is high, but we still have features with insignificant coefficients. So we remove the variable with the highest p-value, enginesize, in Model 3.

In [None]:
# Model 3: Model 2 without enginesize
X3 = X2.drop(columns='enginesize')
y = train_df['price']
model_3 = sm.OLS(endog=y, exog=X3)
result3 = model_3.fit()
result3.summary()

In [None]:
# Model 4: Model 3 without Model_504(SW)
X4 = X3.drop(columns='Model_504(SW)')
y = train_df['price']
model_4 = sm.OLS(endog=y, exog=X4)
result4 = model_4.fit()
result4.summary()

In [None]:
# Model 5: Model 4 without Model_304
X5 = X4.drop(columns='Model_304')
y = train_df['price']
model_5 = sm.OLS(endog=y, exog=X5)
result5 = model_5.fit()
result5.summary()

In [None]:
# Model 6: Model 5 without Model_CENTURY
X6 = X5.drop(columns='Model_CENTURY')
y = train_df['price']
model_6 = sm.OLS(endog=y, exog=X6)
result6 = model_6.fit()
result6.summary()

In [None]:
# Model 7: Model 6 without highwaympg
X7 = X6.drop(columns='highwaympg')
y = train_df['price']
model_7 = sm.OLS(endog=y, exog=X7)
result7 = model_7.fit()
result7.summary()

In [None]:
# Model 8: Model 7 without Brand_SAAB
X8 = X7.drop(columns='Brand_SAAB')
y = train_df['price']
model_8 = sm.OLS(endog=y, exog=X8)
result8 = model_8.fit()
result8.summary()

In [None]:
# Model 9: Model 8 without wheelbase
X9 = X8.drop(columns='wheelbase')
y = train_df['price']
model_9 = sm.OLS(endog=y, exog=X9)
result9 = model_9.fit()
result9.summary()

In [None]:
# Model 10: Model 9 without carbody_convertible
X10 = X9.drop(columns='carbody_convertible')
y = train_df['price']
model_10 = sm.OLS(endog=y, exog=X10)
result10 = model_10.fit()
result10.summary()

In [None]:
# Evaluate Model 10 on the held-out rows, using the same columns it was fitted on
Columns = X10.columns
X_test = test_df[Columns]
y_test = test_df['price']
y_pred = result10.predict(X_test)

# RMSE = square root of the mean squared residual
residuals = y_test - y_pred
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
dflr = train_df

# Target and feature matrix taken from the training rows
y = dflr['price']
X = dflr.drop(columns='price')

# Baseline linear regression fitted on every feature
model1 = LinearRegression().fit(X, y)

# The part below was written with the help of ChatGPT.

model = LinearRegression()

# Recursive feature elimination with cross-validation, then reduce X to the
# columns that were kept
rfecv = RFECV(estimator=model)
rfecv.fit(X, y)
X_selected = rfecv.transform(X)

# 5-fold cross-validated R^2 of a linear model on the kept columns
cv_scores = cross_val_score(model, X_selected, y, cv=5, scoring='r2')
print("Cross-validation scores:", cv_scores)
print("Average R^2:", np.mean(cv_scores))

# Names of the columns kept by RFECV
selected_features = X.columns[rfecv.get_support()]
print("Selected Features:", selected_features)




In [None]:
# Out-of-sample RMSE of the RFECV-selected linear model.
x_t = test_df.drop('price', axis=1)
x_test = rfecv.transform(x_t)  # keep only the columns RFECV selected
y_test = test_df['price']

# Predict with the estimator RFECV fitted on the training rows. Re-fitting the
# model on (x_test, y_test) before predicting x_test, as was done before,
# reports an in-sample error rather than a test error.
ypredict = rfecv.estimator_.predict(x_test)

mse = mean_squared_error(y_test, ypredict)
print("RMSE:", mse**0.5)