In [177]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from custom_LR import LinearRegressionModel as CustomLR
from sklearn.metrics import r2_score,mean_absolute_error

In [178]:
df=pd.read_csv("dataset/used_car_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1725 entries, 0 to 1724
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 1725 non-null   int64  
 1   Id                         1725 non-null   int64  
 2   year                       1725 non-null   int64  
 3   brand                      1725 non-null   object 
 4   full_model_name            1725 non-null   object 
 5   model_name                 1725 non-null   object 
 6   price                      1725 non-null   float64
 7   distance_travelled(kms)    1725 non-null   float64
 8   fuel_type                  1725 non-null   object 
 9   city                       1725 non-null   object 
 10  brand_rank                 1725 non-null   int64  
 11  car_age                    1725 non-null   float64
 12  distance below 30k km      1725 non-null   int64  
 13  new and less used          1725 non-null   int64

In [179]:
df.head()


Unnamed: 0.1,Unnamed: 0,Id,year,brand,full_model_name,model_name,price,distance_travelled(kms),fuel_type,city,...,new and less used,inv_car_price,inv_car_dist,inv_car_age,inv_brand,std_invprice,std_invdistance_travelled,std_invrank,best_buy1,best_buy2
0,0,0,2016,Honda,Honda Brio S MT,Brio,425000.0,9680.0,Petrol,Mumbai,...,0,2.352941e-06,0.000103,0.2,0.142857,0.143417,0.03573,0.132143,677.134239,0.0
1,1,1,2012,Nissan,Nissan Sunny XV Diesel,Sunny,325000.0,119120.0,Diesel,Mumbai,...,0,3.076923e-06,8e-06,0.111111,0.090909,0.188859,0.002496,0.079545,37.501318,0.0
2,2,2,2017,Toyota,Toyota Fortuner 2.8 4x2 MT [2016-2020],Fortuner,2650000.0,64593.0,Diesel,Thane,...,0,3.773585e-07,1.5e-05,0.25,1.0,0.019416,0.004978,1.0,96.64602,0.0
3,3,3,2017,Mercedes-Benz,Mercedes-Benz E-Class E 220d Expression [2019-...,E-Class,4195000.0,25000.0,Diesel,Mumbai,...,1,2.38379e-07,4e-05,0.25,0.5,0.010692,0.013563,0.49375,71.604306,71.604306
4,4,4,2012,Hyundai,Hyundai Verna Fluidic 1.6 CRDi SX,Verna,475000.0,23800.0,Diesel,Mumbai,...,0,2.105263e-06,4.2e-05,0.111111,0.071429,0.127871,0.014269,0.059821,109.150857,0.0


## 🔹 Column "Unnamed:0" has the same values of "ID" column -->I can drop one of this columns

In [180]:
df.drop(columns="Unnamed: 0",inplace=True)

In [181]:
category_cols=df.select_dtypes(include="object").columns.tolist()
print("Object columns in dataset:" ,category_cols)

Object columns in dataset: ['brand', 'full_model_name', 'model_name', 'fuel_type', 'city']


## 🎯Key features are object type =>is needed to transform to numeric values to increase precision of prediction model 

In [None]:
print(df["fuel_type"].unique())

# Petrol + 1 means that this car can use petrol but also another fuel type
# CNG + 1 means that this car can use Compressed Natural Gas(CNG) but also another fuel type

['Petrol' 'Diesel' 'Petrol + 1' 'CNG + 1' 'Hybrid']


# Data Encoding Techniques in Machine Learning

- **Label Encoding**
- **Target Encoding**
- **One-Hot Encoding**


## Encoding Techniques Explained
### 1. Label Encoding
- Converts categorical values into numerical labels.
- Suitable for **ordinal data** where order matters (e.g., education level: High School < Bachelor < Master).
- Example: `brand` and `model_name` are encoded using this method.

### 2. Target Encoding
- Maps categorical variables to the mean of the target variable (e.g., price).
- Useful when the categorical variable has many unique values.
- Risk of overfitting; recommended when dealing with large datasets.
- Example: `full_model_name` and `city` are encoded this way.

### 3. One-Hot Encoding
- Creates binary columns for each category in the feature.
- Suitable for **nominal data** where no intrinsic order exists (e.g., fuel types: Petrol, Diesel, Electric).
- Works well when the number of categories is small.
- Example: `fuel_type` is encoded using one-hot encoding.

## When to Use Each Encoding
| Encoding Type | Best Use Case | Drawbacks |
|--------------|--------------|-----------|
| Label Encoding | Ordinal data | Can mislead models by implying a numerical relationship |
| Target Encoding | Large categorical features with target dependency | Risk of overfitting |
| One-Hot Encoding | Small categorical features, unordered categories | Can create a large number of features |


In [183]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

label_enc = LabelEncoder()
target_enc = TargetEncoder()
onehot_enc = OneHotEncoder(sparse_output=False, drop="first")


df["brand"] = label_enc.fit_transform(df["brand"])
df["model_name"] = label_enc.fit_transform(df["model_name"])

df["full_model_name"] = target_enc.fit_transform(df["full_model_name"], df["price"])
df["city"] = target_enc.fit_transform(df["city"], df["price"])

fuel_type_encoded = pd.DataFrame(
    onehot_enc.fit_transform(df[["fuel_type"]]),
    columns=onehot_enc.get_feature_names_out(["fuel_type"]),
)

df = df.drop(columns=["fuel_type"]).join(fuel_type_encoded)

In [184]:
corr_matrix=df.corr()["price"].abs().sort_values(ascending=False)
corr_matrix

price                        1.000000
full_model_name              0.969703
std_invprice                 0.517723
inv_car_price                0.517723
city                         0.336974
year                         0.288483
car_age                      0.288483
fuel_type_Diesel             0.269330
inv_car_age                  0.267973
fuel_type_Petrol             0.260109
new and less used            0.219786
distance below 30k km        0.212197
std_invrank                  0.185660
inv_brand                    0.185660
brand_rank                   0.164591
distance_travelled(kms)      0.137351
brand                        0.112988
best_buy1                    0.106855
Id                           0.105696
model_name                   0.082217
std_invdistance_travelled    0.081735
inv_car_dist                 0.081735
fuel_type_Petrol + 1         0.034109
fuel_type_Hybrid             0.008721
best_buy2                    0.008077
Name: price, dtype: float64

# 🚗 Used Car Dataset Analysis

## 📊 Correlation Insights
After analyzing the correlation between the **target column (`price`)** and other columns, I noticed that some columns represent the same concept but in slightly different ways:

- **`std_invprice`** and **`inv_car_price`**
- **`car_age`** and **`year`**
- **`std_invdistance_travelled`** and **`inv_car_dist`**
- **`inv_brand`** and **`brand_rank`**

🔹 **Decision:** I will **not** drop these columns, as they may be useful during model training.  

---

## 💡 Business Perspective on High-Correlation Columns

From a business standpoint, the columns with the highest correlation index are:

0. **`full_model_name`**  
   - In some rows full model name include:fuel_type,year,type_of_traction,brand,and summing these parameters result high correlation

1. **`std_invprice`** 📈  
   - Directly calculated from the actual car price.
   
2. **`year`** 📅  
   - Newer cars tend to have **higher prices**.  
   - However, the **full model name** and **brand** are also impact price.
   
3. **`fuel_type`** ⛽  
   - Some buyers prioritize fuel type due to price variations.

---

## 🔍 Missing Features in the Dataset
To improve the dataset’s usefulness, it should also include:

- **Engine capacity** (e.g., 1.6L, 2.0L) for all cars ⚙️  
- **Fuel consumption per 100 km** ⛽  
- **Environmental classification** (e.g., Euro 6 / Euro 5 / Euro 4) 🌍  

---

🚀 **Conclusion:** These additional features would enhance model accuracy and make the dataset more valuable for predictive analysis.


In [185]:
print("Max value in X_train:\n", df.max())
print("Min value in X_train:\n", df.min())

Max value in X_train:
 Id                           1.724000e+03
year                         2.021000e+03
brand                        3.000000e+01
full_model_name              3.212941e+06
model_name                   1.680000e+02
price                        1.470000e+07
distance_travelled(kms)      7.900000e+05
city                         3.800396e+06
brand_rank                   8.100000e+01
car_age                      3.100000e+01
distance below 30k km        1.000000e+00
new and less used            1.000000e+00
inv_car_price                1.600000e-05
inv_car_dist                 2.857143e-03
inv_car_age                           inf
inv_brand                    1.000000e+00
std_invprice                 1.000000e+00
std_invdistance_travelled    1.000000e+00
std_invrank                  1.000000e+00
best_buy1                    2.477518e+03
best_buy2                    2.477518e+03
fuel_type_Diesel             1.000000e+00
fuel_type_Hybrid             1.000000e+00
fuel_type_P

In [186]:
num_inf_values = np.isinf(df["inv_car_age"]).sum()
print(f"Number of inf values in 'inv_car_age': {num_inf_values}")

Number of inf values in 'inv_car_age': 21


### Remove rows where have inf/-inf values in inv_car_age column

In [None]:
df = df[~np.isinf(df["inv_car_age"]) & df["inv_car_age"].notna()].reset_index(drop=True)

## Select Significant Features more than 0.5

In [188]:
selected_features=corr_matrix[(corr_matrix>0.5)].index.tolist()
df_selected=df[selected_features]
selected_features


['price', 'full_model_name', 'std_invprice', 'inv_car_price']

## 📌Define and Verify X_train X_test y_train y_test for NaN Values 

In [189]:
X=df.drop(columns=["price","full_model_name"])
#drop full_model_name because correlation is 0.96
y=df["price"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [191]:
num_nan_values = df.isna().sum()
print(num_nan_values[num_nan_values > 0])

Series([], dtype: int64)


In [192]:
print(X_train.isna().sum()[X_train.isna().sum() > 0])
print(X_test.isna().sum()[X_test.isna().sum() > 0])

Series([], dtype: int64)
Series([], dtype: int64)


In [193]:
print(y_train.isna().sum())
print(y_test.isna().sum())

0
0


## 📌Train and Evaluate Full Dataset

### Custom Liniar Regression

In [194]:
model_custom = CustomLR(learning_rate=0.001, iterations=1000, regularization=None)
model_custom.fit(X_train, y_train)
y_pred=model_custom.predict(X_test)
score_custom_LR = model_custom.score(y_test,y_pred)
print("Liniar Regression Score of the model using custom implementation is:",score_custom_LR,)
print("MAE Custom Liniar Regression:",mean_absolute_error(y_test,y_pred))

  gradients = (X.T @ (X @ self.theta - y)) / m  # Compute gradient


ValueError: Theta contains NaN or Inf values during training.

# Investigating NaN Values in Custom Linear Regression


After some analysis, I discovered that these NaN values are most likely generated during the calculations in either of the following methods:

- **`predict`**: The method responsible for making predictions based on the trained model.
- **`fit`**: The method used for training the model on the given data.

### Sklearn Liniar Regression

In [None]:
model_sklearn=LinearRegression()
model_sklearn.fit(X_train,y_train)


score_sklearn=model_sklearn.score(X_test,y_test)
print("Liniar Regression Score of the model using sklearn is:",score_sklearn)

Liniar Regression Score of the model using sklearn is: 0.39187419993876704


## 📌Train and Evaluate selected features from dataset

In [195]:
X = df_selected.drop(columns=["price", "full_model_name"])
# drop full_model_name because correlation is 0.96

y=df_selected["price"]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [197]:
model=LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
score=model.score(X_test,y_test)
score

0.19914574213573655

In [196]:
custom_model = CustomLR(learning_rate=0.001, iterations=1000, regularization=None)
custom_model.fit(X_train,y_train)
y_pred_custom=custom_model.predict(X_test)

score_custom=r2_score(y_test,y_pred)
score_custom

0.19914574213573655

Towards the end of my work with this dataset, I discovered repetitive columns that don't play a significant role, but I didn't remove them to maintain a somewhat appropriate result for linear regression. Initially, I used LabelEncoder for all object-type columns, and the highest correlation was 0.51. Then I realized I might not be using the most appropriate encoding method for some columns,so I modified the strategy for training and used (LabelEncoder TargetEncoder and OneHotEncoder)

Correlations between 0.5 and 0.8 are extremely rare, which is why the result obtained with Sklearn.LinearRegression was low. If I had included the full_model_name correlation with the target column (which is 0.96), the r2_score would have increased to 0.92.

Regarding my Linear Regression model, the implementation is correct, during calculations are producing some numerical issues,this is the cause why r2_score do not work and have a error.

> ⚠ **Warning**  
> Input has NaN values.  
> `ValueError: Theta contains NaN or Inf values during training.`
