In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt # data visualization
sns.set_theme(style="darkgrid")
sns.set_palette("Set2")
plt.rcParams['figure.figsize']=(8,3)

import time
import re
import pickle

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# List every file available under the Kaggle input directory (one full path per line).
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)
        

In [None]:
cdf = pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv")
cdf_test = pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv")

In [None]:
cdf

# **1. Knowing the Data:~🔍🏂**

In [None]:
cdf.set_index('id', inplace=True)
cdf_test.set_index('id', inplace=True)

In [None]:
test_idx = cdf_test.index #to be used while submitting

In [None]:
display(cdf.columns)
display(cdf_test.columns)

In [None]:
cdf.rename(columns={"ext_col":"ext_color", "int_col":"int_color", "milage":"mileage"}, inplace=True)
cdf_test.rename(columns={"ext_col":"ext_color", "int_col":"int_color", "milage":"mileage"}, inplace=True)

In [None]:
display(cdf.sample(5))
display(cdf_test.sample(5))

In [None]:
print(f"Shape of training data: {cdf.shape}, \nShape of test data: {cdf_test.shape}")

In [None]:
cdf.isnull().sum()

In [None]:
cdf_test.isnull().sum()

In [None]:
display(cdf[cdf.duplicated()])
display(cdf_test[cdf_test.duplicated()])

### **SUMMARY from above //---**
* The train dataset has 13 columns; with the "id" column set as the index, that is effectively 12. The test data has 11.
* There are **188533 rows in the train data**, while the **test dataset has about 1.25 lakh (~125,000) rows.**
* **No duplicates** in either dataset. **Both have three columns that contain some null values.**
* "price" is the target column. The **ext_col & int_col columns were renamed to ext_color & int_color** (Exterior & Interior Color), and `milage` was renamed to `mileage`.

## **1.1. Details of the Numerical Columns:**

In [None]:
cdf.describe()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 3), sharey=True)
fig.suptitle('Box Plots of --')

# mileage
sns.boxplot(ax=axes[0], x=cdf["mileage"])
axes[0].set_title("Mileage of Cars")

# price
sns.boxplot(ax=axes[1], x=cdf["price"])
axes[1].set_title("Price of Cars")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 4))

# mileage
sns.histplot(x=cdf["mileage"],ax=axes[0],kde=True, bins=25)
axes[0].set_title("Mileage of Cars")

# price
sns.histplot(x=cdf["price"],ax=axes[1], bins=50)
axes[1].set_title("Price of Cars")
plt.show()

In [None]:
cdf.model_year.value_counts().head(5)

In [None]:
plt.rcParams['figure.figsize']=(12,4)
yvc = cdf.model_year.value_counts().sort_index()
yr = sns.lineplot(x=list(yvc.index), y=yvc)
plt.xticks(list(yvc.index)[::2], rotation=90)

for i in range(0,len(yvc),2):
    yr.vlines(x=yvc.index[i],ymin=0,ymax=yvc.values[i],colors='grey',)
    yr.text(yvc.index[i],yvc.values[i], f"{yvc.values[i]}",ha='center',va="top", fontweight='bold', fontsize=9)

plt.title("No. of cars per model_year")
yr.set_yticklabels([])
plt.show()

## **1.2. Details of the Categorical Columns:**

In [None]:
cdf.describe(include="object")

In [None]:
cdf.accident.value_counts().plot(kind="pie", title="Reports for Accident", ylabel="",
                                y=cdf.accident.value_counts(), figsize=(3,3),
                                autopct="%1.01f%%")
plt.show()

##### Let's make a little change to replace these long values "At least 1 accident or damage reported" & "None reported"

In [None]:
# Rename `accident` to `accident_reported` and shorten its long labels to "Yes"/"No".
# "No" is accepted alongside the raw "None reported" label so that re-running this
# cell does not flip already-converted "No" rows to "Yes".
# NOTE(review): every other value — including missing values — is labelled "Yes";
# confirm that treating an unknown accident history as "Yes" is intended.
NO_ACCIDENT_LABELS = ["None reported", "No"]

for frame in (cdf, cdf_test):
    frame.rename(columns={"accident":"accident_reported"}, inplace=True)
    frame["accident_reported"] = np.where(
        frame["accident_reported"].isin(NO_ACCIDENT_LABELS), "No", "Yes"
    )

In [None]:
cdf.sample(3)

In [None]:
cdf.fuel_type.value_counts()

In [None]:
cdf.engine.value_counts()

In [None]:
cdf.transmission.value_counts(normalize=True)*100

In [None]:
display(cdf.ext_color.value_counts())
display(cdf.int_color.value_counts())

In [None]:
display(cdf.clean_title.value_counts())
cdf.clean_title.isnull().sum()

### **SUMMARY from above //---**
* **The mileage column is right-skewed,** as it generally should be. **The price column has a lot of outliers,** probably due to the presence of a few supercars.
* There are 12 cars of model year 1974. Most cars are of years between 2015 & 2022.
* **23% cars have at least one accident or damage being reported.**
* **`engine` & `transmission` columns have so many unique values.** Some column transformation can be done on those columns like creating a few smaller columns like Cylinder No, or Capacity etc.
* Color columns have many unique values as well, black being the most frequent for both Exterior and Interior color.

# **2. Data Cleaning and Processing:~ 🧹🏗️**

In [None]:
cdf.isnull().sum()

## **2.1. Managing the `Fuel type` column** --

In [None]:
cdf.fuel_type.unique()

In [None]:
cdf_test.fuel_type.unique()

### >> Data transformation ~

In [None]:
cdf['fuel_type'] = cdf['fuel_type'].replace(['–', 'not supported'], 'None')
cdf_test['fuel_type'] = cdf_test['fuel_type'].replace(['–', 'not supported'], 'None')

### >> Null Value Handling ~ 

In [None]:
cdf.fuel_type = cdf.fuel_type.fillna("Unknown")
cdf_test.fuel_type = cdf_test.fuel_type.fillna("Unknown")

In [None]:
# Make the fuel-type labels identifier-friendly: spaces and hyphens become underscores
# (e.g. "Plug-In Hybrid" -> "Plug_In_Hybrid"). A single vectorized replacement is used
# instead of two row-wise lambdas; it also leaves missing values untouched rather than
# raising on them.
for frame in (cdf, cdf_test):
    frame["fuel_type"] = frame["fuel_type"].str.replace(r"[ -]", "_", regex=True)

## **2.2. Managing the `Transmission` column** --

In [None]:
cdf.transmission.unique()

### >> Data Extraction ~
#### The transmission column has **more than 50 unique values**. That is because 2-3 different pieces of information are combined in this one column. So the individual pieces, namely the **no. of gears & the transmission type, are extracted from it.**

In [None]:
cdf.transmission = cdf.transmission.replace({"Single-Speed Fixed Gear":"1-Speed Fixed Gear"})
cdf_test.transmission = cdf_test.transmission.replace({"Single-Speed Fixed Gear":"1-Speed Fixed Gear"})

In [None]:
def extract_gear_and_txtype(transmission_info):
    """Pull the gear-count token and the transmission-type keyword out of a raw label.

    Parameters
    ----------
    transmission_info : str
        Raw value of the ``transmission`` column, e.g. ``"6-Speed A/T"``.
        Anything that is not a string (such as a missing value) is treated as
        carrying no information.

    Returns
    -------
    tuple
        ``(gear, txtype)``. ``gear`` is the matched text, e.g. ``"6-Speed"``,
        ``"6 Speed"`` or ``"8-Spd"``, or ``None`` when no gear count is present.
        ``txtype`` is the matched keyword exactly as written in the input, or
        ``"Other"`` when no known keyword is present.
    """
    if not isinstance(transmission_info, str):
        return None, "Other"

    # The two tokens are searched for independently. A single pattern made only of
    # optional groups always "matches" the empty string at position 0, which hides
    # tokens that appear later in the label (e.g. "Automatic, 8-Spd").
    gear_match = re.search(
        r'\b\d{1,2}[\s-]?(?:speed|spd)\b',
        transmission_info,
        re.IGNORECASE,
    )

    # Longer keywords come first so that e.g. "At/Mt" is not cut short to "At".
    type_match = re.search(
        r'\b(Electronically Controlled Automatic|Transmission w/Dual Shift Mode'
        r'|Transmission Overdrive|Automatic|At/Mt|A/T|AT|M/T|MT|CVT|DCT'
        r'|Manual|Variable|Fixed)\b',
        transmission_info,
        re.IGNORECASE,
    )

    gear = gear_match.group(0) if gear_match else None
    txtype = type_match.group(1) if type_match else "Other"
    return gear, txtype

def load_gear_and_txtype(df):
    """Add ``gears`` and ``transmission_type`` columns derived from ``df.transmission``.

    Each transmission label is parsed with ``extract_gear_and_txtype``; the gear
    token is reduced to an integer and the type keyword is mapped to one
    canonical label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``transmission`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``gears`` (``None`` where no gear count was found)
        and ``transmission_type`` columns added.
    """
    # Keys are lower-case because the extractor matches case-insensitively, so
    # "AUTOMATIC", "Automatic" and "a/t" must all end up under one label.
    canonical_types = {
        "at/mt": "AMT",
        "a/t": "Automatic",
        "at": "Automatic",
        "transmission overdrive": "Automatic",
        "automatic": "Automatic",
        "m/t": "Manual",
        "mt": "Manual",
        "manual": "Manual",
        "variable": "CVT",
        "cvt": "CVT",
        "dct": "DCT",
        "fixed": "Fixed",
        "transmission w/dual shift mode": "Dual_Shift",
        "electronically controlled automatic": "Electronically_controlled",
    }

    gear = []
    transmission_type = []
    for tx in df.transmission:
        gear_text, txtype = extract_gear_and_txtype(tx)

        ngear = None
        if gear_text is not None:
            # Handles "6-Speed", "6 Speed" and "6Speed" alike: keep the leading digits.
            leading_digits = re.match(r"\d+", gear_text.strip())
            if leading_digits:
                ngear = int(leading_digits.group(0))
            elif gear_text.lower().startswith("single"):
                ngear = 1

        if txtype is not None:
            # Unknown keywords (e.g. "Other") pass through unchanged.
            txtype = canonical_types.get(txtype.lower(), txtype)

        gear.append(ngear)
        transmission_type.append(txtype)

    df["gears"] = gear
    df["transmission_type"] = transmission_type

    return df


In [None]:
cdf = load_gear_and_txtype(cdf)
cdf_test = load_gear_and_txtype(cdf_test)

In [None]:
display(cdf.gears.unique())
display(cdf_test.gears.unique())

In [None]:
display(cdf.transmission_type.unique())
display(cdf_test.transmission_type.unique())

In [None]:
cdf.transmission_type.value_counts()

## **2.3. Working with the `Engine` column** --

In [None]:
cdf.engine.nunique()

In [None]:
cdf.engine.sample(5)

### >> Data Extraction ~
#### The engine column also has a lot of unique values **(more than 50).** Again, this is because 3-4 different pieces of information are concatenated in this column. So the data related to **horsepower, capacity & no. of cylinders is extracted from it.**

In [None]:
def extract_engine_data(engine_info):
    """Extract horsepower, displacement and cylinder count from an engine description.

    Parameters
    ----------
    engine_info : str
        Raw value of the ``engine`` column, e.g.
        ``"172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel"``. Anything that is
        not a string (such as a missing value) yields three ``None`` values.

    Returns
    -------
    tuple
        ``(hp, capacity, cylinders)`` as text, each ``None`` when absent:
        ``hp`` like ``"172.0HP"``, ``capacity`` like ``"1.6L"`` or
        ``"3.5 Liter"``, ``cylinders`` as bare digits like ``"4"``.
    """
    if not isinstance(engine_info, str):
        return None, None, None

    # Each field is searched for on its own, anywhere in the text. The number must
    # not be the tail of a longer number or word, hence the look-behind.
    hp_match = re.search(
        r'(?<![\w.])\d{1,4}(?:\.\d{1,2})?\s*HP\b', engine_info, re.IGNORECASE
    )
    capacity_match = re.search(
        r'(?<![\w.])\d{1,2}(?:\.\d{1,2})?\s*(?:L|Liters?)\b', engine_info, re.IGNORECASE
    )

    # Cylinder count, most explicit wording first: "4 Cylinder", then
    # "Straight 6" / "Flat 6" / "Inline 4", then layout codes such as "V8", "I-4", "H4".
    # Valve counts such as "24V" are not matched because the digits must follow the letter.
    cylinder_patterns = (
        r'\b(\d{1,2})\s*-?\s*Cyl',
        r'\b(?:Straight|Flat|Inline)[\s-]?(\d{1,2})\b',
        r'\b[VIHW]-?(\d{1,2})\b',
    )
    cylinders = None
    for cylinder_pattern in cylinder_patterns:
        cylinder_match = re.search(cylinder_pattern, engine_info, re.IGNORECASE)
        if cylinder_match:
            cylinders = cylinder_match.group(1)
            break

    hp = hp_match.group(0) if hp_match else None
    capacity = capacity_match.group(0) if capacity_match else None
    return hp, capacity, cylinders

def load_engine_data(df):
    """Add ``horsepower``, ``engine_capacity`` and ``cylinders`` columns from ``df.engine``.

    Each engine description is parsed with ``extract_engine_data`` and the
    textual pieces are converted to floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``engine`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the three new columns; entries are ``None`` where the
        corresponding piece was not found in the description.
    """
    horsepower, capacity, cylinders = [], [], []

    for engine in df.engine:
        # A missing engine description simply yields three missing values.
        if isinstance(engine, str):
            hp, cap, cy = extract_engine_data(engine)
        else:
            hp, cap, cy = None, None, None

        if hp is not None:
            # "172.0HP" -> 172.0
            hp = float(hp.lower().split("hp")[0])

        if cap is not None:
            # "1.6L" / "3.5 Liter" -> number before the first "l"
            cap = float(cap.lower().split("l")[0])

        if cy is not None:
            cy = float(cy.split(" ")[0])

        horsepower.append(hp)
        capacity.append(cap)
        cylinders.append(cy)

    df["horsepower"] = horsepower
    df["engine_capacity"] = capacity
    df["cylinders"] = cylinders

    return df

In [None]:
cdf = load_engine_data(cdf)
cdf_test = load_engine_data(cdf_test)

In [None]:
cdf[['engine','horsepower','engine_capacity','cylinders']].sample(5)

In [None]:
cdf_test[['engine','horsepower','engine_capacity','cylinders']].sample(5)

##### nan values in the newly added columns will be fixed later.

## **2.4. Fixing the `Exterior & Interior Color` columns** --

In [None]:
cdf.ext_color.nunique()

In [None]:
cdf.ext_color.value_counts(normalize=True).head(15)

In [None]:
cdf.int_color.value_counts(normalize=True).head(15)

### >> Data transformation ~
#### The color columns also have many unique values, because several shades of the same parent color are present, e.g. both "Jet Black" & "Black" appear in the data. 
##### 
#### **The plan is to keep a set of common base colors. If a color name contains one of those base colors, it is replaced by that base color; otherwise it is marked 'uncommon'.** For example, "Jet Black" is transformed to "Black", whereas "pink", which is not in our list of base colors, becomes 'uncommon'. 

In [None]:
# Base colors are checked in this order; the first one contained in the name wins.
base_colors = ['white','black','grey','gray','blue','red','yellow','silver','green','beige','gold','orange','brown','ebony','purple']

def find_base_color(text):
    """Return the first base color contained in ``text``, or ``"uncommon"``.

    Matching is a case-insensitive substring test against ``base_colors``.
    Non-string input (for example a missing value) is reported as
    ``"uncommon"`` instead of raising.
    """
    if not isinstance(text, str):
        return "uncommon"

    lowered = text.lower()
    for color in base_colors:
        if color in lowered:
            return color
    return "uncommon"
    
def transform_color(df):
    """Collapse the exterior/interior color names of ``df`` to base colors.

    For both ``ext_color`` and ``int_color``: string values are lower-cased,
    mapped through ``find_base_color``, and the spelling "grey" is unified to
    "gray". The frame is modified in place and returned.
    """
    for column in ("ext_color", "int_color"):
        lowered = df[column].apply(
            lambda value: value.lower() if isinstance(value, str) else value
        )
        base = lowered.apply(find_base_color)
        df[column] = base.replace({"grey":"gray"})

    return df

cdf = transform_color(cdf)
cdf_test = transform_color(cdf_test)

In [None]:
cdf.ext_color.unique()

In [None]:
cdf.ext_color.value_counts(normalize=True)*100

In [None]:
cdf.int_color.value_counts(normalize=True)*100

In [None]:
cdf.sample(2)

## **2.5. Dealing with the `clean title` columns** --

In [None]:
display(cdf.clean_title.value_counts())
print("----")
display(cdf_test.clean_title.value_counts())

In [None]:
cdf[['accident_reported','clean_title']].value_counts()

### >> Handling Null values ~
#### Logically, a clean title depends on the accident/damage reports. If there is any severe damage, its value is false, whereas cars with no accident reports or only minor accidents generally have a clean title. The table above shows this.
#####  
#### So, to handle the null values in this column, the accident_reported column is used as a reference.

In [None]:
def fill_clean_title(row):
    """Return the row's ``clean_title``, inferring it from ``accident_reported`` when missing.

    A present value is returned unchanged. A missing value becomes "Yes" when
    no accident was reported and "No" when one was; for any other
    ``accident_reported`` value the missing value is returned as is.
    """
    current = row['clean_title']
    if not pd.isna(current):
        return current

    inferred_from_accident = {'No': "Yes", 'Yes': "No"}
    return inferred_from_accident.get(row['accident_reported'], current)

# Apply the function to each row
cdf.clean_title = cdf.apply(fill_clean_title, axis=1)
cdf_test.clean_title = cdf_test.apply(fill_clean_title, axis=1)

In [None]:
cdf[['accident_reported','clean_title']].value_counts()

In [None]:
cdf_test[['accident_reported','clean_title']].value_counts()

## **2.6. Managing the `brand & Model` columns** --

In [None]:
cdf.brand.unique()

In [None]:
cdf.model.nunique()

In [None]:
cdf[['brand','model']].value_counts().head(50)

In [None]:
# Ten brands with the highest median price. The price column is selected explicitly;
# `median(['price'])` would hand the list to the `numeric_only` argument and only
# works by accident.
cdf.groupby('brand')[['price']].median().sort_values("price",ascending=False).head(10)

### >> Data transformation ~
#### The cars are classified into 5 classes based on the brand's price, performance and utilization. This reduces the number of features while still preserving the brand value.

In [None]:
# Brand -> market-segment lookup table (segment name mapped to its member brands)
categories = {
    'Exotic_car': ['Bugatti', 'Ferrari', 'Lamborghini', 'McLaren', 'Rolls-Royce',
                   'Bentley', 'Aston', 'Koenigsegg', 'Pagani'],
    'Super_car': ['Porsche', 'Maserati', 'Lotus'],
    'Luxury_car': ['Mercedes-Benz', 'Audi', 'BMW', 'Genesis', 'Cadillac', 'Lincoln',
                   'Land', 'Jaguar', 'Tesla', 'Lexus', 'INFINITI', 'Acura',
                   'Polestar', 'Maybach'],
    'Premium_car': ['Volvo', 'Volkswagen', 'Buick', 'Rivian', 'RAM', 'Alfa', 'Jeep'],
    'Standard_car': ['MINI', 'Chevrolet', 'Ford', 'GMC', 'Toyota', 'Hyundai', 'Kia',
                     'Mitsubishi', 'Honda', 'Nissan', 'Mazda', 'Subaru', 'Chrysler',
                     'Lucid', 'Scion', 'smart', 'Karma', 'Plymouth', 'Suzuki', 'FIAT',
                     'Saab', 'Pontiac', 'Saturn', 'Dodge', 'Hummer', 'Mercury'],
}

def classify_brand(brand):
    """Return the segment in ``categories`` that lists ``brand``, or "Other" if none does.

    Segments are checked in the order they appear in ``categories``.
    """
    matching_segments = (
        segment for segment, members in categories.items() if brand in members
    )
    return next(matching_segments, "Other")

# Apply classification to the DataFrame
cdf['car_category'] = cdf.brand.apply(classify_brand)
cdf_test['car_category'] = cdf_test.brand.apply(classify_brand)


In [None]:
cdf_test.car_category.value_counts()

In [None]:
cdf.sample(5)

## **2.7. Handling Null values in NEW Numerical columns** --

#### The plan is to fill each null value with the mode of that column over the rows that have the same brand and model. If that mode is also null, the null is filled with the mode over the rows that share only the brand. 

In [None]:
cdf.isnull().sum()

In [None]:
# def fill_gears_na(df, row):
#     same_brand = row['brand']
#     same_model = row['model']

#     try:
#         same_gears = df[(df.brand==same_brand) & (df.model==same_model)].gears.mode()[0]
#         return same_gears
#     except:
#         return df[df.brand==same_brand].gears.mode()[0]       


# for i in range(len(cdf)):
#     if pd.isna(cdf.loc[i,'gears']):
#         cdf.loc[i,'gears'] = fill_gears_na(cdf, cdf.loc[i,:])


## THIS TAKES LOT OF TIME TO RUN

In [None]:
cdf

In [None]:
def fill_na_with_same_brand_model_mode(df, grouping_cols, target_cols):
    """Fill missing values with the mode of similar rows.

    For every column in ``target_cols`` a missing value is replaced by the most
    frequent value among rows sharing all ``grouping_cols`` (brand and model);
    if that group has no value at all, the most frequent value among rows
    sharing only ``grouping_cols[0]`` (brand) is used. Values that cannot be
    filled either way stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    grouping_cols : list of str
        Grouping columns, coarsest first (e.g. ``['brand', 'model']``).
    target_cols : list of str
        Columns whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the target columns filled. Index and column
        order of the input are preserved (a merge-based implementation would
        silently replace the index with 0..n-1).
    """
    def calculate_mode(series):
        # Series.mode() ignores missing values; an all-missing group has no mode.
        mode = series.mode()
        return mode.iloc[0] if not mode.empty else np.nan

    filled = df.copy()

    for col in target_cols:
        # Per-row group modes, aligned on the original index. Both are computed
        # before anything is filled so they reflect observed values only.
        brand_model_mode = filled.groupby(grouping_cols)[col].transform(calculate_mode)
        brand_mode = filled.groupby(grouping_cols[0])[col].transform(calculate_mode)

        # Prefer the finer group's mode, then fall back to the coarser one.
        filled[col] = filled[col].fillna(brand_model_mode).fillna(brand_mode)

    return filled
 

In [None]:
cdf = fill_na_with_same_brand_model_mode(cdf, ['brand','model'], ['gears','horsepower','engine_capacity','cylinders'])
cdf_test = fill_na_with_same_brand_model_mode(cdf_test, ['brand','model'], ['gears','horsepower','engine_capacity','cylinders'])

In [None]:
cdf.isnull().sum()

In [None]:
# 10 rows had null in them. All for same model: Bugatti--Veyron 16.4 Grand Sport: 1200 HP checked from google and filled.

cdf.horsepower = cdf.horsepower.fillna(1200)
cdf_test.horsepower = cdf_test.horsepower.fillna(1200)

## **2.8. New Column in place of `Model year`** --

#### A year such as 2002, 2020 or 2023 carries little numerical meaning on its own; it behaves more like a categorical value. So a new column `car_age` (2024 minus the model year) is created to express how old each car is.

In [None]:
cdf['car_age'] = 2024-cdf.model_year
cdf_test['car_age'] = 2024-cdf_test.model_year

In [None]:
cdf.sample(5)

## **2.9. Dropping unwanted columns** --

#### As a result of the above part, EDA-2 is done. We have handled all Null values, processed & transformed columns to modify the existing columns... as well as to create a few new ones. 
* Let's remove the redundant columns now. &
* modify the dataset view.

In [None]:
cdf.drop(columns=['brand','model','model_year','engine','transmission'], inplace=True)
cdf_test.drop(columns=['brand','model','model_year','engine','transmission'], inplace=True)

In [None]:
cdf = cdf[['car_category', 'fuel_type', 'transmission_type', 'int_color', 'ext_color', #cat_cols
           'accident_reported', 'clean_title', #binary_cols
           'car_age', 'mileage', 'gears', 'horsepower', 'engine_capacity', 'cylinders', 'price']] #num_cols

cdf_test = cdf_test[['car_category', 'fuel_type', 'transmission_type', 'int_color', 'ext_color', #cat_cols
                     'accident_reported', 'clean_title', #binary_cols
                     'car_age', 'mileage', 'gears', 'horsepower', 'engine_capacity', 'cylinders']] #num_cols

In [None]:
cdf_test

# **4. Correlation Analysis:~**

In [None]:
cat_cols = cdf.select_dtypes(include=['object']).columns
cdf_encoded = pd.get_dummies(cdf, columns=cat_cols)

corr_data = cdf_encoded.corr()

In [None]:
print("Positive Corr with Price column.\n")
display(corr_data['price'].sort_values(ascending=False).head(10))

print("--------------------------\nBest Corr (+ve & -ve) with Price column.\n")
display(abs(corr_data['price']).sort_values(ascending=False).head(20))

In [None]:
# correlation around color columns
corr_data.loc[['car_age', 'mileage', 'gears', 'horsepower', 'price','car_category_Exotic_car',
       'car_category_Luxury_car', 'car_category_Premium_car',
       'car_category_Standard_car', 'car_category_Super_car'],['int_color_beige', 'int_color_black',
       'int_color_blue', 'int_color_brown', 'int_color_ebony',
       'int_color_gold', 'int_color_gray', 'int_color_green',
       'int_color_orange', 'int_color_red', 'int_color_silver',
       'int_color_uncommon', 'int_color_white', 'int_color_yellow',
       'ext_color_beige', 'ext_color_black', 'ext_color_blue',
       'ext_color_brown', 'ext_color_ebony', 'ext_color_gold',
       'ext_color_gray', 'ext_color_green', 'ext_color_orange',
       'ext_color_purple', 'ext_color_red', 'ext_color_silver',
       'ext_color_uncommon', 'ext_color_white', 'ext_color_yellow']].T

* **Older cars' interior color used to be mostly 'gray' or 'beige'**, *'black' used to be very rare in those time.* The **more recent ones have 'black' as interior color more frequently.**
* **Exterior color tends to be mostly 'silver' or 'gold' in older cars.** 
* **Uncommon colors** that we assigned **are mostly for the recent cars, `MOSTLY the Exotic cars.` These have uncommon Interior & Exterior colors in abundance.**
* **Price column have very low correlation with all these color,** which suggests that in this data, **Price doesn't depend much on Interior or Exterior colors.**
* **Most common internal color of Luxury cars is 'beige'**, and there is no specificity in External colors for these.
* **Uncommon internal color is a rarity for Premium, Super or Standard cars.**
* **Gray is the most common interior color of Standard cars amongst all other colors.**

In [None]:
# correlation around car category column
corr_data.loc[['car_age', 'mileage', 'price'],['car_category_Exotic_car',
       'car_category_Luxury_car', 'car_category_Premium_car',
       'car_category_Standard_car', 'car_category_Super_car']].T

* **Exotic cars have a good correlation with Price,** which is not there for any other types of cars
* **Older cars are mostly the Standard ones**

In [None]:
# correlation around fuel tye column
corr_data.loc[['car_age', 'mileage', 'price','horsepower','cylinders','gears','engine_capacity'],['fuel_type_Diesel', 'fuel_type_E85_Flex_Fuel', 'fuel_type_Gasoline',
       'fuel_type_Hybrid', 'fuel_type_None', 'fuel_type_Plug_In_Hybrid',
       'fuel_type_Unknown']].T

* **Price again don't have much dependancy on what type of fuel is used.**
* **None of these fuel's used to be there in older cars**
* Missing Fuel type specified in the actual data, had higher HorsePower. 
* **Diesel or Flex-fuel type cars tends to have higher CC** in this data.
* **No of gears is generally less for those cars that didn't had any fuel type mentioned**, (could be the automated ones mainly) and the **cars with Gasoline fuel have higher number of gears.**

In [None]:
# correlation around transmission type column
corr_data.loc[['car_age', 'mileage', 'price','horsepower','cylinders','gears','engine_capacity'],['transmission_type_AMT',
       'transmission_type_Automatic', 'transmission_type_CVT',
       'transmission_type_DCT', 'transmission_type_Dual_Shift',
       'transmission_type_Electronically_controlled',
       'transmission_type_Fixed', 'transmission_type_Manual',
       'transmission_type_Other']].T

* **It is evident that the older cars are mainly Manual, and the recent ones are Automatic.** Correlation analysis supports that.
* **Dual shift cars have higher horsepower. For CVT, it's the opposite.**
* **Yeah, Manual cars have comparatively lesser gear.**
* **PRICE COLUMN AGAIN HAVE LESS DEPENDANCY ON Transmission Type.**

In [None]:
# correlation around accident & clean_title columns
corr_data.loc[['car_age', 'mileage', 'gears', 'horsepower', 'engine_capacity',
       'cylinders', 'price'],['accident_reported_No', 'accident_reported_Yes', 'clean_title_No',
       'clean_title_Yes']].T

In [None]:
corr_data[['clean_title_No','clean_title_Yes']].T

* **Older cars have more accidents reported than the recent ones.** 
* **Cars with no accident reported have a higher price than those that have some reports.**
* Clean title does not have any significant correlation with any of the columns OTHER THAN accident being reported, which is expected. 

In [None]:
cdf.head(1)

In [None]:
cdf_cleanned = cdf.drop(columns=['int_color','ext_color','clean_title'])
cdf_test_cleanned = cdf_test.drop(columns=['int_color','ext_color','clean_title'])

# **5. Transforming Data into model-understood form:~**

In [None]:
from sklearn.model_selection import train_test_split

cdf_X = cdf.drop(columns=['price'])
cdf_Y = cdf.loc[:,'price']

cdf_Xtrain, cdf_Xtest, cdf_Ytrain, cdf_Ytest = train_test_split(cdf_X, cdf_Y, test_size=0.25, random_state=25)

In [None]:
print(f"Shape of \n------\nX-train:{cdf_Xtrain.shape}, X-test:{cdf_Xtest.shape}\nY-train:{cdf_Ytrain.shape}, Y-test:{cdf_Ytest.shape}")

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
print(f"Total no. of categories in all cat cols combined: {cdf.iloc[:,:7].nunique().sum()}")
print(f"No. of cat cols: {cdf.iloc[:,:7].nunique().count()}")
print(f"No. of columns created after OHE: {cdf.iloc[:,:7].nunique().sum()-cdf.iloc[:,:7].nunique().count()}")

In [None]:
#One Hot Encoding 

# One-hot encode the first 7 columns (the categorical ones); the numerical columns
# that follow are passed through unchanged and end up after the encoded columns.
# NOTE(review): newer scikit-learn releases spell the `sparse` keyword `sparse_output`;
# switch the keyword if the installed version rejects `sparse`.
encoder = ColumnTransformer([
    ("ohe", OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'),slice(0,7))
],remainder='passthrough')

# --  after encoding, with drop=first, there will be total 47 encoded columns + the 6 numerical cols


#Standard Scaling

# Every column coming out of the encoder is scaled. Applying the scaler to the whole
# array avoids hard-coding the column count (53), which silently depends on how many
# categories the encoder happened to see.
scaler = StandardScaler()

In [None]:
pipe = Pipeline([
    ("one_hot_encoder", encoder),
    ("standard_Scaler", scaler)
])

pipe.fit(cdf_Xtrain)

In [None]:
# Learn the encoding and scaling from the training split only ...
cdf_Xtrain = pipe.fit_transform(cdf_Xtrain)
# ... and apply that same fitted pipeline to the validation split. Re-fitting here
# would leak validation statistics and could yield a different set of columns.
cdf_Xtest = pipe.transform(cdf_Xtest)

print(f"Shape of cdf_Xtrain array: {cdf_Xtrain.shape}")
print(f"Shape of cdf_Xtest array: {cdf_Xtest.shape}")

print(f"Shape of cdf_Ytrain array: {cdf_Ytrain.shape}")
print(f"Shape of cdf_Ytest array: {cdf_Ytest.shape}")

In [None]:
# original test data
# Only transform: the competition test set must be encoded and scaled with the
# already-fitted pipeline so its columns line up with what the models were trained on.
cdf_test = pipe.transform(cdf_test)
print(f"Shape of cdf_test array: {cdf_test.shape}")

### **SUMMARY from above //---**
* Did a **train-test split at a 75-25** proportion. 
* Used **One-Hot-Encoding to encode the categorical columns, dropping the first encoded category of each.**
* Standardized all the columns using **Standard Scaler** to bring them onto a common scale. (Note: the dataset has many outliers, and Standard Scaler is itself sensitive to them; a robust scaler would be the outlier-resistant alternative.)
* These two transformers were combined to create a pipe, which was trained on the train data.
* The training data has **53 columns after encoding.**

# **6. Model Building:~**

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
# from sklearn.neural_network import MLPRegressor

In [None]:
# Fit each regressor with default settings, score it on the hold-out split, and
# write one submission file per model.
default_models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),GradientBoostingRegressor(),
                  AdaBoostRegressor(),XGBRegressor(),LGBMRegressor(),CatBoostRegressor()]
names = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','Adaboost','XGB','Light gbm','Catboost']
model_result = []

for i, (name, model) in enumerate(zip(names, default_models), start=1):
    start=time.time()

    model.fit(cdf_Xtrain,cdf_Ytrain)
    print(f"{name} is ready to predict:~ \n---------------")
    y_pred = model.predict(cdf_Xtest)

    # Time covers fitting plus prediction on the hold-out split.
    tt = time.time()-start
    r2 = round(r2_score(cdf_Ytest,y_pred),2)
    # RMSE as the square root of the MSE; avoids the `squared=False` keyword,
    # which current scikit-learn releases no longer accept.
    rmse = round(float(np.sqrt(mean_squared_error(cdf_Ytest,y_pred))),2)
    mae = round(mean_absolute_error(cdf_Ytest,y_pred),2)

    model_result.append([name,tt,r2,rmse,mae])

    # The score is computed on the hold-out split, not on the data the model was fit on.
    print(f"RMSE on validation data: {rmse}\n")
    print(f"{name} is ready to predict the test data:~ \n---------------")
    sub = model.predict(cdf_test)
    pd.DataFrame({'id':test_idx, 'price':sub}).to_csv(f"My Submission __{i}.csv", index=False)

default_model_report = pd.DataFrame(model_result, columns=['model','time_taken','r2_score','rmse','mae'])


In [None]:
default_model_report

# **7. Model Hyper-parameter Tuning:~**

## **7.1. Voting Technique**

In [None]:
lr = LinearRegression()
gb = GradientBoostingRegressor()
lgbm = LGBMRegressor()

lr.fit(cdf_Xtrain,cdf_Ytrain)
sub_lr = lr.predict(cdf_test)

gb.fit(cdf_Xtrain,cdf_Ytrain)
sub_gb = gb.predict(cdf_test)

lgbm.fit(cdf_Xtrain,cdf_Ytrain)
sub_lgbm = lgbm.predict(cdf_test)

sub = sub_lr*0.2 + sub_gb*0.4 + sub_lgbm*0.4
pd.DataFrame({'id':test_idx, 'price':sub}).to_csv(f"My Submission __LR_GB_LGBM_default.csv", index=False)


# **8. Model Building on Shrank Data:~**

#### This part repeats the same model building and prediction on the data that was reduced after the correlation analysis, i.e. with the few columns that did not have a meaningful correlation with the target (price) column dropped.  

In [None]:
X = cdf_cleanned.drop(columns=['price'])
Y = cdf_cleanned.loc[:,'price']

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25, random_state=25)

In [None]:
print(f"Shape of \n------\nX-train:{Xtrain.shape}, X-test:{Xtest.shape}\nY-train:{Ytrain.shape}, Y-test:{Ytest.shape}")

In [None]:
Xtrain

In [None]:
#One Hot Encoding 

# One-hot encode the first 4 columns (the remaining categorical ones); the numerical
# columns that follow are passed through unchanged after the encoded columns.
# NOTE(review): newer scikit-learn releases spell the `sparse` keyword `sparse_output`;
# switch the keyword if the installed version rejects `sparse`.
encoder = ColumnTransformer([
    ("ohe", OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'),slice(0,4))
],remainder='passthrough')

# --  after encoding, with drop=first, there will be total 19 encoded columns + the 6 numerical cols


#Standard Scaling

# Every column coming out of the encoder is scaled. Applying the scaler to the whole
# array avoids hard-coding the column count (25), which silently depends on how many
# categories the encoder happened to see.
scaler = StandardScaler()

In [None]:
pipe = Pipeline([
    ("one_hot_encoder", encoder),
    ("standard_Scaler", scaler)
])

pipe.fit(Xtrain)

In [None]:
# Learn the encoding and scaling from the training split only ...
Xtrain = pipe.fit_transform(Xtrain)
# ... and apply that same fitted pipeline to the validation split. Re-fitting here
# would leak validation statistics and could yield a different set of columns.
Xtest = pipe.transform(Xtest)

print(f"Shape of cdf_Xtrain array: {Xtrain.shape}")
print(f"Shape of cdf_Xtest array: {Xtest.shape}")

print(f"Shape of cdf_Ytrain array: {Ytrain.shape}")
print(f"Shape of cdf_Ytest array: {Ytest.shape}")

In [None]:
# original test data
# Only transform: the competition test set must be encoded and scaled with the
# already-fitted pipeline so its columns line up with what the models were trained on.
cdf_test_cleanned = pipe.transform(cdf_test_cleanned)
print(f"Shape of cdf_test_cleanned array: {cdf_test_cleanned.shape}")

In [None]:
# Same comparison as for the full feature set, on the reduced data: fit each
# regressor with default settings, score it on the hold-out split, and write one
# submission file per model.
default_models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),GradientBoostingRegressor(),
                  AdaBoostRegressor(),XGBRegressor(),LGBMRegressor(),CatBoostRegressor()]
names = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','Adaboost','XGB','Light gbm','Catboost']
model_result = []

for i, (name, model) in enumerate(zip(names, default_models), start=1):
    start=time.time()

    model.fit(Xtrain,Ytrain)
    print(f"{name} is ready to predict:~ \n---------------")
    y_pred = model.predict(Xtest)

    # Time covers fitting plus prediction on the hold-out split.
    tt = time.time()-start
    r2 = round(r2_score(Ytest,y_pred),2)
    # RMSE as the square root of the MSE; avoids the `squared=False` keyword,
    # which current scikit-learn releases no longer accept.
    rmse = round(float(np.sqrt(mean_squared_error(Ytest,y_pred))),2)
    mae = round(mean_absolute_error(Ytest,y_pred),2)

    model_result.append([name,tt,r2,rmse,mae])

    # The score is computed on the hold-out split, not on the data the model was fit on.
    print(f"RMSE on validation data: {rmse}\n")
    print(f"{name} is ready to predict the test data:~ \n---------------")
    sub = model.predict(cdf_test_cleanned)
    pd.DataFrame({'id':test_idx, 'price':sub}).to_csv(f"My Submission __{i}_shrinked_data_.csv", index=False)

shrinked_data_model_report = pd.DataFrame(model_result, columns=['model','time_taken','r2_score','rmse','mae'])


In [None]:
shrinked_data_model_report #color cols & clean title removed

In [None]:
lr = LinearRegression()
gb = GradientBoostingRegressor()
lgbm = LGBMRegressor()

lr.fit(Xtrain,Ytrain)
sub_lr = lr.predict(cdf_test_cleanned)

gb.fit(Xtrain,Ytrain)
sub_gb = gb.predict(cdf_test_cleanned)

lgbm.fit(Xtrain,Ytrain)
sub_lgbm = lgbm.predict(cdf_test_cleanned)

sub = sub_lr*0.2 + sub_gb*0.4 + sub_lgbm*0.4
pd.DataFrame({'id':test_idx, 'price':sub}).to_csv(f"My Submission __LR_GB_LGBM_shrank.csv", index=False)


In [None]:
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
xgb = XGBRegressor()
cat = CatBoostRegressor()

rf.fit(Xtrain,Ytrain)
sub_rf = rf.predict(cdf_test_cleanned)

gb.fit(Xtrain,Ytrain)
sub_gb = gb.predict(cdf_test_cleanned)

xgb.fit(Xtrain,Ytrain)
sub_xgb = xgb.predict(cdf_test_cleanned)

cat.fit(Xtrain,Ytrain)
sub_cat = cat.predict(cdf_test_cleanned)

sub = sub_rf*0.1 + sub_gb*0.4 + sub_xgb*0.2 + sub_cat*0.3
pd.DataFrame({'id':test_idx, 'price':sub}).to_csv(f"My Submission __RF_GB_XGB_CAT_shrank.csv", index=False)