In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle as pkl
import json

In [3]:
# Forward slashes are accepted on Windows, macOS and Linux alike; the
# previous backslash-separated literal only resolved on Windows.
data = pd.read_csv("dataset/cars_dataset.csv")
# Preview the first rows instead of rendering the whole frame.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset\\cars_dataset.csv'

In [None]:
data.info()

In [None]:
# Remove the "Unnamed: 0" column. Reassigning (instead of inplace=True) with
# errors="ignore" keeps the cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
data.info()

In [None]:
data.describe()


In [None]:
%matplotlib inline
data.hist(figsize = (10,10),bins = 50)

In [None]:
# Ask for plain (non-scientific, no offset) tick labels on the x axis of the
# current pyplot axes.
plt.ticklabel_format(useOffset=False,style = "plain",axis = "x")
#list_x = np.arange(0,300000,100000)
#plt.xticks(list_x)
# 100-bin histogram of the "mileage" column.
data["mileage"].plot(kind = "hist",bins = 100,figsize = (10,5))
plt.show()

# Preparing training and testing datasets

In [None]:
train_set,test_set = train_test_split(data, test_size = 0.2,random_state = 42)

In [None]:
# Bucket mileage into 20k-wide bands (open-ended above 100k) and store the
# interval labels as strings.
# include_lowest=True closes the first band on the left so that a mileage of
# exactly 0 is bucketed; without it such rows fall outside every interval and
# .astype(str) turns the resulting missing value into the label "nan".
data["mileage_driven"] = pd.cut(
    data["mileage"],
    bins=[0, 20000, 40000, 60000, 80000, 100000, np.inf],
    include_lowest=True,
).astype(str)


In [None]:
plt.figure(figsize = (10,5))

data["mileage_driven"].hist()

In [None]:
data["mileage_driven"].value_counts()

In [None]:
"""
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train_set, train_set["mileage_driven"]):
strat_train_set = train_set.loc[train_index]
strat_test_set = train_set.loc[test_index]
"""

In [None]:
#train_set["mileage_driven"].value_counts() / len(train_set)

In [None]:
#test_set["mileage_driven"].value_counts() / len(test_set)

In [None]:
# Remove the temporary "mileage_driven" helper column again. Reassigning with
# errors="ignore" makes the cell idempotent: re-running it does not raise
# KeyError once the column has already been dropped.
data = data.drop(columns=["mileage_driven"], errors="ignore")

In [None]:
data

# Visualization

In [None]:
cars = train_set.copy()

In [None]:
cars

In [None]:
# Correlate numeric columns only. The frame also carries text columns (for
# example "brand"), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them as older releases did.
cor = cars.select_dtypes(include="number").corr()
# Correlation of every numeric column with the target, weakest first.
cor["price"].sort_values()

In [None]:
attr = ["mileage","year","price"]
scatter_matrix(cars[attr],figsize = (12,10))

In [None]:
cars.plot(kind = "scatter",x = "mileage",y = "price",alpha = 0.1)

In [None]:
cars.plot(kind = "scatter",x = "year",y = "price",alpha = 0.1)

In [None]:
plt.figure(figsize = (20,10))
sns.countplot(data = cars,x = "brand")


In [None]:
cars["brand"].value_counts()

In [None]:
plt.figure(figsize = (10,8))
sns.countplot(data = cars,x = "title_status")

In [None]:
# Mean listing price per state, next to the number of listings per state.
avg_price_by_state = cars.groupby("state")["price"].mean()
avg_num_cars_by_state = cars["state"].value_counts()
cols = dict(
    avg_price=avg_price_by_state,
    avg_num_cars=avg_num_cars_by_state,
)
pd.DataFrame(cols)

# Preprocessing the data

In [None]:
# Count how many training rows have a positive price versus the rest.
price_is_positive = train_set["price"] > 0
temp = pd.Series(np.where(price_is_positive, ">0 Price", "==0 Price"))
prct = temp.value_counts()
prct

In [None]:
class null_price_drop(BaseEstimator,TransformerMixin):
    """Transformer that removes every row whose ``price`` equals 0.

    Parameters
    ----------
    data : object, default=None
        Not used by ``transform``; accepted so existing constructor calls
        keep working.
    """

    def __init__(self,data = None):
        # scikit-learn's get_params()/clone()/repr read an attribute named
        # exactly like each __init__ parameter, so ``data`` has to be stored
        # as ``self.data``; storing it only under another name makes those
        # calls fail with AttributeError.
        self.data = data
        # Legacy alias: the value used to be exposed only as ``column``.
        self.column = data

    def fit(self,X,y = None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self,X,y = None):
        """Return a new DataFrame with the rows of ``X`` where ``price != 0``.

        ``X`` itself is left unmodified. Rows whose price is NaN are kept,
        because ``NaN != 0`` evaluates to True.
        """
        # Filter first, then copy: only the surviving rows are duplicated.
        return X[X["price"] != 0].copy()

    def fit_transform(self,X,y = None):
        """Run ``fit(X, y)`` followed by ``transform(X, y)``."""
        return self.fit(X,y).transform(X,y)

In [None]:
null_pipe = null_price_drop()
train_set = null_pipe.transform(train_set)
#train_set

In [None]:
#testing null_price_drop transformer
labels = np.where(train_set["price"] > 0, ">0 Price", "==0 Price")
temp = pd.Series(labels)
prct = temp.value_counts()
prct

In [None]:
class group_brand(BaseEstimator,TransformerMixin):
    """Replace infrequent ``brand`` values with the single label ``"other"``.

    The set of frequent brands is learned in ``fit`` and reused by every later
    ``transform`` call, so new data (e.g. the test set) is grouped by the
    counts of the data the transformer was fitted on, not by its own counts.

    Parameters
    ----------
    columns : object, default=None
        Not used; accepted for backward compatibility.
    min_count : int, default=10
        Brands occurring fewer than ``min_count`` times become "other".

    Attributes
    ----------
    frequent_brands_ : set
        Brands that occurred at least ``min_count`` times in the data passed
        to ``fit``.
    """

    def __init__(self,columns = None,min_count = 10):
        self.columns = columns
        self.min_count = min_count

    def _frequent_brands(self,X):
        """Return the set of brands occurring at least ``min_count`` times in ``X``."""
        counts = X["brand"].value_counts()
        return set(counts[counts >= self.min_count].index)

    def fit(self,X,y = None):
        """Record which brands of ``X`` are frequent enough to keep their label."""
        self.frequent_brands_ = self._frequent_brands(X)
        return self

    def transform(self,X,y = None):
        """Return a copy of ``X`` whose rare brands are relabelled "other".

        When called on an instance that was never fitted, the brand counts are
        taken from ``X`` itself, which is what this method always did before
        ``fit`` started learning them. Missing brand values are left as they
        are.
        """
        keep = getattr(self, "frequent_brands_", None)
        if keep is None:
            # Unfitted use: fall back to the counts of the frame at hand.
            keep = self._frequent_brands(X)
        df = X.copy()
        # Keep frequent brands and missing values; everything else is "other".
        keep_mask = df["brand"].isin(keep) | df["brand"].isna()
        df["brand"] = df["brand"].where(keep_mask, "other")
        return df

    def fit_transform(self,X,y = None):
        """Run ``fit(X, y)`` followed by ``transform(X, y)``."""
        return self.fit(X,y).transform(X,y)

In [None]:
# testing group_brand transformer
"""
grouping = group_brand()
cars_in_grouped = grouping.transform(cars_in)
cars_in_grouped["brand"].unique()
"""

In [None]:
#len(cars_in_grouped["brand"].unique())

In [None]:
# Row-level cleaning: drop zero-price rows, then group the brand labels.
grouping_and_null = Pipeline(steps=[
    ("dropping_null", null_price_drop()),
    ("grouping_brand", group_brand()),
])

train_set = grouping_and_null.fit_transform(train_set)

In [None]:
train_set

### Splitting X and Y values

In [None]:
# Features: every column of the training frame except the target.
cars_in = train_set.drop("price",axis = 1)
# Target: the double brackets select a one-column DataFrame, not a Series.
cars_out = train_set[["price"]]

In [None]:
# Using the fit method to specify the columns to be dropped in the Custom Transformer
"""
class column_drop_transformer(BaseEstimator,TransformerMixin):
    def __init__(self,columns = None):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        self.columns = X
        return self 
        
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    
column_drop_pipeline = column_drop_transformer()
a = column_drop_pipeline.fit(["vin","lot","condition"])
cars_in_drop = column_drop_pipeline.transform(cars_in)
cars_in_drop
"""

In [None]:
class features_transformer(BaseEstimator,TransformerMixin):
    """Column selector: keeps only the columns named at construction time."""

    def __init__(self,columns):
        self.columns=columns

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """
        Return a DataFrame made of the columns of X that appear in self.columns,
        in the order they have in X.

        Author's note: the unlisted columns are removed here because, with this
        transformer returning a DataFrame, ColumnTransformer "remainder = passthrough"
        would duplicate the untouched (remaining) columns.
        """
        keep_mask = [col in self.columns for col in X.columns]
        selected = X.loc[:, keep_mask]
        return pd.DataFrame(selected)

    def fit_transform(self,X,y=None):
        """Fit (a no-op) and then transform X."""
        fitted = self.fit(X,y)
        return fitted.transform(X)

In [None]:
cars_in

In [None]:
#testing features_transformer
#features_pipeline = features_transformer(["state","model","color","vin","lot","country","condition"])
#cars_in_drop = features_pipeline.transform(cars_in_grouped)
#cars_in_drop

In [None]:
# Count training rows with zero mileage versus the rest.
mileage_is_zero = train_set["mileage"] == 0
temp = pd.Series(np.where(mileage_is_zero, "== 0 Mileage", ">0 Mileage"))
prct = temp.value_counts()
prct

In [None]:
prct.plot(kind = "barh")

In [None]:
# Numeric columns: fill missing values with the median, then standardise.
numerical_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])


In [None]:
one_hot = ["brand"]
ordinal = ["title_status"]

# OneHotEncoder's dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Try the current spelling first and fall back for older installations, so
# the cell runs on either side of the rename.
try:
    brand_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    brand_encoder = OneHotEncoder(sparse=False)

# "brand" is one-hot encoded, "title_status" is mapped to ordinal codes.
categorical_pipe = ColumnTransformer([
    ("one_hot", brand_encoder, one_hot),
    ("ordinal_enc", OrdinalEncoder(), ordinal),
])



In [None]:
#["brand","year","title_status","mileage"]
#["state","model","color","vin","lot","country","condition"]
# One-step pipeline that keeps only the four modelling features.
cust_pipeline = Pipeline(steps=[
    (
        "column_drop",
        features_transformer(["brand", "year", "title_status", "mileage"]),
    ),
])

In [None]:
#cars_in.columns.values.tolist()
cat = ["title_status","brand"]
num =  ["year","mileage"]

# Scale the numeric features, encode the categorical ones, and discard every
# other column of the input frame.
final_pipeline = ColumnTransformer(
    transformers=[
        ("num", numerical_pipe, num),
        ("cat", categorical_pipe, cat),
    ],
    remainder="drop",
)



In [None]:
#cars_in_drop = cust_pipeline.transform(cars_in)
cars_prepared = final_pipeline.fit_transform(cars_in)
cars_prepared

In [None]:
cars_prepared[0]

In [None]:
cars_prepared.shape

In [None]:
cars_out.shape

# Training a Model

In [None]:
def display_scores(rmse_result):
    """Print the given scores followed by their mean and standard deviation.

    ``rmse_result`` must provide ``mean()`` and ``std()`` methods (for example
    a NumPy array). Nothing is returned.
    """
    print("Scores : ",rmse_result)
    # Each statistic is computed right before its line is printed.
    for label, stat_name in (("Mean : ", "mean"), ("SD : ", "std")):
        print(label, getattr(rmse_result, stat_name)())

In [None]:
reg_model = LinearRegression()
reg_model.fit(cars_prepared,cars_out)

In [None]:
# First five training rows: predictions (left column) next to true prices (right).
rand_data = cars_prepared[:5]
rand_data_out = cars_out[:5]
rand_pred = reg_model.predict(rand_data)
predicted_col = rand_pred.reshape(-1, 1)
actual_col = rand_data_out.values.reshape(-1, 1)
np.hstack([predicted_col, actual_col])

In [None]:
# Training-set RMSE of the linear model.
cars_predicted_lin = reg_model.predict(cars_prepared)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the call now matches the documented API.
mse_lin = mean_squared_error(cars_out, cars_predicted_lin)
rmse_lin = np.sqrt(mse_lin)
rmse_lin

In [None]:
# A fixed seed makes the fitted tree, and every score derived from it,
# identical across kernel restarts (42 matches the seed used for the split).
tree_model = DecisionTreeRegressor(random_state = 42)
tree_model.fit(cars_prepared,cars_out)


In [None]:
# Training-set RMSE of the decision tree.
cars_predicted_tree = tree_model.predict(cars_prepared)
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same as before because MSE is symmetric.
mse_tree = mean_squared_error(cars_out, cars_predicted_tree)
rmse_tree = np.sqrt(mse_tree)
rmse_tree

In [None]:
# random_state pins the forest's random sampling so results are reproducible.
random_forest = RandomForestRegressor(n_estimators = 100, random_state = 42)
# cars_out is a one-column DataFrame; the forest expects a 1-D target and
# emits DataConversionWarning for a column vector, so flatten it here.
random_forest.fit(cars_prepared, cars_out.values.ravel())

In [None]:
# Training-set RMSE of the random forest.
cars_predicted_forest = random_forest.predict(cars_prepared)
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same as before because MSE is symmetric.
mse_rand_forest = mean_squared_error(cars_out, cars_predicted_forest)
rmse_forest = np.sqrt(mse_rand_forest)
rmse_forest

In [None]:
# 10-fold cross-validated RMSE of the linear model. The scorer returns
# negated MSE, hence the sign flip before the square root.
lin_scores = cross_val_score(
    reg_model,
    cars_prepared,
    cars_out,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_lin_scores = np.sqrt(-lin_scores)
display_scores(rmse_lin_scores)

In [None]:
# 10-fold cross-validated RMSE of the decision tree (scores are negated MSE).
tree_scores = cross_val_score(
    tree_model,
    cars_prepared,
    cars_out,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_tree_scores = np.sqrt(-tree_scores)
display_scores(rmse_tree_scores)

In [None]:
# 10-fold cross-validated RMSE of the random forest (scores are negated MSE).
# The target is flattened to 1-D: cars_out is a one-column DataFrame, and a
# column-vector y makes the forest emit DataConversionWarning on every fold.
forest_scores = cross_val_score(random_forest,
                              X = cars_prepared,
                              y = cars_out.values.ravel(),
                              cv = 10,
                              scoring ="neg_mean_squared_error"
                             )
rmse_forest_scores = np.sqrt(-forest_scores)
display_scores(rmse_forest_scores)

# Fine tuning the best model

In [None]:
# Two grids over the same tree counts and max_features values: the first uses
# the estimator's own bootstrap setting, the second switches bootstrapping off.
forest_param = [
    {"n_estimators":[60,90,120,150,200],"max_features":[1,2,3]},
    {"n_estimators":[60,90,120,150,200],"bootstrap":[False],"max_features":[1,2,3]}
]

forest_grid = GridSearchCV(random_forest,
                           param_grid = forest_param,
                           scoring = "neg_mean_squared_error",
                          cv = 10)

# Flatten the one-column target to 1-D; passing the column vector makes every
# forest fit in the search emit DataConversionWarning.
forest_grid.fit(cars_prepared, cars_out.values.ravel())

In [None]:
forest_grid.best_params_ , np.sqrt(-forest_grid.best_score_)

In [None]:
# RMSE of every parameter combination tried by the grid search.
scores = forest_grid.cv_results_
for params, score in zip(scores["params"], scores["mean_test_score"]):
    print(np.sqrt(-score), params)

# Evaluation on test data

In [None]:
test_set = grouping_and_null.transform(test_set)

In [None]:
test_set

In [None]:
x_test = test_set.drop("price",axis = 1)
y_test = test_set["price"]

In [None]:
#x_test_drop = cust_pipeline.transform(x_test)
x_test_developed = final_pipeline.transform(x_test)

In [None]:
x_test_developed, x_test_developed.shape 

In [None]:
# NOTE(review): this scores the linear model, not forest_grid.best_estimator_
# — confirm that is the model meant to be evaluated on the test set.
pred_x_test = reg_model.predict(x_test_developed)
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same as before because MSE is symmetric.
mse_x_test = mean_squared_error(y_test, pred_x_test)


In [None]:
rmse_x_test = np.sqrt(mse_x_test)
rmse_x_test

# Creating pickle file and columns file

In [None]:
# Serialise the fitted linear model to the file "lin_model". The context
# manager closes the file even if pickling fails; the previous bare open()
# never closed the handle explicitly.
with open("lin_model","wb") as model_file:
    pkl.dump(reg_model, model_file)

In [None]:
# Pull the fitted encoders out of the nested "cat" ColumnTransformer.
fitted_cat = final_pipeline.named_transformers_["cat"]
get_brand_cols = fitted_cat.named_transformers_["one_hot"].get_feature_names_out()
get_status_cols = fitted_cat.named_transformers_["ordinal_enc"].categories_
get_brand_cols,get_status_cols


In [None]:
# Wrap the encoder outputs in plain lists so they can be written out as JSON.
brand_columns = {"columns": list(get_brand_cols)}
status_columns = {"columns": list(get_status_cols[0])}
brand_columns["columns"],status_columns["columns"]


In [None]:

# Write each column list to its own JSON file.
for path, payload in (
    ("brand_columns.json", brand_columns),
    ("status_columns.json", status_columns),
):
    with open(path,"w") as file:
        file.write(json.dumps(payload["columns"]))


In [None]:
y_test.iloc[0]

In [None]:
# One hand-written listing, pushed through the fitted preprocessing.
sample_listing = {
    "brand": "nissan",
    "model": "doors",
    "year": 2015,
    "title_status": "clean vehicle",
    "mileage": 37953,
    "color": "super black",
    "vin": "1n4az0cp7fc314468",
    "state": "new york",
    "country": "usa",
    "condition": "11 days left",
}
l = pd.DataFrame([sample_listing])
user = final_pipeline.transform(l)
user

In [None]:
reg_model.predict(user)