# Importing Libraries

In [201]:
import pandas as pd
import numpy as np
import plotly.express as px

# Reading Data

In [202]:
df = pd.read_csv('Cars.csv')

# Exploring Data

In [203]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Brand,Model,Body,Color,Year,Fuel,Kilometers,Engine,Transmission,Price,Gov
13955,13428,Fiat,Shahin,Sedan,Burgundy,2011,Benzine,100000 to 119999,1400 - 1500 CC,Manual,61.0,Sharqia
7381,2501,Chevrolet,Optra,Sedan,Gray,2022,Benzine,10000 to 19999,1400 - 1500 CC,Automatic,265.0,Giza
12720,12193,Fiat,128,Sedan,Red,1993,Benzine,100000 to 119999,1000 - 1300 CC,Manual,32.2,Ismailia
5328,16695,Hyundai,Verna,Sedan,Black,2010,Benzine,More than 200000,1600 CC,Manual,109.2,Cairo
14087,13560,Fiat,Shahin,Sedan,Other Color,2001,Benzine,0 to 9999,1400 - 1500 CC,Manual,40.2,Cairo


In [204]:
df.columns

Index(['Unnamed: 0', 'Brand', 'Model', 'Body', 'Color', 'Year', 'Fuel',
       'Kilometers', 'Engine', 'Transmission', 'Price', 'Gov'],
      dtype='object')

In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14741 entries, 0 to 14740
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    14741 non-null  int64  
 1   Brand         14741 non-null  object 
 2   Model         14741 non-null  object 
 3   Body          14741 non-null  object 
 4   Color         14741 non-null  object 
 5   Year          14741 non-null  int64  
 6   Fuel          14741 non-null  object 
 7   Kilometers    14741 non-null  object 
 8   Engine        14741 non-null  object 
 9   Transmission  14741 non-null  object 
 10  Price         14741 non-null  float64
 11  Gov           14741 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 1.3+ MB


In [206]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Price
count,14741.0,14741.0,14741.0
mean,8934.846754,2005.456821,116.584987
std,4922.065495,12.655566,82.192718
min,812.0,1970.0,3.0
25%,4497.0,1998.0,43.7
50%,8182.0,2010.0,110.0
75%,13373.0,2015.0,161.0
max,17058.0,2022.0,471.5


# Data Cleaning

In [207]:
df_clean = df.copy()

In [208]:
df_clean.columns = df_clean.columns.str.lower()
df_clean.columns

Index(['unnamed: 0', 'brand', 'model', 'body', 'color', 'year', 'fuel',
       'kilometers', 'engine', 'transmission', 'price', 'gov'],
      dtype='object')

In [209]:
df_clean.drop(['unnamed: 0',], axis =1, inplace = True)

In [210]:
#Check Missing Values
df_clean.isna().sum()

brand           0
model           0
body            0
color           0
year            0
fuel            0
kilometers      0
engine          0
transmission    0
price           0
gov             0
dtype: int64

In [211]:
#Check Duplicated
df_clean.duplicated().sum()

1

In [212]:
df_clean.drop_duplicates(inplace=True)

In [213]:
df_clean.duplicated().sum()

0

In [214]:
df_clean.head()

Unnamed: 0,brand,model,body,color,year,fuel,kilometers,engine,transmission,price,gov
0,Hyundai,Accent,Sedan,Black,2007,Benzine,140000 to 159999,1600 CC,Automatic,140.0,Giza
1,Hyundai,Accent,Sedan,Silver,2005,Benzine,180000 to 199999,1000 - 1300 CC,Manual,78.0,Qena
2,Hyundai,Accent,Sedan,Gray,1999,Benzine,140000 to 159999,1400 - 1500 CC,Manual,70.0,Giza
3,Hyundai,Accent,Sedan,Blue- Navy Blue,2009,Benzine,140000 to 159999,1600 CC,Automatic,150.0,Cairo
4,Hyundai,Accent,Sedan,Silver,2000,Benzine,10000 to 19999,1000 - 1300 CC,Manual,75.0,Giza


In [215]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14740 entries, 0 to 14740
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         14740 non-null  object 
 1   model         14740 non-null  object 
 2   body          14740 non-null  object 
 3   color         14740 non-null  object 
 4   year          14740 non-null  int64  
 5   fuel          14740 non-null  object 
 6   kilometers    14740 non-null  object 
 7   engine        14740 non-null  object 
 8   transmission  14740 non-null  object 
 9   price         14740 non-null  float64
 10  gov           14740 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.3+ MB


In [216]:
 # Function to Multiply each Value by 1000
df_clean.price = df_clean.price.apply(lambda x : x * 1000)          
df_clean

Unnamed: 0,brand,model,body,color,year,fuel,kilometers,engine,transmission,price,gov
0,Hyundai,Accent,Sedan,Black,2007,Benzine,140000 to 159999,1600 CC,Automatic,140000.0,Giza
1,Hyundai,Accent,Sedan,Silver,2005,Benzine,180000 to 199999,1000 - 1300 CC,Manual,78000.0,Qena
2,Hyundai,Accent,Sedan,Gray,1999,Benzine,140000 to 159999,1400 - 1500 CC,Manual,70000.0,Giza
3,Hyundai,Accent,Sedan,Blue- Navy Blue,2009,Benzine,140000 to 159999,1600 CC,Automatic,150000.0,Cairo
4,Hyundai,Accent,Sedan,Silver,2000,Benzine,10000 to 19999,1000 - 1300 CC,Manual,75000.0,Giza
...,...,...,...,...,...,...,...,...,...,...,...
14736,Fiat,Uno,Hatchback,Burgundy,1997,Benzine,160000 to 179999,1000 - 1300 CC,Manual,46000.0,Cairo
14737,Fiat,Uno,Hatchback,Silver,1996,Benzine,20000 to 29999,1000 - 1300 CC,Manual,46000.0,Alexandria
14738,Fiat,Uno,Hatchback,Burgundy,1993,Benzine,10000 to 19999,1000 - 1300 CC,Manual,43700.0,Giza
14739,Fiat,Uno,Hatchback,Silver,1996,Benzine,10000 to 19999,1000 - 1300 CC,Manual,69000.0,Cairo


In [217]:
df_clean.kilometers = df_clean.kilometers.apply(lambda x : x.split()[-1])
df_clean.kilometers

0        159999
1        199999
2        159999
3        159999
4         19999
          ...  
14736    179999
14737     29999
14738     19999
14739     19999
14740     39999
Name: kilometers, Length: 14740, dtype: object

# Data Analysis

In [218]:
df_EDA =df_clean.copy()
#save in a csv file
df_EDA.to_csv('Cars_EDA.csv', index=False)

In [219]:
df_clean.head()

Unnamed: 0,brand,model,body,color,year,fuel,kilometers,engine,transmission,price,gov
0,Hyundai,Accent,Sedan,Black,2007,Benzine,159999,1600 CC,Automatic,140000.0,Giza
1,Hyundai,Accent,Sedan,Silver,2005,Benzine,199999,1000 - 1300 CC,Manual,78000.0,Qena
2,Hyundai,Accent,Sedan,Gray,1999,Benzine,159999,1400 - 1500 CC,Manual,70000.0,Giza
3,Hyundai,Accent,Sedan,Blue- Navy Blue,2009,Benzine,159999,1600 CC,Automatic,150000.0,Cairo
4,Hyundai,Accent,Sedan,Silver,2000,Benzine,19999,1000 - 1300 CC,Manual,75000.0,Giza


In [220]:
# What are the top 3 most expensive brands?
df_EDA.groupby('brand')['price'].max().sort_values(ascending=False).head(5).reset_index()
# The top 3 most expensive brands is(1"Hyundai",2"Fiat",3 "Chevrolet")


Unnamed: 0,brand,price
0,Fiat,471500.0
1,Chevrolet,460000.0
2,Hyundai,408200.0


In [221]:
# What are the top 5 most expensive model?
df_EDA.groupby('model')['price'].max().sort_values(ascending=False).reset_index().nlargest(5, 'price')
#The top 5 most expensive model is ("Tipo", "Elantra", "Cruze", "Avante","Accent")

Unnamed: 0,model,price
0,Tipo,471500.0
1,Cruze,460000.0
2,Elantra,408200.0
3,Verna,396800.0
4,Accent,385200.0


In [222]:
# What is the most common fuel type?
df_EDA.groupby('fuel')['brand'].count().sort_values(ascending=False).reset_index()
# The most common fuel type is ("Benzine")

Unnamed: 0,fuel,brand
0,Benzine,14199
1,Natural Gas,541


In [223]:
# What is the most common transmission type?
df_EDA.groupby('transmission')['brand'].count().sort_values(ascending=False).reset_index()
#The most common transmission type is ("Manual")

Unnamed: 0,transmission,brand
0,Manual,9861
1,Automatic,4879


In [224]:
# What is the most common body type?
df_EDA.groupby('body')['brand'].count().sort_values(ascending=False).reset_index()
#The most common body type ("Sedan")

Unnamed: 0,body,brand
0,Sedan,13452
1,Hatchback,1106
2,SUV,182


In [225]:
# What is the average price of each body type?
df_EDA.groupby('body')['price'].mean().reset_index()
#Average price of each body type of fuel relative to SUV(175298.35) and Sedan(116196.46) and Hatchback(111736.98)

Unnamed: 0,body,price
0,Hatchback,111736.980108
1,SUV,175298.351648
2,Sedan,116196.468927


In [226]:
#What is the average price of each fuel type?
df_EDA.groupby('fuel')['price'].mean().reset_index()
#Average price of each type of fuel relative to Benzine is(116500.81) and Natural Gas(118974.49)

Unnamed: 0,fuel,price
0,Benzine,116500.816959
1,Natural Gas,118974.491682


In [227]:
#What are the top 5 gov ?
df_EDA.gov.value_counts().reset_index().nlargest(5, 'count')
#The top 5 gov is (Cairo , Giza, Alexandria, Sharqia, Qalyubia)	

Unnamed: 0,gov,count
0,Cairo,4458
1,Giza,2411
2,Alexandria,1636
3,Sharqia,851
4,Qalyubia,806


In [228]:
#What is top 3 common car color?
df_EDA.color.value_counts().reset_index().nlargest(3, 'count')
#Top 3 common car color is(White, Black, Silver)

Unnamed: 0,color,count
0,White,2614
1,Black,2032
2,Silver,1952


In [229]:
# What is less common car model?
df_EDA.model.value_counts(ascending=True).reset_index().head(1)
#The less common car model is Excel.

Unnamed: 0,model,count
0,Excel,90


In [230]:
# what is the most year has been produced cars in?
df_EDA.groupby('year')['model'].count().reset_index().nlargest(3 , 'model')
#The most year has been produced cars in is "2013".

Unnamed: 0,year,model
43,2013,850
40,2010,763
41,2011,728


# Data Visualisation

In [231]:
#The code generates a histogram to visualize the distribution of different values in the 'Brand' column.
px.histogram(df , x = 'brand')

In [232]:
#The code generates a histogram to visualize the distribution of different values in the 'Model' column.
px.histogram(df , x = 'model')

In [233]:
#The code generates a histogram to visualize the distribution of different fuel types in the 'Fuel' column.
px.histogram(df , x = 'fuel')

In [234]:
#The code generates a histogram to visualize the distribution of transmission types in the 'Transmission' column.
px.histogram(df , x = 'transmission')


In [272]:
#The code generates a histogram to visualize the distribution of car prices in the 'Price' column.
px.histogram(df , x = 'price')

In [274]:
#The code generates a box plot to display the distribution, median, and potential outliers in the 'Price' column.
px.box(df , x = 'price')

In [237]:
# Subplots
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(px.histogram(df, x = 'price').data[0], row=1, col=1)
fig.add_trace(px.box(df, x = 'price').data[0], row=1, col=2)
fig.show()

In [275]:
#The code generates a histogram to visualize the distribution of engine sizes or types in the 'Engine' column.
px.histogram(df , x = 'engine')

In [276]:
#The code calculates and visualizes the average car price per brand using a bar chart, with colors representing different brands.
brand_price = df_clean.groupby('brand')['price'].mean().sort_values(ascending=False)
fig = px.bar(brand_price , color = brand_price.index )
fig.show()

In [240]:
#The code computes and visualizes the average car price for each model using a bar chart.
model_price = df_clean.groupby('model')['price'].mean().sort_values(ascending=False)
fig = px.bar(model_price , color = model_price.index )
fig.show()

In [241]:
#The code computes the average car price by fuel type, sorts it, and visualizes the results in a colored bar chart.
fuel_price = df_clean.groupby('fuel')['price'].mean().sort_values(ascending=False)
fig = px.bar(fuel_price , color = fuel_price.index )
fig.show()

In [242]:
#The code groups the data by transmission type, counts the number of prices for each type, sorts the counts, and visualizes the result with a bar chart, where each bar's color corresponds to the transmission type.
Transmission_price= df_EDA.groupby('transmission')['price'].count().sort_values(ascending=False)
fig = px.bar(Transmission_price , color = Transmission_price.index )
fig.show()

In [243]:
df.columns = df.columns.str.lower()
df.kilometers = df.kilometers.apply(lambda x : x.split()[-1])
df.kilometers

0        159999
1        199999
2        159999
3        159999
4         19999
          ...  
14736    179999
14737     29999
14738     19999
14739     19999
14740     39999
Name: kilometers, Length: 14741, dtype: object

In [244]:
df.kilometers = df["kilometers"].astype(int)
df.kilometers

0        159999
1        199999
2        159999
3        159999
4         19999
          ...  
14736    179999
14737     29999
14738     19999
14739     19999
14740     39999
Name: kilometers, Length: 14741, dtype: int32

In [245]:
#The code randomly selects 5 rows from the 'model' column of the DataFrame, returning a DataFrame with consistent results due to the fixed random seed
df[['model']].sample(5, random_state=42)

Unnamed: 0,model
9512,Optra
8014,Cruze
8596,Lanos
1714,Matrix
11210,Punto


In [246]:
#The code applies one-hot encoding to the 'model' column of a DataFrame and converts the result into a DataFrame.
#The .sample(5, random_state=42) function is used to get 5 random rows from the one-hot encoded DataFrame for review.

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

pd.DataFrame(ohe.fit_transform(df[['model']]), columns=ohe.get_feature_names_out()).sample(5, random_state=42) 

Unnamed: 0,model_128,model_131,model_Accent,model_Avante,model_Aveo,model_Cruze,model_Elantra,model_Excel,model_I10,model_Lanos,model_Matrix,model_Optra,model_Punto,model_Shahin,model_Tipo,model_Tucson,model_Uno,model_Verna
9512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8014,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [247]:
#The code performs binary encoding on the 'model' column of the DataFrame to convert categorical values into binary format.
from category_encoders import BinaryEncoder

b = BinaryEncoder()

b.fit_transform(df[['model']])


Unnamed: 0,model_0,model_1,model_2,model_3,model_4
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
14736,1,0,0,1,0
14737,1,0,0,1,0
14738,1,0,0,1,0
14739,1,0,0,1,0


In [248]:
#The code scales the 'kilometers' column of the DataFrame to a range between 0 and 1 using MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit_transform(df[['kilometers']])

array([[0.78946953],
       [0.99999474],
       [0.78946953],
       ...,
       [0.0526313 ],
       [0.0526313 ],
       [0.15789391]])

In [249]:
from sklearn.model_selection import train_test_split
# Data Splitting into features and target
X = df.drop('price', axis=1)
y = df.price
# Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [250]:
#The code sets up and applies preprocessing pipelines for numerical and categorical features in the training data.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cat_pipe = Pipeline(steps=[('encoder', BinaryEncoder())])

num_pipe = Pipeline(steps=[('scaler', MinMaxScaler())])

transformer = ColumnTransformer(transformers=[('num', num_pipe, ['year', 'kilometers']),
                                              ('cat', cat_pipe, ['brand', 'model', 'body', 'color', 'fuel', 'engine', 'transmission', 'gov'])])


X_train_preprocessed = transformer.fit_transform(X_train)
X_test_preprocessed = transformer.transform(X_test)

# Cross validation

In [252]:
# Importing LinearRegression, cross_val_score, and r2_score from scikit-learn.
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
# Initializing a Linear Regression model.
model = LinearRegression()
# Evaluating the model using 5-fold cross-validation and printing the scores as percentages.
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
# Printing the mean cross-validation score as a percentage.
scores.mean() * 100

[78.2065253  76.65910418 77.54093209 78.74523139 76.16296863]


77.46295231822164

In [253]:
# Ridge Regression
model = Ridge(alpha=0.0001)

model.fit(X_train_preprocessed, y_train)


scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[78.20652441 76.65910636 77.54093168 78.74523061 76.16296888]


77.4629523881432

In [277]:
model = Ridge(alpha=0.5)

model.fit(X_train_preprocessed, y_train)

scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[78.20169757 76.66955853 77.53853362 78.74106034 76.16388043]


77.46294609740441

In [254]:
# Lasso Regression
model = Lasso(alpha=0.0001)

model.fit(X_train_preprocessed, y_train)
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[78.20643095 76.65927017 77.54101802 78.74511851 76.16297059]


77.46296164618218

In [256]:
# Importing SVR (Support Vector Regression) from scikit-learn.
from sklearn.svm import SVR
# Initializing a Support Vector Regressor with a polynomial kernel.
model = SVR(kernel='poly')
# Evaluating the model using 5-fold cross-validation and printing the scores as percentages.
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
# Printing the mean cross-validation score as a percentage.
scores.mean() * 100

[86.2889373  84.17525169 85.48756048 87.61955603 85.01984971]


85.71823104332228

In [257]:
# Initializing a Support Vector Regressor with a radial basis function (RBF) kernel.
model = SVR(kernel='rbf')
# Evaluating the model using 5-fold cross-validation and printing the scores as percentages.
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
# Printing the mean cross-validation score as a percentage.
scores.mean() * 100

[69.80930873 70.52759172 70.17009836 71.67364725 69.01069213]


70.23826763521875

In [258]:
# Importing DecisionTreeRegressor from scikit-learn for decision tree regression.
from sklearn.tree import DecisionTreeRegressor
# Initializing a Decision Tree Regressor with default parameters.
model = DecisionTreeRegressor()
# Evaluating the model using 5-fold cross-validation and printing the scores as percentages.
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
# Printing the mean cross-validation score as a percentage.
scores.mean() * 100

[86.33152278 84.25312853 86.83962818 87.92447113 85.02613263]


86.07497664912584

In [259]:
# Importing DecisionTreeRegressor from scikit-learn for decision tree regression.
from sklearn.tree import DecisionTreeRegressor
# Initializing a Decision Tree Regressor with a maximum depth of 5 to prevent overfitting.
model = DecisionTreeRegressor(max_depth=5)
# Evaluating the model using 5-fold cross-validation and printing the scores as percentages.
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
# Printing the mean cross-validation score as a percentage.
scores.mean() * 100

[86.1448104  83.60895288 85.00088349 86.62654379 84.30745595]


85.1377293037536

In [260]:
model = DecisionTreeRegressor(max_depth=7)

scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[90.02947436 87.27127532 87.79464258 89.97667408 86.90581374]


88.39557601422827

In [261]:
model = DecisionTreeRegressor(max_depth=7)
scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[90.03180836 87.26783273 87.79464258 89.97667408 86.90581374]


88.39535429809663

In [262]:
model = DecisionTreeRegressor(max_depth=9)

scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[90.48514192 87.83696253 89.74075106 90.59970263 87.05219229]


89.142950086945

In [263]:
model = DecisionTreeRegressor(max_depth=11)

scores = cross_val_score(model, X_train_preprocessed, y_train, cv=5)
print(scores * 100)
scores.mean() * 100

[90.8593534  86.47280309 88.72139644 90.2329048  87.25873777]


88.70903910080635

# Grid Search

In [264]:
# Importing GridSearchCV from scikit-learn for hyperparameter tuning.
from sklearn.model_selection import GridSearchCV
# Defining the parameter grid for hyperparameter tuning: testing different values for 'max_depth'.
param_grid = {'max_depth': [3, 5, 7, 9, 11]}
# Initializing GridSearchCV with the Decision Tree model, parameter grid, and 5-fold cross-validation.
grid = GridSearchCV(model, param_grid, cv=5)
# Fitting the GridSearchCV object to the preprocessed training data to find the best hyperparameters.
grid.fit(X_train_preprocessed, y_train)

In [265]:
print(grid.best_score_ * 100)
print(grid.best_params_)

89.24922869895153
{'max_depth': 9}


In [266]:
# Importing r2_score from scikit-learn to evaluate the model's performance.
from sklearn.metrics import r2_score
# Initializing a Decision Tree Regressor with a maximum depth of 9.
model = DecisionTreeRegressor(max_depth=9)
# Fitting the model to the preprocessed training data.
model.fit(X_train_preprocessed, y_train)
# Making predictions on the preprocessed test data.
y_pred = model.predict(X_test_preprocessed)
# Calculating and printing the R^2 score of the model's predictions as a percentage.
r2_score(y_test, y_pred) * 100

90.7058903643616

# Test 

In [267]:
model = DecisionTreeRegressor(max_depth=11)

model.fit(X_train_preprocessed, y_train)

y_pred = model.predict(X_test_preprocessed)

r2_score(y_test, y_pred) * 100

89.81342090798027

# Saving

In [268]:
#save preprocessor 
import pickle
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(transformer, f)

In [269]:
#save model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)