# Neural networks training

## Basic imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## TensorFlow imports

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

2023-07-04 22:18:59.378037: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 22:18:59.435049: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-04 22:18:59.436348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data loading

In [3]:
# Load dataset
df = pd.read_csv('finalDataset.csv')
# Drop the first column
df = df.drop(df.columns[0], axis=1)
# Drop rows where price is NaN
df.dropna(subset=['Price'], inplace=True)
# Separate the features and target variable
X = df.drop('Price', axis=1)
y = df['Price']

### Specific preprocessing 1 for the dataset

In [4]:
# Drop columns with non-NA count less than threshold that is more than half the rows number
X = X.dropna(axis=1, thresh=40000)
X['Fiscal power'] = X['Fiscal power'].str.strip().replace('Plus de 41', 41, regex=True)
X['Fiscal power'].replace(['--'], np.nan, inplace=True)
X['Fiscal power'] = X['Fiscal power'].str.strip().replace('CV', '', regex=True)
# Convert non numerical values
X['Mileage'] = X['Mileage'].str.replace(' ', '')
X['Mileage'] = X['Mileage'].str.replace('Plusde500000', '500000-1000000')
X['Model Year'] = X['Model Year'].str.replace('1980 ou plus ancien', '1980')
# Remove spaces
X[['lower_mileage', 'upper_mileage']] = X['Mileage'].str.split('-', expand=True)
X = X.drop(['Mileage'], axis=1)
X

Unnamed: 0,Model Year,Brand,Model,Fuel type,Fiscal power,lower_mileage,upper_mileage
0,2012,Peugeot,206+,Essence,6,90000,94999
1,2011,Peugeot,Partner,Diesel,7,25000,29999
2,2010,Dacia,Sandero,Diesel,6,250000,299999
4,2008,Daihatsu,Sirion,Essence,,150000,159999
5,2014,Dacia,Duster,Diesel,6,170000,179999
...,...,...,...,...,...,...,...
64852,2001,,,Diesel,9,,
64853,2019,,,,,10000,14999
64854,2007,,,Diesel,10,,
64855,2018,,,,,0,4999


## Preporcessing imports

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Creation of the Column Transformers

In [6]:
#1st Imputation Transformer
imput_trf = ColumnTransformer([
        ('impute_num',SimpleImputer(strategy='median'), [0, 4, 5, 6]),
        ('impute_cat_ord',SimpleImputer(strategy='most_frequent'), []),
        ('impute_cat_non_ord',SimpleImputer(strategy='most_frequent'), [1, 2, 3]),
    ],remainder='passthrough')

#2nd Encoding
encode_trf = ColumnTransformer([
    ('one_hot_encod', OneHotEncoder(sparse=False, handle_unknown='ignore'), [4, 5, 6]),
    ('ord_encod', OrdinalEncoder(), []),
], remainder='passthrough')

#3rd Scaling
scale_trf= ColumnTransformer([
    ('one_hot_encod', StandardScaler(), [895, 896, 897, 898]),

], remainder='passthrough')




## Pipeline creation

In [7]:
# Creation of the pipeline
pipe = Pipeline([
    ('imput_trf', imput_trf),
    ('encode_trf', encode_trf),
    ('scale_trf', scale_trf),
])


## Fitting the pipeline to the data

In [8]:
# Apply preprocessing to the data
# Display Pipeline
from sklearn import set_config
set_config(display='diagram')
#fit data
pipe.fit(X)



## Applying the pipeline to the data

In [9]:
# Applying the pipeline to the data
X_preprocessed = pipe.transform(X)
X_preprocessed_df = pd.DataFrame(X_preprocessed)


# **************************************************************************

## This part to get the columns to apply to the pipeline

# **************************************************************************

In [10]:
# Assuming you have a transformer object called 'transformer'
# Applying the transformer to a DataFrame 'df'
X_imput = imput_trf.fit_transform(X)

# Convert the transformed data back to a DataFrame
X_imput_df = pd.DataFrame(X_imput)
X_imput_df


Unnamed: 0,0,1,2,3,4,5,6
0,2012.0,6.0,90000.0,94999.0,Peugeot,206+,Essence
1,2011.0,7.0,25000.0,29999.0,Peugeot,Partner,Diesel
2,2010.0,6.0,250000.0,299999.0,Dacia,Sandero,Diesel
3,2008.0,7.0,150000.0,159999.0,Daihatsu,Sirion,Essence
4,2014.0,6.0,170000.0,179999.0,Dacia,Duster,Diesel
...,...,...,...,...,...,...,...
61692,2001.0,9.0,110000.0,119999.0,Renault,Logan,Diesel
61693,2019.0,7.0,10000.0,14999.0,Renault,Logan,Diesel
61694,2007.0,10.0,110000.0,119999.0,Renault,Logan,Diesel
61695,2018.0,7.0,0.0,4999.0,Renault,Logan,Diesel


In [11]:
X_imput_df.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [12]:
# Assuming you have a transformer object called 'transformer'
# Applying the transformer to a DataFrame 'df'
X_encode = encode_trf.fit_transform(X_imput)

# Convert the transformed data back to a DataFrame
X_encode_df = pd.DataFrame(X_encode)
X_encode_df




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,889,890,891,892,893,894,895,896,897,898
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2012.0,6.0,90000.0,94999.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2011.0,7.0,25000.0,29999.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2010.0,6.0,250000.0,299999.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,2008.0,7.0,150000.0,159999.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2014.0,6.0,170000.0,179999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2001.0,9.0,110000.0,119999.0
61693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2019.0,7.0,10000.0,14999.0
61694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2007.0,10.0,110000.0,119999.0
61695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,2018.0,7.0,0.0,4999.0


# **************************************************************************

## Division of the dataset to training/test

In [13]:

from sklearn.model_selection import train_test_split

# Divide using train test split function
X_train, X_test, y_train, y_test = train_test_split(
  X,y , random_state=1,test_size=0.2, shuffle=True)

In [14]:
from sklearn.neural_network import MLPRegressor
import joblib

In [15]:
#mlp = MLPRegressor(hidden_layer_sizes=(60), max_iter=1000)
#mlp.fit(X_train, y_train)
#y_predict = mlp.predict(X_test)

#Saving the machine learning model to a file
#joblib.dump(mlp, "price_model.pkl")


In [21]:
# Assuming your input features are 'feature1', 'feature2', 'feature3'
X_test = pd.DataFrame({'Year': [2012], 'Brand': ['Peugeot'], 'Model': ['208'], 'Fuel': ['Diesel'], 'Power': [6], 'Lower_mileag': [100000], 'Upper_mileage': [120000]})
X_test

Unnamed: 0,Year,Brand,Model,Fuel,Power,Lower_mileag,Upper_mileage
0,2012,Peugeot,208,Diesel,6,100000,120000


In [18]:
# Apply the preprocessing pipeline to X_test
X_test_preprocessed = pipe.transform(X_test)


# Create a DataFrame with the preprocessed data
X_test_preprocessed = pd.DataFrame(X_test_preprocessed)
X_test_preprocessed

KeyError: "None of [Index(['Model Year', 'Fiscal power', 'lower_mileage', 'upper_mileage'], dtype='object')] are in the [columns]"

In [None]:


# Load the trained model from the pickle file
mlp = joblib.load("price_model.pkl")

# Make predictions on X_test
y_predict = mlp.predict(X_test_preprocessed)

In [22]:
X_imput_test = imput_trf.fit_transform(X_test)
X_imput_test

array([[2012.0, 6.0, 100000.0, 120000.0, 'Peugeot', '208', 'Diesel']],
      dtype=object)

In [23]:
# Assuming you have a transformer object called 'transformer'
# Applying the transformer to a DataFrame 'df'
X_encode_test = encode_trf.fit_transform(X_imput_test)

# Convert the transformed data back to a DataFrame
X_encode_test = pd.DataFrame(X_encode_test)
X_encode_test



array([[1.0, 1.0, 1.0, 2012.0, 6.0, 100000.0, 120000.0]], dtype=object)

# Here the problem is the encoder doesn't do it with all the categorical values available on the train dataframe but the test dataframe