In [582]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re


In [583]:
import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

In [584]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_info_columns', 500)
pd.set_option('display.max_info_rows', 2000)
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.width', 2000)

In [585]:
pd.set_option('display.max_columns', None)

In [586]:
scout_raw = pd.read_json('scout_car.json', lines=True)
scout = pd.read_json('scout_car.json', lines=True)

In [587]:
scout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 54 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   url                            object 
 1   make_model                     object 
 2   short_description              object 
 3   body_type                      object 
 4   price                          int64  
 5   vat                            object 
 6   km                             object 
 7   registration                   object 
 8   prev_owner                     object 
 9   kW                             float64
 10  hp                             object 
 11  Type                           object 
 12  Previous Owners                object 
 13  Next Inspection                object 
 14  Inspection new                 object 
 15  Warranty                       object 
 16  Full Service                   object 
 17  Non-smoking Vehicle            object 
 18  null  

In [588]:
scout.shape

(15919, 54)

In [589]:
scout.describe()

Unnamed: 0,price,kW
count,15919.0,0.0
mean,18019.896727,
std,7386.169409,
min,13.0,
25%,12850.0,
50%,16900.0,
75%,21900.0,
max,74600.0,


## Missing Value Check

First of all let's examine the percentage of missing values.

In [590]:
(100 - scout.isnull().sum()*100/scout.shape[0]).sort_values()

kW                                 0.000000
Last Timing Belt Service Date      0.100509
Electricity consumption            0.860607
Available from                     1.708650
Last Service Date                  3.555500
Availability                       3.988944
Other Fuel Types                   5.527985
Next Inspection                   22.206169
Inspection new                    24.700044
Emission Label                    25.032979
Model Code                        31.270808
Non-smoking Vehicle               45.084490
Country version                   47.653747
Full Service                      51.605000
Weight                            56.190715
Drive chain                       56.919404
prev_owner                        57.107859
Previous Owners                   58.288837
Paint Type                        63.741441
Cylinders                         64.319367
Warranty                          65.952635
Gears                             70.400151
vat                             

## Dealing With Insufficient & Irrelevant Columns

We are going to consantrate on the columns that have high percentage of missing values. To do so we assume columns that have more than 35% missing values as columns of interest. 

In [591]:
def columns_interest(data, limit):
    interest = data.isnull().sum()*100/data.shape[0]
    return (interest.loc[lambda x : x > limit]).sort_values()
columns_interest(scout, 35)

Cylinders                         35.680633
Paint Type                        36.258559
Previous Owners                   41.711163
prev_owner                        42.892141
Drive chain                       43.080596
Weight                            43.809285
Full Service                      48.395000
Country version                   52.346253
Non-smoking Vehicle               54.915510
Model Code                        68.729192
Emission Label                    74.967021
Inspection new                    75.299956
Next Inspection                   77.793831
Other Fuel Types                  94.472015
Availability                      96.011056
Last Service Date                 96.444500
Available from                    98.291350
Electricity consumption           99.139393
Last Timing Belt Service Date     99.899491
kW                               100.000000
dtype: float64

- We can drop columns that have 90% or more missing values and investigate the rest of the columns of interest for the further phases of project.

- 'Other Fuel Types', 'Availability', 'Last Service Date', 'Available from', 'Electricity consumption', 'Last Timing Belt Service Date' and 'kW' will be dropped.

In [592]:
drop_lst_interest = ['Other Fuel Types', 'Availability', 'Last Service Date', 'Available from', 'Electricity consumption', 'Last Timing Belt Service Date', 'kW']

In [593]:
scout.drop(columns = drop_lst_interest, axis = 1, inplace = True)

Now we are going to investigate each column of interest.

### scout['Cylinders']

In [594]:
columns_interest(scout, 35).Cylinders

35.680633205603364

In [595]:
scout.Cylinders.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n4\n]    8105
NaN        5680
[\n3\n]    2104
[\n5\n]      22
[\n6\n]       3
[\n2\n]       2
[\n8\n]       2
[\n1\n]       1
Name: Cylinders, dtype: int64

In [596]:
scout.Cylinders.str[0].str.strip().value_counts(dropna = False)

4      8105
NaN    5680
3      2104
5        22
6         3
8         2
2         2
1         1
Name: Cylinders, dtype: int64

In [597]:
scout['Cylinders'] = scout.Cylinders.str[0].str.strip()

We are going to keep the column and deal with the missing values in further phases of the project

### scout['Paint Type']

In [598]:
columns_interest(scout, 35)['Paint Type']

36.258558954708214

In [599]:
scout['Paint Type'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nMetallic\n]       9794
NaN                  5772
[\nUni/basic\n]       347
[\nPerl effect\n]       6
Name: Paint Type, dtype: int64

In [600]:
scout['Paint Type'].str[0].str.strip().value_counts(dropna = False)

Metallic       9794
NaN            5772
Uni/basic       347
Perl effect       6
Name: Paint Type, dtype: int64

In [601]:
scout['Paint_Type'] = scout['Paint Type'].str[0].str.strip()
scout.drop(columns = 'Paint Type', axis = 1, inplace = True)

### scout['Previous Owners'] & scout['prev_owner']

These two columns seems to contain similar data we are going to assess them together.

In [602]:
columns_interest(scout, 35)['Previous Owners']

41.71116276147999

In [603]:
columns_interest(scout, 35)['prev_owner']

42.8921414661725

In [604]:
scout['Previous Owners'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


\n1\n                                                                                        8101
NaN                                                                                          6640
\n2\n                                                                                         766
\n0\n                                                                                         163
\n3\n                                                                                          17
                                                                                             ... 
[\n1\n, \n181 g CO2/km (comb)\n]                                                                1
[\n1\n, \n, 6.1 l/100 km (comb), \n, 7.7 l/100 km (city), \n, 5.2 l/100 km (country), \n]       1
[\n1\n, \nEuro 6\n]                                                                             1
[\n1\n, \n, 5.9 l/100 km (comb), \n, 7.6 l/100 km (city), \n, 4.9 l/100 km (country), \n]       1
[\n1\n, \n102 g CO2/

#### scout['Previous Owners'

In [605]:
scout['Previous Owners'].str.strip().value_counts(dropna = False)

1      8101
NaN    6870
2       766
0       163
3        17
4         2
Name: Previous Owners, dtype: int64

In [606]:
scout['Previous Owners'].str[0].str[1].value_counts(dropna = False)

NaN    15689
1        193
0         25
2         12
Name: Previous Owners, dtype: int64

In [607]:
scout['Previous_Owners'] = scout['Previous Owners'].str.strip()

In [608]:
scout['Previous_Owners_add'] = scout['Previous Owners'].str[0].str[1]

- Now that we found additional useful previous owner values we are going to fill main column with the additional values.

In [609]:
scout.Previous_Owners.fillna(scout.Previous_Owners_add, inplace = True)

In [610]:
scout.Previous_Owners.value_counts(dropna = False)

1      8294
NaN    6640
2       778
0       188
3        17
4         2
Name: Previous_Owners, dtype: int64

In [611]:
scout['prev_owner'].value_counts(dropna = False)

1 previous owner     8294
NaN                  6828
2 previous owners     778
3 previous owners      17
4 previous owners       2
Name: prev_owner, dtype: int64

In [612]:
scout['prev_owner'].str[0].value_counts(dropna = False)

1      8294
NaN    6828
2       778
3        17
4         2
Name: prev_owner, dtype: int64

In [613]:
scout['prev_owner'] = scout['prev_owner'].str[0].value_counts(dropna = False)

In [614]:
scout.drop(columns = ['prev_owner', 'Previous Owners', 'Previous_Owners_add'], inplace = True)

As 'Previous Owners' contains same values including 188 first owner (0) we dropped 'prev_owner' and also 'Previous Owners' and 'Previous_Owners_add' are no longer needed to be kept.

### scout['Drive chain']

In [615]:
columns_interest(scout, 35)['Drive chain']

43.08059551479364

In [616]:
scout['Drive chain'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nfront\n]    8886
NaN            6858
[\n4WD\n]       171
[\nrear\n]        4
Name: Drive chain, dtype: int64

In [617]:
scout['Drive chain'].str[0].str.strip().value_counts(dropna = False)

front    8886
NaN      6858
4WD       171
rear        4
Name: Drive chain, dtype: int64

In [618]:
scout['drive_chain'] = scout['Drive chain'].str[0].str.strip()

In [619]:
scout.drop(columns = 'Drive chain', inplace = True)

### scout['Weight']

In [620]:
columns_interest(scout, 35).Weight

43.8092845027954

In [621]:
scout.Weight.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN               6974
[\n1,163 kg\n]     574
[\n1,360 kg\n]     356
[\n1,165 kg\n]     301
[\n1,335 kg\n]     242
                  ... 
[\n1,030 kg\n]       1
[\n1,206 kg\n]       1
[\n1,492 kg\n]       1
[\n1,057 kg\n]       1
[\n1,939 kg\n]       1
Name: Weight, Length: 435, dtype: int64

In [622]:
scout.Weight.str[0].str.strip().str.extract('(\d,*\d*)')[0].str.replace(',', '').value_counts(dropna = False)

NaN     6974
1163     574
1360     356
1165     301
1335     242
        ... 
1648       1
1397       1
1792       1
2044       1
1523       1
Name: 0, Length: 435, dtype: int64

In [623]:
scout['Weight'] = scout.Weight.str[0].str.strip().str.extract('(\d,*\d*)')[0].str.replace(',', '')

Regarding its high Null value percentage and intuitively being not price related factor we will drop 'Weight' column.

In [624]:
scout.drop(columns = 'Weight', inplace = True)

## scout['Full Service']

In [625]:
scout['Full Service'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                           7704
[\n, \n, \n4 (Green)\n]                                                                       2235
[\n, \n, \nEuro 6\n]                                                                          2097
[\n, \n]                                                                                      1702
[\n, \n, \nEuro 6d-TEMP\n]                                                                     399
                                                                                              ... 
[\n, \n, \n, 6 l/100 km (comb), \n, 7.5 l/100 km (city), \n, 5.2 l/100 km (country), \n]         1
[\n, \n, \n, 5.5 l/100 km (comb), \n, 7.2 l/100 km (city), \n, 4.5 l/100 km (country), \n]       1
[\n, \n, \n, 5.8 l/100 km (comb), \n, 7.4 l/100 km (city), \n, 4.9 l/100 km (country), \n]       1
[\n, \n, \n, 5.4 l/100 km (comb), \n, 7 l/100 km (city), \n, 4.5 l/100 km (country), \n]         1
[\n, \n, \

According to the high null value percentage and non related content we will drop this column

In [626]:
scout.drop(columns = ['Full Service'], inplace = True)

## scout['Country version']

In [627]:
scout['Country version'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                     8333
[\nGermany\n]           4502
[\nItaly\n]             1038
[\nEuropean Union\n]     507
[\nNetherlands\n]        464
[\nSpain\n]              325
[\nBelgium\n]            314
[\nAustria\n]            208
[\nCzech Republic\n]      52
[\nPoland\n]              49
[\nFrance\n]              38
[\nDenmark\n]             33
[\nHungary\n]             28
[\nJapan\n]                8
[\nSlovakia\n]             4
[\nCroatia\n]              4
[\nSweden\n]               3
[\nRomania\n]              2
[\nBulgaria\n]             2
[\nSerbia\n]               1
[\nLuxembourg\n]           1
[\nSwitzerland\n]          1
[\nSlovenia\n]             1
[\nEgypt\n]                1
Name: Country version, dtype: int64

In [628]:
scout['country'] = scout['Country version'].str[0].str.strip()

In [629]:
scout.drop(columns = ['Country version'], inplace = True)

## scout['Non-smoking Vehicle']

In [630]:
scout['Non-smoking Vehicle'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                           8742
[\n, \n]                                                                                      3647
[\n, \n, \n4 (Green)\n]                                                                       1240
[\n, \n, \nEuro 6\n]                                                                          1127
[\n, \n, \nEuro 6d-TEMP\n]                                                                     345
                                                                                              ... 
[\n, \n, \n101 g CO2/km (comb)\n]                                                                1
[\n, \n, \n122 g CO2/km (comb)\n]                                                                1
[\n, \n, \n159 g CO2/km (comb)\n]                                                                1
[\n, \n, \n, 6.2 l/100 km (comb), \n, 7.9 l/100 km (city), \n, 5.3 l/100 km (country), \n]       1
[\n, \n, \

In [631]:
columns_interest(scout, 35)['Non-smoking Vehicle']

54.91550976820152

More than half of the column consist of null values and the content is not intuitive. So, we will drop 'Non-smoking Vehicle' column.

In [632]:
scout.drop(columns = ['Non-smoking Vehicle'], inplace = True)

## scout['Model Code']

In [633]:
scout['Model Code'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN               10941
[\n0035/BCB\n]      268
[\n0588/BNO\n]      245
[\n0588/BDB\n]      206
[\n0588/BHX\n]      188
                  ...  
[\n1844/AEN\n]        1
[\n0588/AVR\n]        1
[\n0035/AFF\n]        1
[\n1844/AFM\n]        1
[\n0035/AVN\n]        1
Name: Model Code, Length: 233, dtype: int64

In [634]:
columns_interest(scout, 35)['Model Code']

68.72919153213141

More than 68% of the column consist of null values and the content is not intuitive. So, we will drop 'Model Code' column.

In [635]:
scout.drop(columns = ['Model Code'], inplace = True)

## scout['Emission Label']

In [636]:
scout['Emission Label'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                     11934
[\n4 (Green)\n]          3553
[\n1 (No sticker)\n]      381
[[], [], []]               40
[\n5 (Blue)\n]              8
[\n3 (Yellow)\n]            2
[\n2 (Red)\n]               1
Name: Emission Label, dtype: int64

In [637]:
scout['Emission Label'].str[0].str.strip().value_counts(dropna = False)

NaN               11974
4 (Green)          3553
1 (No sticker)      381
5 (Blue)              8
3 (Yellow)            2
2 (Red)               1
Name: Emission Label, dtype: int64

In [638]:
columns_interest(scout, 35)['Emission Label']

74.9670205414913

In [639]:
scout['emission_label'] = scout['Emission Label'].str[0].str.strip()

In [640]:
scout.drop(columns = ['Emission Label'], inplace = True)

## scout['Inspection new']

scout['Inspection new'].value_counts(dropna = False)

In [641]:
scout['Inspection new'].str[0].value_counts(dropna = False)

NaN        11987
\nYes\n     3570
\n           362
Name: Inspection new, dtype: int64

In [642]:
inspection_lst = [''.join(map(str, item)) if isinstance(item, list) else item for item in scout['Inspection new']] 

In [643]:
inspection_serie = pd.Series(inspection_lst)

In [644]:
inspection_serie.str.extract('(Yes)')[0].value_counts(dropna = False)

NaN    11987
Yes     3932
Name: 0, dtype: int64

In [645]:
scout['inspection_new'] = inspection_serie.str.extract('(Yes)')[0].str.contains('Yes', na = False) * 1

Due to the high null percentage droppping 'Inspection' should be considered. However this intuitiveley seems to effect price. For now we will keep it.

In [646]:
scout.drop(columns = ['Inspection new'], inplace = True)

## scout['Next Inspection']

In [647]:
scout['Next Inspection'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                              12384
\n04/2022\n                                                                                         62
\n03/2021\n                                                                                         38
\n03/2022\n                                                                                         36
\n06/2021\n                                                                                         34
                                                                                                 ...  
[\n08/2020\n, \n153 g CO2/km (comb)\n]                                                               1
[\n04/2021\n, \n103 g CO2/km (comb)\n]                                                               1
[\n10/2019\n, \n, 5.3 l/100 km (comb), \n, 7 l/100 km (city), \n, 4.3 l/100 km (country), \n]        1
[\n01/2020\n, \n123 g CO2/km (comb)\n]                                   

In [648]:
next_inspection_lst = [''.join(map(str, item)) if isinstance(item, list) else item for item in scout['Next Inspection']]

In [649]:
next_inspection_serie = pd.Series(next_inspection_lst)

In [650]:
next_inspection_serie.value_counts(dropna = False)

NaN                                                                                12384
\n04/2022\n                                                                           62
\n06/2021\n                                                                           47
\n03/2021\n                                                                           39
\n03/2022\n                                                                           36
                                                                                   ...  
\n04/2021\n\n105 g CO2/km (comb)\n                                                     1
\n01/2021\n\n0 kWh/100 km (comb)\n                                                     1
\n03/2022\n\n6 l/100 km (comb)\n7.8 l/100 km (city)\n4.9 l/100 km (country)\n          1
\n01/2022\n\n6.1 l/100 km (comb)\n7.5 l/100 km (city)\n5.2 l/100 km (country)\n        1
\n04/2020\n\n115 g CO2/km (comb)\n                                                     1
Length: 1385, dtype: 

In [651]:
next_inspection_serie.str.extract('(\d{2}/\d{4})')[0].value_counts(dropna = False)

NaN        12384
06/2021      471
03/2021      210
05/2021      180
04/2021      171
           ...  
05/2014        1
11/2022        1
05/2017        1
03/2024        1
01/1999        1
Name: 0, Length: 78, dtype: int64

In [652]:
scout['next_inspection'] = next_inspection_serie.str.extract('(\d{2}/\d{4})')[0]

In [653]:
scout['next_inspection'] = pd.to_datetime(scout.next_inspection, format = '%m/%Y')

In [654]:
scout.drop(columns = ['Next Inspection'], inplace = True)

## scout['short_description']

In [655]:
scout['short_description'].value_counts(dropna = False)

SPB 1.6 TDI 116 CV S tronic Sport                 64
NaN                                               46
1.4 66kW (90CV) Selective                         40
MOVE KLIMA CD USB ALLWETTER BLUETOOTH             38
SPB 30 TDI S tronic Business                      35
                                                  ..
220CV 5 porte R.S. Trophy FULL PELLE rs            1
1.6 CDTi 136cv Sports Tourer Innovation            1
1.2 75CV 5 porte Intensive b/gpl                   1
SPB 30TDI 116cv S-TRONIC Business L. 32.559€       1
SPB 1.6 TDI 116 CV S tronic Sport NAVI SENSORI     1
Name: short_description, Length: 10002, dtype: int64

In [656]:
scout['cc_from_description'] = scout['short_description'].str.extract('\D*([0-3]{1}\.\d{1})[\D ]')

In [657]:
scout.drop(columns = ['short_description'], inplace = True)

In [658]:
scout['cc_from_description'] = scout['cc_from_description'] * 1000

## scout['Warranty']

In [659]:
scout.Warranty.isnull().sum()/scout.Warranty.shape[0]*100

34.047364784220115

In [660]:
scout.Warranty.isnull().sum()

5420

In [661]:
scout.Warranty.value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                         5420
[\n, \n, \nEuro 6\n]                        1868
\n12 months\n                               1177
\n                                           979
\n24 months\n                                566
                                            ... 
[\n24 months\n, \n128 g CO2/km (comb)\n]       1
[\n60 months\n, \n98 g CO2/km (comb)\n]        1
[\n20 months\n, \n139 g CO2/km (comb)\n]       1
[\n10 months\n, \n104 g CO2/km (comb)\n]       1
[\n16 months\n, \n116 g CO2/km (comb)\n]       1
Name: Warranty, Length: 516, dtype: int64

In [662]:
scout.Warranty.value_counts(dropna = False).sample(50)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n60 months\n, \n92 g CO2/km (comb)\n]                                                                2
\n                                                                                                   979
[\n24 months\n, \n106 g CO2/km (comb)\n]                                                               7
[\n36 months\n, \n108 g CO2/km (comb)\n]                                                               2
[\n12 months\n, \n107 g CO2/km (comb)\n]                                                              25
[\n72 months\n, \n156 g CO2/km (comb)\n]                                                               2
[\n24 months\n, \n132 g CO2/km (comb)\n]                                                               3
[\n12 months\n, \n, 5.9 l/100 km (comb), \n, 7.5 l/100 km (city), \n, 4.9 l/100 km (country), \n]      1
[\n60 months\n, \n138 g CO2/km (comb)\n]                                                               2
[\n12 months\n, \n82 g CO2/km (comb)\n]                

In [663]:
warranty_lst = [''.join(item).strip() if isinstance(item,list) else item for item in scout['Warranty']]

In [664]:
scout['warranty_mnth'] = pd.DataFrame(warranty_lst)

In [665]:
scout['warranty_mnth'] = scout.warranty_mnth.str.extract('(\d{1,3}) months')

In [666]:
scout['warranty_mnth'].value_counts(dropna = 0)

NaN    11066
12      2594
24      1118
60       401
36       279
48       149
6        125
72        59
3         33
23        11
18        10
20         7
25         6
2          5
26         4
50         4
16         4
34         3
4          3
13         3
1          3
19         3
14         2
28         2
22         2
45         2
9          2
46         2
11         2
21         2
17         2
49         1
10         1
7          1
65         1
15         1
33         1
47         1
56         1
40         1
30         1
8          1
Name: warranty_mnth, dtype: int64

In [667]:
scout['warranty_mnth'].isnull().sum()/scout['warranty_mnth'].shape[0]*100

69.51441673471952

Since 69% is null we will drop 'Warranty' and 'warranty_mnth'. We also drop non related 'url' column.

In [668]:
scout.drop(columns = ['warranty_mnth', 'Warranty', 'url'], inplace = True)

## scout['Make']

In [669]:
scout['Make'].value_counts(dropna = False)

\nOpel\n       7343
\nAudi\n       5712
\nRenault\n    2864
Name: Make, dtype: int64

In [670]:
scout['Make'] = scout['Make'].str.strip()

In [671]:
scout['Make'].value_counts()

Opel       7343
Audi       5712
Renault    2864
Name: Make, dtype: int64

## scout['Model']

In [672]:
scout['Model'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, A3, \n]          3097
[\n, A1, \n]          2614
[\n, Insignia, \n]    2598
[\n, Astra, \n]       2526
[\n, Corsa, \n]       2219
[\n, Clio, \n]        1839
[\n, Espace, \n]       991
[\n, Duster, \n]        34
[\n, A2, \n]             1
Name: Model, dtype: int64

In [673]:
scout['Model'] = scout.Model.str[1]

## scout['Body Color']

In [674]:
scout['Body Color'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Black, \n]     3745
[\n, Grey, \n]      3505
[\n, White, \n]     3406
[\n, Silver, \n]    1647
[\n, Blue, \n]      1431
[\n, Red, \n]        957
[\n, Brown, \n]      289
[\n, Green, \n]      154
[\n, Beige, \n]      108
[\n, Yellow, \n]      51
[\n, Violet, \n]      18
[\n, Bronze, \n]       6
[\n, Orange, \n]       3
[\n, Gold, \n]         2
Name: Body Color, dtype: int64

In [675]:
scout['body_color'] = scout['Body Color'].str[1].str.strip()

In [676]:
scout.body_color.fillna(method = 'bfill', inplace = True)

In [677]:
scout.body_color.value_counts(dropna = False)

Black     3888
Grey      3638
White     3540
Silver    1687
Blue      1524
Red        989
Brown      299
Green      163
Beige      108
Yellow      53
Violet      18
Bronze       7
Orange       3
Gold         2
Name: body_color, dtype: int64

## scout['Body Color Original']

In [678]:
scout['Body Color Original'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                             3759
[\nOnyx Schwarz\n]               338
[\nBianco\n]                     282
[\nMythosschwarz Metallic\n]     238
[\nBrillantschwarz\n]            216
                                ... 
[\nBianca - Tetto Nero\n]          1
[\ndezir rot\n]                    1
[\nnero/tetto argento met\n]       1
[\nPython Yellow Metallic\n]       1
[\nkarbongrau\n]                   1
Name: Body Color Original, Length: 1928, dtype: int64

In [679]:
scout.drop(columns='Body Color Original', inplace = True)

## scout['Upholstery']

In [680]:
scout['Upholstery'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nCloth, Black\n]           5821
NaN                          3720
[\nPart leather, Black\n]    1121
[\nCloth\n]                  1005
[\nCloth, Grey\n]             891
[\nCloth, Other\n]            639
[\nFull leather, Black\n]     575
[\nBlack\n]                   491
[\nGrey\n]                    273
[\nOther, Other\n]            182
[\nPart leather\n]            140
[\nFull leather\n]            139
[\nFull leather, Brown\n]     116
[\nPart leather, Grey\n]      116
[\nOther, Black\n]            110
[\nFull leather, Other\n]      72
[\nFull leather, Grey\n]       67
[\nPart leather, Other\n]      65
[\nOther\n]                    56
[\nPart leather, Brown\n]      50
[\nalcantara, Black\n]         47
[\nVelour, Black\n]            36
[\nFull leather, Beige\n]      36
[\nCloth, Brown\n]             28
[\nVelour\n]                   16
[\nOther, Grey\n]              15
[\nCloth, Beige\n]             13
[\nCloth, Blue\n]              12
[\nBrown\n]                    12
[\nVelour, Gre

In [681]:
scout['Upholstery'].isnull().sum()/scout['Upholstery'].shape[0]*100

23.368302029021923

In [682]:
scout['Upholstery'] = scout.Upholstery.str[0].str.strip()

In [683]:
## scout['Nr. of Doors']

In [684]:
scout['Nr. of Doors'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n5\n]    11575
[\n4\n]     3079
[\n3\n]      832
[\n2\n]      219
NaN          212
[\n7\n]        1
[\n1\n]        1
Name: Nr. of Doors, dtype: int64

In [685]:
scout['door_number'] = scout['Nr. of Doors'].str[0].str.strip()

In [686]:
scout.drop(columns = 'Nr. of Doors', inplace = True)

## scout['Nr. of Seats']

In [687]:
scout['Nr. of Seats'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n5\n]    13336
[\n4\n]     1125
NaN          977
[\n7\n]      362
[\n2\n]      116
[\n6\n]        2
[\n3\n]        1
Name: Nr. of Seats, dtype: int64

In [688]:
scout['seat_number'] = scout['Nr. of Seats'].str[0].str.strip()

In [689]:
scout.drop(columns = 'Nr. of Seats', inplace = True)

## scout['Gearing Type']

In [690]:
scout['Gearing Type'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Manual, \n]            8153
[\n, Automatic, \n]         7297
[\n, Semi-automatic, \n]     469
Name: Gearing Type, dtype: int64

In [691]:
scout['gear_type'] = scout['Gearing Type'].str[1]

In [692]:
scout.drop(columns = 'Gearing Type', inplace = True)

## scout['Displacement']

In [693]:
scout['Displacement'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n1,598 cc\n]     4761
[\n999 cc\n]       2438
[\n1,398 cc\n]     1314
[\n1,399 cc\n]      749
[\n1,229 cc\n]      677
                   ... 
[\n1,800 cc\n]        1
[\n140 cc\n]          1
[\n15,898 cc\n]       1
[\n1,686 cc\n]        1
[\n1,368 cc\n]        1
Name: Displacement, Length: 78, dtype: int64

In [694]:
scout['Displacement'] = scout.Displacement.str[0].str.strip()

In [695]:
scout.Displacement.replace({',':'', ' cc':''}, regex = True, inplace = True)

In [696]:
scout['Displacement'].isnull().sum()

496

In [697]:
scout['Displacement'].fillna(scout['cc_from_description'], inplace = True)

In [698]:
scout.drop(columns= ['cc_from_description'], inplace = True)

We have extract 315 additional displacement values which are null in this column fron short decription and filled in regarding rows.

## scout['Fuel']

In [699]:
scout['Fuel'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Diesel (Particulate Filter), \n]                                                                                  4315
[\n, Super 95, \n]                                                                                                     3338
[\n, Gasoline, \n]                                                                                                     3175
[\n, Diesel, \n]                                                                                                       2984
[\n, Super 95 / Regular/Benzine 91, \n]                                                                                 424
                                                                                                                       ... 
[\n, Super Plus 98 / Super E10 95, \n]                                                                                    1
[\n, Regular/Benzine 91 / Super 95 / Regular/Benzine E10 91 / Super E10 95 / Super Plus 98 / Super Plus E10 98, \n]       1
[\n, Sup

In [700]:
scout.Fuel.str[1].value_counts(dropna = False)

Diesel (Particulate Filter)                                                                                  4315
Super 95                                                                                                     3338
Gasoline                                                                                                     3175
Diesel                                                                                                       2984
Super 95 / Regular/Benzine 91                                                                                 424
                                                                                                             ... 
Super 95 / Super Plus 98 / Super Plus E10 98                                                                    1
Super 95 / Regular/Benzine 91 / Super E10 95 / Super Plus E10 98 / Super Plus 98 / Regular/Benzine E10 91       1
Super 95 / Super E10 95 / Super Plus E10 98                                             

In [701]:
scout.Fuel.str[1].value_counts(dropna = False).sample(40)

Regular/Benzine 91 / Super 95 / Super Plus 98 / Regular/Benzine E10 91 / Super Plus E10 98 / Super E10 95       4
Regular/Benzine 91 / Super 95 / Super E10 95                                                                    1
Super 95 / Super E10 95 / Super Plus E10 98 / Super Plus 98                                                     4
Diesel (Particulate Filter)                                                                                  4315
Super E10 95 / Super Plus E10 98                                                                                4
Super 95 / Super E10 95 / Super Plus E10 98                                                                     1
Regular/Benzine 91 (Particulate Filter)                                                                       100
Regular/Benzine E10 91 / Regular/Benzine 91 / Super 95 / Super Plus 98 / Super E10 95 / Super Plus E10 98       1
Super E10 95 / Regular/Benzine 91                                                       

In [702]:
scout['fuel_new'] =  scout.Fuel.str[1]

In [703]:
diesel_check = scout.fuel_new.str.contains('diesel', case = False, regex = True)

In [704]:
alternative_check = scout.fuel_new.str.contains('lpg|cng|domestic|electric|others|bio', case = False, regex = True)

In [705]:
scout.loc[diesel_check, 'fuel_new'] = 'Diesel'

In [706]:
scout.loc[alternative_check, 'fuel_new'] = 'alternative_fuel'

In [707]:
scout.fuel_new.loc[lambda x : x != 'Diesel'][lambda x : x != 'alternative_fuel']

1                                             Gasoline
9                                             Gasoline
11                                            Super 95
12                                  Regular/Benzine 91
13                                            Gasoline
                             ...                      
15908                                         Gasoline
15910                           Regular/Benzine E10 91
15913                                         Super 95
15915    Super 95 / Super Plus 98 (Particulate Filter)
15918                                         Super 95
Name: fuel_new, Length: 8544, dtype: object

In [708]:
benzine_index = list(scout.fuel_new.loc[lambda x : x != 'Diesel'][lambda x : x != 'alternative_fuel'].index)

In [709]:
scout.fuel_new.iloc[benzine_index] = 'Benzine'

In [710]:
scout.fuel_new.value_counts(dropna = False)

Benzine             8544
Diesel              7299
alternative_fuel      76
Name: fuel_new, dtype: int64

In [711]:
scout.drop(columns = 'Fuel', inplace = True)

## scout['Consumption']

In [712]:
scout['Consumption'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                   1906
[[3.9 l/100 km (comb)], [4.1 l/100 km (city)], [3.7 l/100 km (country)]]               304
[[4.2 l/100 km (comb)], [5 l/100 km (city)], [3.7 l/100 km (country)]]                 276
[[5.4 l/100 km (comb)], [6.8 l/100 km (city)], [4.5 l/100 km (country)]]               257
[[3.8 l/100 km (comb)], [4.3 l/100 km (city)], [3.5 l/100 km (country)]]               253
                                                                                      ... 
[[3.6 l/100 km (comb)], [], [4.4 l/100 km (country)]]                                    1
[\n, 4.8 l/100 km (comb), \n, 5.6 l/100 km (city), \n, 4.3 l/100 km (country), \n]       1
[[7.6 l/100 km (comb)], [], []]                                                          1
[[5.6 l/100 km (comb)], [7.6 l/100 km (city)], [4.4 l/100 km (country)]]                 1
[\n, 4.7 l/100 km (comb), \n, \n, \n]                                                    1

In [713]:
count = 0 
for i in scout['Consumption']:
    if type(i) != list:
        count +=1
count

1906

In [714]:
consumtion_str = [','.join(map(str, l)) if isinstance(l,list) else l for l in scout['Consumption']]

In [715]:
count = 0 
for i in consumtion_str:
    if type(i) != list:
        count +=1
count

15919

In [716]:
consumtion_str_serie = pd.Series(consumtion_str)

In [717]:
type(consumtion_str_serie[0])

str

In [757]:
consumtion_str_serie.value_counts(dropna = False).sample(55)

['5.3 l/100 km (comb)'],['6.7 l/100 km (city)'],['4.4 l/100 km (country)']     1
['4.3 l/100 km (comb)'],['5 l/100 km (city)'],['3.8 l/100 km (country)']       3
['4.2 l/100 km (comb)'],['6.5 l/100 km (city)'],['4.4 l/100 km (country)']     7
['6 l/100 km (comb)'],['7.8 l/100 km (city)'],['4.9 l/100 km (country)']      86
['6.6 l/100 km (comb)'],['7.4 l/100 km (city)'],['5.4 l/100 km (country)']     1
['5 l/100 km (comb)'],['5.5 l/100 km (city)'],['3.9 l/100 km (country)']      17
['32 l/100 km (comb)'],[],[]                                                   1
['7.6 l/100 km (comb)'],['9.9 l/100 km (city)'],['6.4 l/100 km (country)']    12
\n,5.7 l/100 km (comb),\n,7.2 l/100 km (city),\n,4.8 l/100 km (country),\n     6
\n,4 l/100 km (comb),\n,3.8 l/100 km (city),\n,4.4 l/100 km (country),\n       1
['7 l/100 km (comb)'],['9.6 l/100 km (city)'],['5.5 l/100 km (country)']       2
['5.6 l/100 km (comb)'],['7.3 l/100 km (city)'],['4.6 l/100 km (country)']    36
\n,5.7 l/100 km (comb),\n,7.

In [776]:
consumption_comb = consumtion_str_serie.str.extract('(\d+\.*\d*).*comb')

In [777]:
consumption_city = consumtion_str_serie.str.extract('(\d+\.*\d*).*city')

In [778]:
consumption_country = consumtion_str_serie.str.extract('(\d+\.*\d*).*country')

In [779]:
consumtion_str_serie[0]

"['3.8 l/100 km (comb)'],['4.3 l/100 km (city)'],['3.5 l/100 km (country)']"

In [780]:
consumption_comb.isnull().sum()

0    2033
dtype: int64

In [781]:
consumption_city.isnull().sum()

0    2436
dtype: int64

In [782]:
consumption_country.isnull().sum()

0    2376
dtype: int64

In [783]:
scout['consumption_comb'] = consumption_comb

In [784]:
scout['consumption_city'] = consumption_city

In [786]:
scout['consumption_country'] = consumption_country

In [787]:
scout.drop(columns = 'Consumption', inplace = True)

## scout['Type']

In [803]:
scout['Type'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[, Used, , Diesel (Particulate Filter)]                                                                                  3475
[, Used, , Diesel]                                                                                                       2516
[, Used, , Gasoline]                                                                                                     2367
[, Used, , Super 95]                                                                                                     1818
[, Pre-registered, , Super 95]                                                                                            500
                                                                                                                         ... 
[, Used, , Regular/Benzine E10 91 / Super E10 95 / Super Plus E10 98 / Super Plus 98 / Super 95 / Regular/Benzine 91]       1
[, Used, , Regular/Benzine E10 91 / Regular/Benzine 91 / Super 95 / Super Plus 98 / Super E10 95 / Super Plus E10 98] 

In [804]:
scout['Type'].str[1].value_counts(dropna = False)

Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: Type, dtype: int64

In [805]:
scout['Type'] = scout['Type'].str[1]

## scout['null']

In [809]:
scout['null'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]    15919
Name: null, dtype: int64

In [811]:
scout.drop(columns = ['null'], inplace = True)

## scout['Offer Number']

In [815]:
scout['Offer Number'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                       3175
[\nXJ38068\n]               27
[\nHM53619\n]               27
[\nLT67679\n]               27
[\nJV03654\n]               27
                          ... 
[\nE84_474_6130_158\n]       1
[\n101\n]                    1
[\n418394\n]                 1
[\nrv290009\n]               1
[\nG02394\n]                 1
Name: Offer Number, Length: 11441, dtype: int64

In [822]:
scout['Offer Number'].str[0].str.strip()

0             LR-062483
1                   NaN
2              AM-95365
3                   NaN
4                 C1626
              ...      
15914          10988301
15915       507370_3223
15916          Espace16
15917           2691331
15918    Re_30000008029
Name: Offer Number, Length: 15919, dtype: object

Since the content is not intuitive we will drop the 'Offer Number' column.

In [823]:
scout.drop(columns = ['Offer Number'], inplace = True)

## scout['First Registration']

In [825]:
scout['First Registration'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, 2018, \n]    4522
[\n, 2016, \n]    3674
[\n, 2017, \n]    3273
[\n, 2019, \n]    2853
NaN               1597
Name: First Registration, dtype: int64

In [827]:
scout['First Registration'].str[1].value_counts(dropna = False)

2018    4522
2016    3674
2017    3273
2019    2853
NaN     1597
Name: First Registration, dtype: int64

In [828]:
scout['first_registration'] = scout['First Registration'].str[1]

In [829]:
scout.drop(columns = ['First Registration'], inplace = True)

## scout['Body Color']

In [831]:
scout['Body Color'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Black, \n]     3745
[\n, Grey, \n]      3505
[\n, White, \n]     3406
[\n, Silver, \n]    1647
[\n, Blue, \n]      1431
[\n, Red, \n]        957
NaN                  597
[\n, Brown, \n]      289
[\n, Green, \n]      154
[\n, Beige, \n]      108
[\n, Yellow, \n]      51
[\n, Violet, \n]      18
[\n, Bronze, \n]       6
[\n, Orange, \n]       3
[\n, Gold, \n]         2
Name: Body Color, dtype: int64

In [832]:
scout['Body Color'].str[1].value_counts(dropna = False)

Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
NaN        597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: Body Color, dtype: int64

In [833]:
scout['body_color'] = scout['Body Color'].str[1]

In [834]:
scout.drop(columns = ['Body Color'], inplace = True)

## scout['Body']

In [836]:
scout['Body'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\n, Sedans, \n]           7903
[\n, Station wagon, \n]    3553
[\n, Compact, \n]          3153
[\n, Van, \n]               783
[\n, Other, \n]             290
[\n, Transporter, \n]        88
NaN                          60
[\n, Off-Road, \n]           56
[\n, Coupe, \n]              25
[\n, Convertible, \n]         8
Name: Body, dtype: int64

In [837]:
scout['Body'].str[1].value_counts(dropna = False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: Body, dtype: int64

In [838]:
scout['Body'] = scout['Body'].str[1]

## scout['CO2 Emission']

In [840]:
scout['CO2 Emission'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                            1808
[\n120 g CO2/km (comb)\n]       740
[[], [], []]                    628
[\n99 g CO2/km (comb)\n]        545
[\n97 g CO2/km (comb)\n]        537
                               ... 
[\n193 g CO2/km (comb)\n]         1
[\n171 g CO2/km (comb)\n]         1
[\n990 g CO2/km (comb)\n]         1
[\n1,060 g CO2/km (comb)\n]       1
[\n183 g CO2/km (comb)\n]         1
Name: CO2 Emission, Length: 124, dtype: int64

In [846]:
scout['CO2 Emission'].str[0].str.strip().value_counts(dropna = False)

NaN                      2436
120 g CO2/km (comb)       740
99 g CO2/km (comb)        545
97 g CO2/km (comb)        537
104 g CO2/km (comb)       501
                         ... 
165 g CO2/km (comb)         1
181 g CO2/km (comb)         1
45 g CO2/km (comb)          1
1,060 g CO2/km (comb)       1
990 g CO2/km (comb)         1
Name: CO2 Emission, Length: 123, dtype: int64

In [857]:
scout['CO2 Emission'].str[0].str.strip().str.extract('(\d+.{0,1}\d*)')[0].value_counts(dropna = False)

NaN       2436
120        740
99         545
97         537
104        501
          ... 
45           1
190          1
14           1
177          1
14,457       1
Name: 0, Length: 123, dtype: int64

In [859]:
scout['CO2_Emission'] = scout['CO2 Emission'].str[0].str.strip().str.extract('(\d+.{0,1}\d*)')[0].str.replace(',','')

In [860]:
scout['CO2_Emission'].unique()

array(['99 ', '129 ', '109 ', '92 ', '98 ', '97 ', nan, '105 ', '112 ',
       '103 ', '102 ', '95 ', '104 ', '91 ', '94 ', '117 ', '123 ',
       '106 ', '108 ', '121 ', '107 ', '101 ', '113 ', '137 ', '100 ',
       '116 ', '114 ', '118 ', '331 ', '115 ', '119 ', '90 ', '136 ',
       '134 ', '110 ', '111 ', '120 ', '89 ', '142 ', '126 ', '122 ',
       '128 ', '127 ', '138 ', '130 ', '125 ', '85 ', '124 ', '152 ',
       '88 ', '189 ', '194 ', '149 ', '153 ', '188 ', '36 ', '1060',
       '96 ', '990 ', '146 ', '135 ', '158 ', '12087', '141 ', '172 ',
       '154 ', '150 ', '167 ', '174 ', '93 ', '133 ', '131 ', '145 ',
       '147 ', '156 ', '87 ', '5 ', '148 ', '139 ', '151 ', '144 ',
       '168 ', '160 ', '170 ', '80 ', '132 ', '155 ', '14 ', '159 ', '0 ',
       '143 ', '140 ', '82 ', '12324', '84 ', '165 ', '51 ', '157 ',
       '169 ', '166 ', '253 ', '164 ', '175 ', '190 ', '161 ', '239 ',
       '197 ', '184 ', '14457', '199 ', '13983', '187 ', '181 ', '186 ',
       '177 '

In [864]:
scout['CO2_Emission'] = scout['CO2_Emission'].str.replace(' ', '')

In [865]:
scout.CO2_Emission.median()

116.0

We have extreme otliers. Filling out null values with median seems to be logical.

In [866]:
scout.CO2_Emission.fillna(scout.CO2_Emission.median(), inplace = True)

In [867]:
scout.drop(columns = ['CO2 Emission'], inplace = True)

## scout['Emission Class']

In [870]:
scout['Emission Class'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[\nEuro 6\n]          10139
NaN                    3021
[\nEuro 6d-TEMP\n]     1845
[[], [], []]            607
[\nEuro 6c\n]           127
[\nEuro 5\n]             78
[\nEuro 6d\n]            62
[\nEuro 4\n]             40
Name: Emission Class, dtype: int64

In [871]:
scout['Emission Class'].str[0].str.strip().value_counts(dropna = False)

Euro 6          10139
NaN              3628
Euro 6d-TEMP     1845
Euro 6c           127
Euro 5             78
Euro 6d            62
Euro 4             40
Name: Emission Class, dtype: int64

In [872]:
scout['emission_class'] = scout['Emission Class'].str[0].str.strip()

In [874]:
scout['emission_class'].replace({'Euro 6d-TEMP':'Euro 6', 'Euro 6c':'Euro 6', 'Euro 6d':'Euro 6'}, inplace = True)

In [875]:
scout.emission_class.value_counts(dropna = False)

Euro 6    12173
NaN        3628
Euro 5       78
Euro 4       40
Name: emission_class, dtype: int64

In [876]:
scout.drop(columns = ['Emission Class'], inplace = True)

## scout['\nComfort & Convenience\n']

In [879]:
scout['\nComfort & Convenience\n'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                                                                                                                                                                                                                                                                                                     920
[Air conditioning, Electrical side mirrors, Hill Holder, Power windows]                                                                                                                                                                                                                                                                                                 216
[Air conditioning, Electrical side mirrors, Power windows]                                                                                                                                                                                                                      

In [889]:
def unique_value_list(df, column):
    '''
    Iterates over a column to obtain a unique values list.
    '''
    unique_set = set()
    for row in df[column]:
        unique_set.update(row)
    return list(unique_set) 


In [885]:
def null_temporary_lst(df, column):
    '''
    Iterates over a clolumn to convert all Null values to empty list value in order to let further iterative operations.
    '''
    null_rows = df[column].isnull()
    df.loc[null_rows, column] = df.loc[null_rows, column].apply(lambda x : [])

In [886]:
null_temporary_lst(scout, '\nComfort & Convenience\n')

In [890]:
unique_value_list(scout, '\nComfort & Convenience\n')

['Electric tailgate',
 'Hill Holder',
 'Start-stop system',
 'Split rear seats',
 'Keyless central door lock',
 'Park Distance Control',
 'Electrically heated windshield',
 'Seat ventilation',
 'Panorama roof',
 'Auxiliary heating',
 'Parking assist system camera',
 'Leather seats',
 'Seat heating',
 'Massage seats',
 'Parking assist system self-steering',
 'Navigation system',
 'Heated steering wheel',
 'Tinted windows',
 'Windshield',
 'Sunroof',
 'Power windows',
 'Heads-up display',
 'Armrest',
 'Lumbar support',
 'Cruise control',
 'Parking assist system sensors front',
 'Air suspension',
 'Parking assist system sensors rear',
 'Leather steering wheel',
 'Air conditioning',
 'Electrical side mirrors',
 'Wind deflector',
 'Light sensor',
 'Electric Starter',
 'Electrically adjustable seats',
 'Automatic climate control',
 'Rain sensor',
 'Multi-function steering wheel']

In [891]:
scout['\nComfort & Convenience\n'].isnull().sum()

0

In [893]:
comfort_str_lst = [','.join(i).strip() if isinstance(i, list) else i for i in scout['\nComfort & Convenience\n']]

In [894]:
scout['comfort_convenience'] = pd.Series(comfort_str_lst)

In [895]:
scout.comfort_convenience.value_counts(dropna = False)

                                                                                                                                                                                                                                                                                                                                                             920
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                           216
Air conditioning,Electrical side mirrors,Power windows                                                                                                                                                                                                                                                

In [898]:
nulls_back = [np.nan if scout.comfort_convenience.iloc[i] == '' else scout.comfort_convenience.iloc[i] for i in range(len(scout.comfort_convenience))]

In [899]:
scout.comfort_convenience = pd.Series(nulls_back)

In [900]:
scout.comfort_convenience.value_counts(dropna = False)

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                             920
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                                                                                                              216
Air conditioning,Electrical side mirrors,Power windows                                                                          

In [919]:
def fillna_group_by(df, group, column, fill_methode):
    '''
    Fills null values of a column regarding its designated group and fill methode.
    '''
    for grp in list(df[group].unique()) :
        grp_index = list(df[df[group] == grp][column].index)
        if fill_methode == 'mode':
            df[column].iloc[grp_index] = df[column].iloc[grp_index].fillna(df[df[group] == grp][column].mode()[0])
        if fill_methode == 'mean':
            df[column].iloc[grp_index] = df[column].iloc[grp_index].fillna(df[df[group] == grp][column].mean()[0])

In [920]:
fillna_group_by(scout, 'make_model', 'comfort_convenience', 'mode')

In [921]:
scout.comfort_convenience.isnull().sum()

0

In [922]:
scout.comfort_convenience.str.get_dummies(sep = ',')

Unnamed: 0,Air conditioning,Air suspension,Armrest,Automatic climate control,Auxiliary heating,Cruise control,Electric Starter,Electric tailgate,Electrical side mirrors,Electrically adjustable seats,Electrically heated windshield,Heads-up display,Heated steering wheel,Hill Holder,Keyless central door lock,Leather seats,Leather steering wheel,Light sensor,Lumbar support,Massage seats,Multi-function steering wheel,Navigation system,Panorama roof,Park Distance Control,Parking assist system camera,Parking assist system self-steering,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Seat ventilation,Split rear seats,Start-stop system,Sunroof,Tinted windows,Wind deflector,Windshield
0,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0
2,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0
3,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0
4,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,1,0,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,0,1,0,0
15915,1,0,0,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0
15916,1,0,1,1,0,1,0,1,1,0,1,1,1,0,0,0,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,0,0,0
15917,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [924]:
scout = scout.join(scout.comfort_convenience.str.get_dummies(sep = ',').add_prefix('conf_conv_'))

In [925]:
scout.drop(columns = ['\nComfort & Convenience\n'], inplace = True)

In [926]:
scout.head(1)

Unnamed: 0,make_model,body_type,price,vat,km,registration,hp,Type,Make,Model,Upholstery,Body,Displacement,Cylinders,\nEntertainment & Media\n,\nExtras\n,\nSafety & Security\n,description,Gears,Paint_Type,Previous_Owners,drive_chain,country,emission_label,inspection_new,next_inspection,body_color,door_number,seat_number,gear_type,fuel_new,consumption_comb,consumption_city,consumption_country,first_registration,CO2_Emission,emission_class,comfort_convenience,conf_conv_Air conditioning,conf_conv_Air suspension,conf_conv_Armrest,conf_conv_Automatic climate control,conf_conv_Auxiliary heating,conf_conv_Cruise control,conf_conv_Electric Starter,conf_conv_Electric tailgate,conf_conv_Electrical side mirrors,conf_conv_Electrically adjustable seats,conf_conv_Electrically heated windshield,conf_conv_Heads-up display,conf_conv_Heated steering wheel,conf_conv_Hill Holder,conf_conv_Keyless central door lock,conf_conv_Leather seats,conf_conv_Leather steering wheel,conf_conv_Light sensor,conf_conv_Lumbar support,conf_conv_Massage seats,conf_conv_Multi-function steering wheel,conf_conv_Navigation system,conf_conv_Panorama roof,conf_conv_Park Distance Control,conf_conv_Parking assist system camera,conf_conv_Parking assist system self-steering,conf_conv_Parking assist system sensors front,conf_conv_Parking assist system sensors rear,conf_conv_Power windows,conf_conv_Rain sensor,conf_conv_Seat heating,conf_conv_Seat ventilation,conf_conv_Split rear seats,conf_conv_Start-stop system,conf_conv_Sunroof,conf_conv_Tinted windows,conf_conv_Wind deflector,conf_conv_Windshield
0,Audi A1,Sedans,15770,VAT deductible,"56,013 km",01/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3,"[Bluetooth, Hands-free equipment, On-board computer, Radio]","[Alloy wheels, Catalytic Converter, Voice Control]","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,Metallic,2,front,,,1,2021-06-01,Black,5,5,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,"Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Hill Holder,Leather steering wheel,Light sensor,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system",1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0


In [927]:
scout.drop(columns = ['comfort_convenience'], inplace = True)

## scout['\nEntertainment & Media\n']

In [928]:
scout['\nEntertainment & Media\n'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                      1374
[Bluetooth, Hands-free equipment, On-board computer, Radio, USB]                         1282
[Bluetooth, Hands-free equipment, MP3, On-board computer, Radio, USB]                     982
[Bluetooth, CD player, Hands-free equipment, MP3, On-board computer, Radio, USB]          783
[On-board computer, Radio]                                                                487
                                                                                         ... 
[Bluetooth, MP3, On-board computer, Sound system, USB]                                      1
[CD player, Hands-free equipment, On-board computer, Radio, Sound system, Television]       1
[Bluetooth, Digital radio, On-board computer, Sound system, USB]                            1
[CD player, Hands-free equipment, MP3, Radio, USB]                                          1
[CD player, Digital radio, Radio]                           

In [929]:
null_temporary_lst(scout, '\nEntertainment & Media\n')

In [930]:
unique_value_list(scout, '\nEntertainment & Media\n')

['Sound system',
 'MP3',
 'Bluetooth',
 'Radio',
 'CD player',
 'Digital radio',
 'USB',
 'On-board computer',
 'Hands-free equipment',
 'Television']

In [931]:
scout['\nEntertainment & Media\n'].isnull().sum()

0

In [932]:
entert_str_lst = [','.join(i).strip() if isinstance(i, list) else i for i in scout['\nEntertainment & Media\n']]

In [933]:
scout['entertainment_media'] = pd.Series(entert_str_lst)

In [934]:
scout.entertainment_media.value_counts(dropna = False)

                                                                            1374
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                  1282
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB               982
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB     783
On-board computer,Radio                                                      487
                                                                            ... 
CD player,Digital radio,Hands-free equipment,MP3,On-board computer,Radio       1
CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system        1
Bluetooth,Sound system,USB                                                     1
Digital radio,Sound system,USB                                                 1
Bluetooth,CD player,Digital radio,On-board computer,Radio                      1
Name: entertainment_media, Length: 347, dtype: int64

In [935]:
nulls_back = [np.nan if scout.entertainment_media.iloc[i] == '' else scout.entertainment_media.iloc[i] for i in range(len(scout.entertainment_media))]

In [936]:
scout.entertainment_media = pd.Series(nulls_back)

In [937]:
scout.entertainment_media.value_counts(dropna = False)

NaN                                                                         1374
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                  1282
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB               982
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB     783
On-board computer,Radio                                                      487
                                                                            ... 
CD player,Digital radio,Hands-free equipment,MP3,On-board computer,Radio       1
CD player,Hands-free equipment,MP3,On-board computer,Radio,Sound system        1
Bluetooth,Sound system,USB                                                     1
Digital radio,Sound system,USB                                                 1
Bluetooth,CD player,Digital radio,On-board computer,Radio                      1
Name: entertainment_media, Length: 347, dtype: int64

In [938]:
fillna_group_by(scout, 'make_model', 'entertainment_media', 'mode')

In [939]:
scout.entertainment_media.isnull().sum()

0

In [940]:
scout.entertainment_media.str.get_dummies(sep = ',')

Unnamed: 0,Bluetooth,CD player,Digital radio,Hands-free equipment,MP3,On-board computer,Radio,Sound system,Television,USB
0,1,0,0,1,0,1,1,0,0,0
1,1,0,0,1,0,1,1,1,0,0
2,0,0,0,0,1,1,0,0,0,0
3,1,1,0,1,1,1,1,1,0,1
4,1,1,0,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
15914,1,0,1,1,0,1,1,0,0,1
15915,1,0,1,1,0,0,1,0,0,1
15916,1,0,0,1,0,1,1,0,0,0
15917,1,0,1,0,0,0,1,0,0,1


In [941]:
scout = scout.join(scout.entertainment_media.str.get_dummies(sep = ',').add_prefix('ent_media_'))

In [942]:
scout.drop(columns = ['\nEntertainment & Media\n', 'entertainment_media'], inplace = True)

In [943]:
scout.head(1)

Unnamed: 0,make_model,body_type,price,vat,km,registration,hp,Type,Make,Model,Upholstery,Body,Displacement,Cylinders,\nExtras\n,\nSafety & Security\n,description,Gears,Paint_Type,Previous_Owners,drive_chain,country,emission_label,inspection_new,next_inspection,body_color,door_number,seat_number,gear_type,fuel_new,consumption_comb,consumption_city,consumption_country,first_registration,CO2_Emission,emission_class,conf_conv_Air conditioning,conf_conv_Air suspension,conf_conv_Armrest,conf_conv_Automatic climate control,conf_conv_Auxiliary heating,conf_conv_Cruise control,conf_conv_Electric Starter,conf_conv_Electric tailgate,conf_conv_Electrical side mirrors,conf_conv_Electrically adjustable seats,conf_conv_Electrically heated windshield,conf_conv_Heads-up display,conf_conv_Heated steering wheel,conf_conv_Hill Holder,conf_conv_Keyless central door lock,conf_conv_Leather seats,conf_conv_Leather steering wheel,conf_conv_Light sensor,conf_conv_Lumbar support,conf_conv_Massage seats,conf_conv_Multi-function steering wheel,conf_conv_Navigation system,conf_conv_Panorama roof,conf_conv_Park Distance Control,conf_conv_Parking assist system camera,conf_conv_Parking assist system self-steering,conf_conv_Parking assist system sensors front,conf_conv_Parking assist system sensors rear,conf_conv_Power windows,conf_conv_Rain sensor,conf_conv_Seat heating,conf_conv_Seat ventilation,conf_conv_Split rear seats,conf_conv_Start-stop system,conf_conv_Sunroof,conf_conv_Tinted windows,conf_conv_Wind deflector,conf_conv_Windshield,ent_media_Bluetooth,ent_media_CD player,ent_media_Digital radio,ent_media_Hands-free equipment,ent_media_MP3,ent_media_On-board computer,ent_media_Radio,ent_media_Sound system,ent_media_Television,ent_media_USB
0,Audi A1,Sedans,15770,VAT deductible,"56,013 km",01/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3,"[Alloy wheels, Catalytic Converter, Voice Control]","[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,Metallic,2,front,,,1,2021-06-01,Black,5,5,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0


## scout['\nExtras\n']

In [944]:
scout['\nExtras\n'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[Alloy wheels]                                                                                         3245
NaN                                                                                                    2962
[Alloy wheels, Touch screen]                                                                            697
[Alloy wheels, Voice Control]                                                                           577
[Alloy wheels, Touch screen, Voice Control]                                                             541
                                                                                                       ... 
[Alloy wheels, Sport suspension, Touch screen, Trailer hitch, Voice Control]                              1
[Alloy wheels, Catalytic Converter, Handicapped enabled, Touch screen, Voice Control, Winter tyres]       1
[Ski bag, Sport suspension]                                                                               1
[Alloy wheels, Sport package

In [946]:
null_temporary_lst(scout, '\nExtras\n')

In [947]:
unique_value_list(scout, '\nExtras\n')

['Winter tyres',
 'Alloy wheels',
 'Voice Control',
 'Roof rack',
 'Ski bag',
 'Shift paddles',
 'Handicapped enabled',
 'Trailer hitch',
 'Right hand drive',
 'Catalytic Converter',
 'Cab or rented Car',
 'Touch screen',
 'Sport suspension',
 'Sport package',
 'Tuned car',
 'Sliding door',
 'Sport seats']

In [948]:
scout['\nExtras\n'].isnull().sum()

0

In [949]:
comfort_str_lst = [','.join(i).strip() if isinstance(i, list) else i for i in scout['\nExtras\n']]

In [950]:
scout['Extras'] = pd.Series(comfort_str_lst)

In [951]:
scout.Extras.value_counts(dropna = False)

Alloy wheels                                                                                        3245
                                                                                                    2962
Alloy wheels,Touch screen                                                                            697
Alloy wheels,Voice Control                                                                           577
Alloy wheels,Touch screen,Voice Control                                                              541
                                                                                                    ... 
Alloy wheels,Catalytic Converter,Handicapped enabled,Shift paddles,Touch screen                        1
Alloy wheels,Shift paddles,Sport package,Sport suspension,Winter tyres                                 1
Alloy wheels,Shift paddles,Touch screen,Trailer hitch                                                  1
Alloy wheels,Shift paddles,Sport seats,Sport suspension

In [952]:
nulls_back = [np.nan if scout.Extras.iloc[i] == '' else scout.Extras.iloc[i] for i in range(len(scout.Extras))]

In [953]:
scout.Extras = pd.Series(nulls_back)

In [954]:
scout.Extras.value_counts(dropna = False)

Alloy wheels                                                                                        3245
NaN                                                                                                 2962
Alloy wheels,Touch screen                                                                            697
Alloy wheels,Voice Control                                                                           577
Alloy wheels,Touch screen,Voice Control                                                              541
                                                                                                    ... 
Alloy wheels,Catalytic Converter,Handicapped enabled,Shift paddles,Touch screen                        1
Alloy wheels,Shift paddles,Sport package,Sport suspension,Winter tyres                                 1
Alloy wheels,Shift paddles,Touch screen,Trailer hitch                                                  1
Alloy wheels,Shift paddles,Sport seats,Sport suspension

In [955]:
fillna_group_by(scout, 'make_model', 'Extras', 'mode')

In [956]:
scout.Extras.isnull().sum()

0

In [957]:
scout.Extras.str.get_dummies(sep = ',')

Unnamed: 0,Alloy wheels,Cab or rented Car,Catalytic Converter,Handicapped enabled,Right hand drive,Roof rack,Shift paddles,Ski bag,Sliding door,Sport package,Sport seats,Sport suspension,Touch screen,Trailer hitch,Tuned car,Voice Control,Winter tyres
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
15915,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
15916,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15917,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [958]:
scout = scout.join(scout.Extras.str.get_dummies(sep = ',').add_prefix('extra_'))

In [959]:
scout.drop(columns = ['\nExtras\n', 'Extras'], inplace = True)

In [960]:
scout.head(1)

Unnamed: 0,make_model,body_type,price,vat,km,registration,hp,Type,Make,Model,Upholstery,Body,Displacement,Cylinders,\nSafety & Security\n,description,Gears,Paint_Type,Previous_Owners,drive_chain,country,emission_label,inspection_new,next_inspection,body_color,door_number,seat_number,gear_type,fuel_new,consumption_comb,consumption_city,consumption_country,first_registration,CO2_Emission,emission_class,conf_conv_Air conditioning,conf_conv_Air suspension,conf_conv_Armrest,conf_conv_Automatic climate control,conf_conv_Auxiliary heating,conf_conv_Cruise control,conf_conv_Electric Starter,conf_conv_Electric tailgate,conf_conv_Electrical side mirrors,conf_conv_Electrically adjustable seats,conf_conv_Electrically heated windshield,conf_conv_Heads-up display,conf_conv_Heated steering wheel,conf_conv_Hill Holder,conf_conv_Keyless central door lock,conf_conv_Leather seats,conf_conv_Leather steering wheel,conf_conv_Light sensor,conf_conv_Lumbar support,conf_conv_Massage seats,conf_conv_Multi-function steering wheel,conf_conv_Navigation system,conf_conv_Panorama roof,conf_conv_Park Distance Control,conf_conv_Parking assist system camera,conf_conv_Parking assist system self-steering,conf_conv_Parking assist system sensors front,conf_conv_Parking assist system sensors rear,conf_conv_Power windows,conf_conv_Rain sensor,conf_conv_Seat heating,conf_conv_Seat ventilation,conf_conv_Split rear seats,conf_conv_Start-stop system,conf_conv_Sunroof,conf_conv_Tinted windows,conf_conv_Wind deflector,conf_conv_Windshield,ent_media_Bluetooth,ent_media_CD player,ent_media_Digital radio,ent_media_Hands-free equipment,ent_media_MP3,ent_media_On-board computer,ent_media_Radio,ent_media_Sound system,ent_media_Television,ent_media_USB,extra_Alloy wheels,extra_Cab or rented Car,extra_Catalytic Converter,extra_Handicapped enabled,extra_Right hand drive,extra_Roof rack,extra_Shift paddles,extra_Ski bag,extra_Sliding door,extra_Sport package,extra_Sport seats,extra_Sport suspension,extra_Touch screen,extra_Trailer hitch,extra_Tuned car,extra_Voice Control,extra_Winter tyres
0,Audi A1,Sedans,15770,VAT deductible,"56,013 km",01/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3,"[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control, Xenon headlights]","[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,Metallic,2,front,,,1,2021-06-01,Black,5,5,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0


## scout['\nSafety & Security\n']

In [961]:
scout['\nSafety & Security\n'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


NaN                                                                                                                                                                                                                                                                                                                                                                                        982
[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Fog lights, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]                                                                                                                                                 538
[ABS, Central door lock, Daytime running lights, Driver-side airbag, Electronic stability control, Immobilizer, Isofix, Passenger-side airbag, Power steering, Side airbag, Tire pressure monitoring system, Traction control]            

In [962]:
safe_str_lst = [','.join(i).strip() if isinstance(i, list) else i for i in scout['\nSafety & Security\n']]

In [963]:
scout['safety_security'] = pd.Series(safe_str_lst)

In [964]:
scout.safety_security.value_counts(dropna = False)

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        982
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                                                                                                                                               538
ABS,Centra

In [965]:
nulls_back = [np.nan if scout.safety_security.iloc[i] == '' else scout.safety_security.iloc[i] for i in range(len(scout.safety_security))]

In [966]:
scout.safety_security = pd.Series(nulls_back)

In [967]:
scout.safety_security.value_counts(dropna = False)

NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        982
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                                                                                                                                                                                                               538
ABS,Centra

In [968]:
fillna_group_by(scout, 'make_model', 'safety_security', 'mode')

In [969]:
scout.safety_security.isnull().sum()

0

In [970]:
scout.safety_security.str.get_dummies(sep = ',')

Unnamed: 0,ABS,Adaptive Cruise Control,Adaptive headlights,Alarm system,Blind spot monitor,Central door lock,Central door lock with remote control,Daytime running lights,Driver drowsiness detection,Driver-side airbag,Electronic stability control,Emergency brake assistant,Emergency system,Fog lights,Head airbag,Immobilizer,Isofix,LED Daytime Running Lights,LED Headlights,Lane departure warning system,Night view assist,Passenger-side airbag,Power steering,Rear airbag,Side airbag,Tire pressure monitoring system,Traction control,Traffic sign recognition,Xenon headlights
0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
1,1,0,0,0,0,1,1,1,0,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,0,1
2,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0
3,1,0,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,0,0,0,0,1,1,0,1,1,0,0,0
4,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,1,0,0,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0
15915,1,1,0,0,1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,0,1,1,0,1,1,1,1,0
15916,1,1,0,0,1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,0,1,1,0,1,0,1,1,0
15917,1,0,0,0,1,0,0,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0


In [971]:
scout = scout.join(scout.safety_security.str.get_dummies(sep = ',').add_prefix('safety_'))

In [972]:
scout.drop(columns = ['\nSafety & Security\n', 'safety_security'], inplace = True)

In [973]:
scout.head(1)

Unnamed: 0,make_model,body_type,price,vat,km,registration,hp,Type,Make,Model,Upholstery,Body,Displacement,Cylinders,description,Gears,Paint_Type,Previous_Owners,drive_chain,country,emission_label,inspection_new,next_inspection,body_color,door_number,seat_number,gear_type,fuel_new,consumption_comb,consumption_city,consumption_country,first_registration,CO2_Emission,emission_class,conf_conv_Air conditioning,conf_conv_Air suspension,conf_conv_Armrest,conf_conv_Automatic climate control,conf_conv_Auxiliary heating,conf_conv_Cruise control,conf_conv_Electric Starter,conf_conv_Electric tailgate,conf_conv_Electrical side mirrors,conf_conv_Electrically adjustable seats,conf_conv_Electrically heated windshield,conf_conv_Heads-up display,conf_conv_Heated steering wheel,conf_conv_Hill Holder,conf_conv_Keyless central door lock,conf_conv_Leather seats,conf_conv_Leather steering wheel,conf_conv_Light sensor,conf_conv_Lumbar support,conf_conv_Massage seats,conf_conv_Multi-function steering wheel,conf_conv_Navigation system,conf_conv_Panorama roof,conf_conv_Park Distance Control,conf_conv_Parking assist system camera,conf_conv_Parking assist system self-steering,conf_conv_Parking assist system sensors front,conf_conv_Parking assist system sensors rear,conf_conv_Power windows,conf_conv_Rain sensor,conf_conv_Seat heating,conf_conv_Seat ventilation,conf_conv_Split rear seats,conf_conv_Start-stop system,conf_conv_Sunroof,conf_conv_Tinted windows,conf_conv_Wind deflector,conf_conv_Windshield,ent_media_Bluetooth,ent_media_CD player,ent_media_Digital radio,ent_media_Hands-free equipment,ent_media_MP3,ent_media_On-board computer,ent_media_Radio,ent_media_Sound system,ent_media_Television,ent_media_USB,extra_Alloy wheels,extra_Cab or rented Car,extra_Catalytic Converter,extra_Handicapped enabled,extra_Right hand drive,extra_Roof rack,extra_Shift paddles,extra_Ski bag,extra_Sliding door,extra_Sport package,extra_Sport seats,extra_Sport suspension,extra_Touch screen,extra_Trailer hitch,extra_Tuned car,extra_Voice Control,extra_Winter tyres,safety_ABS,safety_Adaptive Cruise Control,safety_Adaptive headlights,safety_Alarm system,safety_Blind spot monitor,safety_Central door lock,safety_Central door lock with remote control,safety_Daytime running lights,safety_Driver drowsiness detection,safety_Driver-side airbag,safety_Electronic stability control,safety_Emergency brake assistant,safety_Emergency system,safety_Fog lights,safety_Head airbag,safety_Immobilizer,safety_Isofix,safety_LED Daytime Running Lights,safety_LED Headlights,safety_Lane departure warning system,safety_Night view assist,safety_Passenger-side airbag,safety_Power steering,safety_Rear airbag,safety_Side airbag,safety_Tire pressure monitoring system,safety_Traction control,safety_Traffic sign recognition,safety_Xenon headlights
0,Audi A1,Sedans,15770,VAT deductible,"56,013 km",01/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3,"[\n, Sicherheit:, , Deaktivierung für Beifahrer-Airbag, , ESC mit elektronischer Quersperre, , Tagfahrlicht, , Reifendruck-Kontrollanzeige, , Kopfairbag-System mit Seiten-Airbags vorn, , Sicherheitslenksäule, Assistenzsysteme:, , Berganfahrassistent, Komfort:, , Scheinwerferreinigung, , Xenon plus inklusive Scheinwerfer-Reinigungsanlage, , Scheinwerfer-Reinigungsanlage, , Einparkhilfe hinten, , Licht-/Regensensor, , Funkfernbedienung, , Elektrische Luftzusatzheizung, Interieur:...",,Metallic,2,front,,,1,2021-06-01,Black,5,5,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1


## scout['description']

In [975]:
scout['description'].value_counts(dropna = False)

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 1652, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

This column includes a great deal of unordered data. Various machine learning techniques may be considered to be used in thos column. On the other hand due to the project scope and goals we will drop 'description' column.

In [977]:
scout.drop(columns = ['description'], inplace = True)

In [978]:
scout.head()

Unnamed: 0,make_model,body_type,price,vat,km,registration,hp,Type,Make,Model,Upholstery,Body,Displacement,Cylinders,Gears,Paint_Type,Previous_Owners,drive_chain,country,emission_label,inspection_new,next_inspection,body_color,door_number,seat_number,gear_type,fuel_new,consumption_comb,consumption_city,consumption_country,first_registration,CO2_Emission,emission_class,conf_conv_Air conditioning,conf_conv_Air suspension,conf_conv_Armrest,conf_conv_Automatic climate control,conf_conv_Auxiliary heating,conf_conv_Cruise control,conf_conv_Electric Starter,conf_conv_Electric tailgate,conf_conv_Electrical side mirrors,conf_conv_Electrically adjustable seats,conf_conv_Electrically heated windshield,conf_conv_Heads-up display,conf_conv_Heated steering wheel,conf_conv_Hill Holder,conf_conv_Keyless central door lock,conf_conv_Leather seats,conf_conv_Leather steering wheel,conf_conv_Light sensor,conf_conv_Lumbar support,conf_conv_Massage seats,conf_conv_Multi-function steering wheel,conf_conv_Navigation system,conf_conv_Panorama roof,conf_conv_Park Distance Control,conf_conv_Parking assist system camera,conf_conv_Parking assist system self-steering,conf_conv_Parking assist system sensors front,conf_conv_Parking assist system sensors rear,conf_conv_Power windows,conf_conv_Rain sensor,conf_conv_Seat heating,conf_conv_Seat ventilation,conf_conv_Split rear seats,conf_conv_Start-stop system,conf_conv_Sunroof,conf_conv_Tinted windows,conf_conv_Wind deflector,conf_conv_Windshield,ent_media_Bluetooth,ent_media_CD player,ent_media_Digital radio,ent_media_Hands-free equipment,ent_media_MP3,ent_media_On-board computer,ent_media_Radio,ent_media_Sound system,ent_media_Television,ent_media_USB,extra_Alloy wheels,extra_Cab or rented Car,extra_Catalytic Converter,extra_Handicapped enabled,extra_Right hand drive,extra_Roof rack,extra_Shift paddles,extra_Ski bag,extra_Sliding door,extra_Sport package,extra_Sport seats,extra_Sport suspension,extra_Touch screen,extra_Trailer hitch,extra_Tuned car,extra_Voice Control,extra_Winter tyres,safety_ABS,safety_Adaptive Cruise Control,safety_Adaptive headlights,safety_Alarm system,safety_Blind spot monitor,safety_Central door lock,safety_Central door lock with remote control,safety_Daytime running lights,safety_Driver drowsiness detection,safety_Driver-side airbag,safety_Electronic stability control,safety_Emergency brake assistant,safety_Emergency system,safety_Fog lights,safety_Head airbag,safety_Immobilizer,safety_Isofix,safety_LED Daytime Running Lights,safety_LED Headlights,safety_Lane departure warning system,safety_Night view assist,safety_Passenger-side airbag,safety_Power steering,safety_Rear airbag,safety_Side airbag,safety_Tire pressure monitoring system,safety_Traction control,safety_Traffic sign recognition,safety_Xenon headlights
0,Audi A1,Sedans,15770,VAT deductible,"56,013 km",01/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3.0,,Metallic,2.0,front,,,1,2021-06-01,Black,5,5,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1
1,Audi A1,Sedans,14500,Price negotiable,"80,000 km",03/2017,141 kW,Used,Audi,A1,"Cloth, Grey",Sedans,1798,4.0,[\n7\n],,,front,,4 (Green),0,NaT,Red,3,4,Automatic,Benzine,5.6,5.6,5.6,2017,129,Euro 6,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,1,1,0,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,0,1
2,Audi A1,Sedans,14640,VAT deductible,"83,450 km",02/2016,85 kW,Used,Audi,A1,"Cloth, Black",Sedans,1598,,,Metallic,1.0,front,,4 (Green),0,NaT,Black,4,4,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0
3,Audi A1,Sedans,14500,,"73,000 km",08/2016,66 kW,Used,Audi,A1,,Sedans,1422,3.0,[\n6\n],Metallic,1.0,,,,0,NaT,Brown,3,4,Automatic,Diesel,3.8,3.8,3.8,2016,99,Euro 6,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,0,0,0,0,1,1,0,1,1,0,0,0
4,Audi A1,Sedans,16790,,"16,200 km",05/2016,66 kW,Used,Audi,A1,"Cloth, Black",Sedans,1422,3.0,,Metallic,1.0,front,Germany,,1,NaT,Black,5,5,Automatic,Diesel,4.1,4.1,4.1,2016,109,Euro 6,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,1,1,1,0,1


In [979]:
scout.shape

(15919, 127)

In [1021]:
scout.to_csv('scout_cleaned_for_step_2.csv')