In [1]:
# General imports that we will almost always need - it is good practice to import all libraries at the beginning of the notebook or script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
sns.set()

# data partition
from sklearn.model_selection import train_test_split

#filter methods
# spearman 
# chi-square
import scipy.stats as stats
from scipy.stats import chi2_contingency

#wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE


# embedded methods
from sklearn.linear_model import LassoCV

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Set the random seed for reproducibility.
# RSEED is the single seed value for the notebook; np.random.seed() applies it to NumPy's global random generator.
RSEED = 42
np.random.seed(RSEED)



In [2]:
# Path to the project folder.
# NOTE(review): this is an absolute path on one specific machine; the notebook will not run elsewhere
# until it is edited. Consider a path relative to the notebook or a configurable data directory.
base_path = r"C:/Users/maria/OneDrive - ISEG/Documents/MESTRADO/Machine Learning/Projeto"

# The file read in this cell is the TEST set, so the path gets a matching name.
test_path = os.path.join(base_path, "test.csv")
# Former (misleading) name for the same path, kept so that any cell still referring to it keeps working.
train_path = test_path

# Comma-separated file, first row is the header, '.' as decimal mark, '"' as quote character.
df_test = pd.read_csv(test_path, delimiter=',', header=0, decimal='.', quotechar='"')
df_test

Unnamed: 0,carID,Brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
0,89856,Hyundai,I30,2022.878006,Automatic,30700.000000,petrol,205.0,41.5,1.6,61.0,3.0,0.0
1,106581,VW,Tiguan,2017.000000,Semi-Auto,-48190.655673,Petrol,150.0,38.2,2.0,60.0,2.0,0.0
2,80886,BMW,2 Series,2016.000000,Automatic,36792.000000,Petrol,125.0,51.4,1.5,94.0,2.0,0.0
3,100174,Opel,Grandland X,2019.000000,Manual,5533.000000,Petrol,145.0,44.1,1.2,77.0,1.0,0.0
4,81376,BMW,1 Series,2019.000000,Semi-Auto,9058.000000,Diesel,150.0,51.4,2.0,45.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32562,105775,VW,Tiguan,2017.000000,Manual,27575.000000,Petrol,145.0,46.3,1.4,94.0,1.0,0.0
32563,81363,BMW,X2,2020.000000,Automatic,1980.000000,Petrol,145.0,34.0,2.0,39.0,3.0,0.0
32564,76833,Audi,Q5,2019.000000,Semi-Auto,8297.000000,Diesel,145.0,38.2,2.0,88.0,4.0,0.0
32565,91768,Mercedes,A Class,2019.000000,Manual,-50755.210230,Petrol,145.0,28.5,1.3,81.0,1.0,0.0


In [4]:
# Print the distinct values of every column to spot typos and inconsistent categories.
for column_name, column_values in df_test.items():
    print(f"{column_name}: {column_values.unique()}")

carID: [ 89856 106581  80886 ...  76833  91768  99627]
Brand: ['Hyundai' 'VW' 'BMW' 'Opel' 'Ford' 'Mercedes' 'Skoda' 'Toyot' 'Toyota'
 'Audi' nan 'For' 'Ope' 'toyota' 'vw' 'hyundai' 'MW' 'SKODA' 'ord' 'udi'
 'bmw' 'V' 'BM' 'HYUNDAI' 'OPEL' 'mercedes' 'audi' 'Mercede' 'pel' 'opel'
 'FORD' 'yundai' 'ford' 'Aud' 'oyota' 'MERCEDES' 'ercedes' 'AUDI' 'koda'
 'Hyunda' 'W' 'skoda' 'Skod' 'ercede' 'TOYOTA' 'ERCEDES' 'kod' 'ORD' 'v'
 'ud' 'M' 'FOR' 'for' 'MERCEDE' 'YUNDAI' 'PEL' 'ope' 'or' 'TOYOT' 'hyunda'
 'oyot' 'UDI' 'mw' 'pe' 'bm']
model: [' I30' ' Tiguan' ' 2 Series' ' Grandland X' '1 Series' ' Fiesta' ' X1'
 ' B Class' ' Focus' ' Superb' ' 5 Series' ' C Class' ' Up' ' Aygo' 'Golf'
 ' M CLAS' ' Land Cruiser' ' TT' ' Adam' ' Zafira' ' E Class' ' Golf'
 ' 3 Series' ' IX20' ' A4' ' Yaris' ' Passat' ' I10' ' Mokka X'
 ' EcoSport' ' 1 Series' ' 4 Series' ' A7' ' Corsa' ' Kuga' ' Grand C-MAX'
 ' Q2' ' M4' ' A Class' ' RAV4' ' Fabia' ' Insignia' ' A1' ' X6' ' Meriva'
 ' Caravelle' ' Octavia' ' Aur

In [7]:
# Misspelled, truncated and re-cased spellings observed for each brand.
list_bmw = ["BMW", "MW", "bmw", "BM", "mw","bm","M"]
list_audi = ["Audi", "udi", "AUDI", "audi", "Aud", "aud", "UDI", "ud", "AUD"]
list_mercedes = ["Mercedes", "mercedes", "Mercede", "MERCEDES", "ercedes", "mercede", "ERCEDES", "ercede", "MERCEDE"]
list_ford = ["Ford", "FOR", "For", "FORD", "ford", "for", "or", "ORD", "ord"]
list_toyota = ["Toyota", "Toyot", "TOYOTA", "oyota", "toyota", "OYOTA", "TOYOT", "toyot","oyot"]
list_opel = ["Opel", "Ope", "opel", "pel", "pe", "OPEL", "PEL", "OPE", "ope"]
list_skoda = ["Skoda", "koda", "skoda", "SKODA", "Skod", "kod", "SKOD", "KODA", "skod"]
list_hyundai = ["Hyundai", "yundai", "Hyunda", "hyundai", "HYUNDAI", "yunda", "HYUNDA", "YUNDA", "yunda", "hyunda","YUNDAI"]
list_vw = ["VW", "V", "v","vw", "w", "W"]

# Canonical brand name -> list of known spellings.
_BRAND_VARIANTS = {
    "BMW": list_bmw,
    "Audi": list_audi,
    "Mercedes": list_mercedes,
    "Ford": list_ford,
    "Toyota": list_toyota,
    "Opel": list_opel,
    "Skoda": list_skoda,
    "Hyundai": list_hyundai,
    "VW": list_vw,
}

# Reverse lookup keyed by the stripped, lower-cased spelling, so that casing and
# surrounding whitespace never decide whether a value is recognised.
# The lower-cased spellings of different brands do not overlap.
_BRAND_LOOKUP = {
    variant.strip().lower(): canonical
    for canonical, variants in _BRAND_VARIANTS.items()
    for variant in variants
}


def clean_brand(brand):
    """Return the canonical brand name for a raw ``Brand`` value.

    Parameters
    ----------
    brand : object
        Raw cell value: usually a string, but may be NaN/None for missing data.

    Returns
    -------
    object
        One of "BMW", "Audi", "Mercedes", "Ford", "Toyota", "Opel", "Skoda",
        "Hyundai" or "VW" when the value is a known spelling (compared
        ignoring case and leading/trailing whitespace). Any other value,
        including NaN/None and unknown brands, is returned unchanged.
    """
    # Missing values (float NaN, None) and other non-strings pass straight through.
    if not isinstance(brand, str):
        return brand
    return _BRAND_LOOKUP.get(brand.strip().lower(), brand)
   
# Normalise every brand spelling in the column; clean_brand returns values it does not recognise unchanged.
df_test['Brand'] = df_test['Brand'].apply(clean_brand)


In [8]:
# Check the result of the brand clean-up: list the distinct values left in the column.
df_test["Brand"].unique()

array(['Hyundai', 'VW', 'BMW', 'Opel', 'Ford', 'Mercedes', 'Skoda',
       'Toyota', 'Audi', nan], dtype=object)

In [12]:
# Misspelled variants seen for each fuel type, grouped by the label they should become.
diesel_types = ["DIESEL", "Diese", "diesel", "iesel", "IESEL", "DIESE", "iese", "diese", "IESE"]
petrol_types = ["petrol", "Petro", "etrol", "PETROL", "petro", "etro", "ETROL", "PETRO", "ETRO"]
hybrid_types = ["HYBRID", "hybrid", "Hybri", "ybrid", "HYBRI", "ybri", "YBRID", "hybri"]
other = ["ther", "Othe", "OTHER", "other"]

# Flatten the four lists into one variant -> label table so the column is fixed in a single pass.
fuel_lookup = {
    variant: label
    for label, variants in (
        ("Diesel", diesel_types),
        ("Petrol", petrol_types),
        ("Hybrid", hybrid_types),
        ("Other", other),
    )
    for variant in variants
}
# Values that are not listed (including real NaN) are kept as they are.
df_test["fuelType"] = df_test["fuelType"].apply(lambda raw: fuel_lookup.get(raw, raw))

# The literal text "nan" is a placeholder, not a fuel type: turn it into a real missing value.
is_fake_nan = df_test["fuelType"].isin(["nan"])
df_test.loc[is_fake_nan, "fuelType"] = np.nan

In [13]:
# Check the result of the fuel-type clean-up: list the distinct values left in the column.
df_test["fuelType"].unique()

array(['Petrol', 'Diesel', 'Hybrid', nan, 'Other', 'Electric'],
      dtype=object)

In [124]:
car_models = {
    "GOLF": ['Golf', 'Gol', ' GOLF', ' golf', ' golf s', ' Gol', ' Golf SV', ' golf sv', 'Golf', 'golf', ' GOLF', ' Golf S', ' gol', ' GOL', ' Golf'],
    "POLO": ['Polo', 'Pol', 'polo', ' POLO', ' polo', ' Pol', ' Polo', 'POLO', ' POL'],
    "PASSAT": ['Passat', ' PASSAT', ' Passa', ' passa', ' Passa', ' Passa', ' passat', ' Passat', ' PASSA'],
    "TIGUAN": ['Tiguan', ' Tiguan', ' TIGUAN', 'Tigua', ' tiguan', ' Tigua', ' TIGUA', ' tiguan', ' Tiguan', 'TIGUAN', 'tiguan'],
    "TIGUAN_ALLSPACE": [' tiguan allspace', ' Tiguan Allspac', 'Tiguan Allspace', ' TIGUAN ALLSPACE', ' Tiguan Allspace'],
    "TOUAREG": ['Touareg', ' TOUAREG', ' touareg', ' Touare', ' Touareg'],
    "T_ROC": ['T-Roc', ' T-Ro', ' t roc', ' T-Roc'],
    "T_CROSS": ['T-Cross', ' T-CROSS', ' t-cros', ' t cross', ' T-Cros', ' T-RO', ' T-ROC', ' t-roc', ' t-cross', 'T-Cross', ' T-Cross'],
    "ARTEON": ['Arteon', ' arteon', ' Arteo', ' ARTEON', ' Arteon'],
    "SHARAN": ['Sharan', ' sharan', ' SHARAN', ' Shara', ' Sharan', ' Sharan', ' SHARA'],
    "JETTA": ['Jetta', ' Jetta', ' JETTA', ' jetta'],
    "CC": ['CC', ' cc', ' CC'],
    "CARAVELLE": ['Caravelle', ' caravelle', ' Caravell', ' Caravelle'],
    "CALIFORNIA": ['California', ' California', ' Californi'],
    "CADDY": ['Caddy', ' Caddy'],
    "CADDY_MAXI": ['Caddy Maxi Life', ' Caddy Maxi Lif', ' Caddy Maxi', ' Caddy Maxi Life', ' Caddy Life'],
    "TOURAN": ['Touran', ' touran', ' TOURAN', ' Touran', 'Toura', ' TOURA', ' Toura'],
    "GOLF_SV": ['Golf SV', 'Golf S', ' golf sv', ' GOLF SV', 'Golf SV'],
    "BEETLE": ['Beetle', ' beetle', ' Beetl', ' Beetle'],
    "SCIROCCO": ['Scirocco', ' scirocco', ' Scirocc', ' SCIROCCO', ' Scirocco'],
    "UP": ['Up', ' up', ' UP', ' Up', ' U', 'U', 'UP'],
    "AMAROK": ['Amarok', ' Amarok', ' amarok', ' Amaro'],
    "EOS": ['Eos', ' Eos'],
    "FOX": ['Fox', ' Fox'], "FOCUS": ['Focus', 'FOCUS', ' focus', ' FOCUS', ' FOCU', ' Focu', ' focu', ' Focus', 'Focu', 'focus',' fox'],
    "FIESTA": ['Fiesta', ' FIESTA', ' Fiest', 'fiesta', ' fiesta', ' Fiest', ' FIEST', ' fiest', ' Fiesta', 'Fiest'],
    "MONDEO": ['Mondeo', ' MONDEO', ' mondeo', ' Monde', ' MONDE', ' Mondeo'],
    "KUGA": ['Kuga', ' KUGA', ' kuga', ' Kug', ' kuga', ' Kuga', 'kuga'],
    "GALAXY": ['Galaxy', ' GALAXY', ' Galax', ' galax', ' Galaxy', ' galaxy'],
    "S_MAX": ['S-MAX', ' s-max', ' S-MA', ' s-ma', ' S-MAX'],
    "B_MAX": ['B-MAX', ' B-MA', ' b-max', ' B-MA', ' B-MAX', 'B-MA'],
    "ECOSPORT": ['EcoSport', ' ecosport', ' ECOSPORT', ' EcoSpor', ' EcoSport'],
    "PUMA": ['Puma', ' puma', ' PUMA', ' Puma', ' Pum', 'Pum'],
    "TOURNEO_CUSTOM": ['Tourneo Custom', ' tourneo custom', ' Tourneo Custo', ' Tourneo Custom', ' Transit Tourneo'],
    "TOURNEO_CONNECT": ['Tourneo Connect', ' Grand Tourneo Connect', ' tourneo connect', ' Tourneo Connect'],
    "KA": ['Ka', ' KA', ' ka', ' Ka', ' K'],
    "KA_PLUS": [' Ka+', ' ka+', ' KA+', 'Ka+', 'ka+'],
    "C_MAX": ['C-MAX', ' C-MAX', ' c-max', ' C-MA'],
    "GRAND_C_MAX": [' Grand C-MAX', ' GRAND C-MAX', ' Grand C-MA', ' grand c-max', 'Grand C-MAX'],
    "EDGE": ['Edge', ' edge', ' EDGE', ' Edge', ' Edg'],
    "MUSTANG": ['Mustang', ' mustang', ' Mustang'],
    "GRAND_TOURNEO_CONNECT": [' Grand Tourneo Connec', 'Grand Tourneo Connect'],
    "FUSION": ['Fusion', ' Fusion', ' fusion'],
    "STREETKA": ['Streeka', ' Streetka'],
    "RANGER": ['Ranger', ' Ranger'],
    "ESCORT": ['Escort', ' Escort'], "CORSA": ['Corsa', 'corsa', ' corsa', ' Cors', ' CORSA', ' cors', ' CORS', ' Corsa'],
    "ASTRA": ['Astra', 'ASTRA', 'astra', ' astra', ' ASTRA', ' ASTRA', ' Astr', 'Astra', ' Astra'],
    "INSIGNIA": ['Insignia', 'Insigni', ' INSIGNIA', ' insignia', ' Insigni', ' INSIGNI', ' Insignia'],
    "MOKKA": ['Mokka', 'Mokk', ' MOKKA', ' Mokka X', ' mokka', ' mokka x', ' MOKKA X', ' Mokk', ' Mokk', ' Mokka ', ' mokka ', ' Mokka X', ' Mokka', 'Mokka X'],
    "ZAFIRA": ['Zafira', ' zafira', ' ZAFIRA', ' Zafir', ' Zafira Tourer', ' Zafira Toure', ' ZAFIRA TOURER', 'Zafira Tourer', ' Zafira', 'Zafir', ' zafira tourer', ' ZAFIR'],
    "VIVA": ['Viva', ' viva', ' VIVA', ' Viva', ' Viv', ' Viva', 'Viva', 'viva'],
    "MERIVA": ['Meriva', ' meriva', ' MERIVA', ' Meriv', ' Meriva'],
    "ADAM": ['Adam', ' adam', ' Ada', ' ADAM', ' Adam'],
    "COMBO_LIFE": ['Combo Life', 'COMBO LIFE', ' combo life', ' COMBO LIFE', ' Combo Lif', ' Combo Life'],
    "CROSSLAND_X": ['Crossland X', ' CROSSLAND X', ' crossland x', ' Crossland ', ' CrosslandX', ' Crossland X'],
    "GRANDLAND_X": ['Grandland X', ' Grandland X', ' grandland x', ' GRANDLAND X', ' Grandland ', ' grandland ', ' Grandland X'],
    "GTC": ['GTC', ' gtc', ' GTC', 'gtc'],
    "ANTARA": ['Antara', ' Antara', ' antara'],
    "VIVARO": ['Vivaro', ' Vivaro'],
    "VECTRA": ['Vectra', ' Vectra', ' VECTRA'],
    "AGILA": ['Agila', ' Agila', ' Amica', ' AGILA'],
    "TIGRA": ['Tigra', ' Tigra', ' tigua', 'TIGRA'],
    "CASCADA": ['Cascada', ' Cascada'],
    "AMPERA": ['Ampera', ' Ampera'], 
    "SERIES_1": ['1 Series', '1 series', '1 SERIES', ' 1 Series', ' 1 SERIES', ' 1 serie', ' 1 Serie', ' 1 SERIE', ' 1 series'],
    "SERIES_2": ['2 Series', '2 Serie', ' 2 Series', ' 2 SERIES', ' 2 Serie', ' 2 serie', ' 2 series', '2 series'],
    "SERIES_3": ['3 Series', '3 Serie', ' 3 Series', ' 3 SERIES', ' 3 Serie', ' 3 serie', ' 3 series'],
    "SERIES_4": ['4 Series', ' 4 serie', '4 series', ' 4 Series', ' 4 SERIES', ' 4 Serie', ' 4 series'],
    "SERIES_5": ['5 Series', '5 SERIES', ' 5 Series', ' 5 SERIES', ' 5 Serie', ' 5 series', ' 5 SERIE'],
    "SERIES_6": ['6 Series', ' 6 Series', ' 6 SERIES', ' 6 Serie', ' 6 series'],
    "SERIES_7": ['7 Series', ' 7 Series', ' 7 SERIES', ' 7 Serie', ' 7 series'],
    "SERIES_8": ['8 Series', ' 8 SERIES', '8 SERIES', ' 8 Serie', ' 8 Series', ' 8 series'],
    "X1": ['X1', ' x1', ' X1', 'x1', ' X'],
    "X2": ['X2', ' IX2', ' x2', ' X2'],
    "X3": ['X3', ' x3', ' X3'],
    "X4": ['X4', ' x4', ' X4'],
    "X5": ['X5', ' x5', ' X5'],
    "X6": ['X6', ' x6', ' X6'],
    "X7": ['X7', ' x7', ' X7'],
    "Z3": ['Z3', ' Z3'],
    "Z4": ['Z4', ' z4', ' Z4'],
    "M2": ['M2', ' M2'],
    "M3": ['M3', ' m3', ' M3'],
    "M4": ['M4', ' m4', ' M4'],
    "M5": ['M5', ' M5', ' m5'],
    "M6": ['M6', ' M6'], "A1": ['A1', ' a1', ' A1'],
    "A2": ['A2', ' A2'],
    "A3": ['A3', ' a3', ' A 3', ' A3', ' a3'],
    "A4": ['A4', ' a4', ' A4', ' a4'],
    "A5": ['A5', ' a5', ' A5', 'a5'],
    "A6": ['A6', ' A6', ' a6'],
    "A7": ['A7', ' a7', ' A7'],
    "A8": ['A8', ' A8', 'a8'],
    "Q2": ['Q2', ' q2', ' Q2'],
    "Q3": ['Q3', ' q3', ' Q3'],
    "Q5": ['Q5', ' q5', ' Q5', ' q5', 'q5', ' Q', 'Q'],
    "Q7": ['Q7', ' q7', ' Q7'],
    "Q8": ['Q8', ' q8', ' Q8'],
    "S3": ['S3', ' s3', ' S3'],
    "S4": ['S4', ' S4'],
    "S5": ['S5', ' S5'],
    "S8": ['S8', ' S8'],
    "RS3": ['RS3', ' RS3'],
    "RS4": ['RS4', ' RS4'],
    "RS5": ['RS5', ' RS5'],
    "RS6": ['RS6', ' RS6', ' RS7'],
    "SQ5": ['SQ5', ' SQ5'],
    "SQ7": ['SQ7', ' SQ7', ' sq7'],
    "TT": ['TT', ' TT', ' tt', ' T'],
    "R8": ['R8', ' r8', ' R8'],  
    "A_CLASS": ['A-Class', 'a class', 'A CLASS', ' A-Class', ' a class', ' A CLASS', 'A Clas', ' a clas', 'A-Class', ' A Class', 'A CLASS', 'A Clas', ' a', ' a class', ' A Class', 'A CLASS', ' A Clas', ' a class', 'A Class', 'A', ' A'],
    "B_CLASS": ['B-Class', 'b class', 'B CLASS', ' b class', ' B Clas', ' B Class', ' B CLASS', ' b class', 'B Class'],
    "C_CLASS": ['C-Class', 'C Clas', 'C CLASS', 'c class', ' c class', ' C Class', ' C Clas', ' c clas', ' C Class', 'C CLASS', ' c class', ' C CLAS', ' C CLASS', 'C Class'],
    "E_CLASS": ['E-Class', 'E CLASS', ' E Clas', ' e class', 'E Class', 'E-Class', ' E Class', 'E CLASS', ' e class', ' E Class', 'E CLASS', ' e class', ' E CLASS', 'e class', ' R Class'],
    "S_CLASS": ['S-Class', 'S Class', 'S CLASS', ' s class', ' S Clas', ' S Class', 'S CLASS', ' S CLASS'],
    "CLA_CLASS": ['CLA-Class', ' CLA CLASS', 'CLA CLASS', 'CLA Class', ' cla class', ' CLA Class', ' CLA Class', ' CLA Clas'],
    "CLS_CLASS": ['CLS-Class', 'CLS Class', 'CLS CLASS', ' cls class', ' CL Class', ' cl class', ' CL CLASS', ' CL Clas', ' CLS Class', ' CLS CLASS', ' CLS Clas'],
    "GLA_CLASS": ['GLA-Class', ' GLA CLASS', 'GLA CLASS', ' gla class', ' GLA Class', ' GLA Clas', 'GLA Class'],
    "GLB_CLASS": ['GLB-Class', ' glb class', 'GLB Class', ' GLB Class'],
    "GLC_CLASS": ['GLC-Class', 'GLC CLASS', ' glc class', ' GLC Class', ' glc clas', ' GLC Class', ' GLC CLASS', ' GLC Clas', 'GLC Class'],
    "GLE_CLASS": ['GLE-Class', ' GLE CLASS', 'GLE CLASS', 'GLE Class', ' gle class', ' GLE Clas', ' GLE Class', 'GLE CLASS'],
    "GLS_CLASS": ['GLS-Class', 'GLS CLASS', ' GLS Class', ' GLS CLASS', ' GLS Clas', ' gls class'],
    "GL_CLASS": ['GL-Class', 'GL Class', ' GL Class', ' GL CLASS', ' gl class', ' GL Clas'],
    "G_CLASS": ['G-Class', ' G Class', ' G CLAS'],
    "V_CLASS": ['V-Class', 'V CLASS', ' v class', ' V Clas', ' V Class', ' V CLASS', 'V Class'],
    "X_CLASS": ['X-Class', ' x-class', ' x-clas', ' X Clas', ' X-CLASS', ' X-CLAS'],
    "SL_CLASS": ['SL-Class', ' SL CLASS', 'SL CLASS', ' sl class', ' SL CLAS', ' SL CLASS', ' SL'],
    "SLK_CLASS": ['SLK-Class', 'SLK', ' slk', ' SLK'],
    "M_CLASS": ['M-Class', ' M CLASS', ' m class', 'M Class', ' M Class', ' M Clas', ' M CLAS', ' M'],
    "CLK": ['CLK', ' CLK'],
    "CL_CLASS": ['CL Class'],
    "CLC_CLASS": ['CLC Class', ' CLC Class'],
    "MERCEDES200": ['200', ' 200'],
    "MERCEDES230": ['230', ' 230'],
    "MERCEDES220": ['220', ' 220'],   "YARIS": ['Yaris', 'yaris', ' yaris', 'Yari', ' YARIS', ' Yari', ' Yaris', ' yari', ' Yari', ' YARI'],
    "AYGO": ['Aygo', ' AYGO', ' aygo', 'aygo', ' Ayg', ' AYG', ' ayg', ' Aygo'],
    "COROLLA": ['Corolla', ' COROLLA', ' corolla', ' Coroll', ' Corolla', 'corolla'],
    "C_HR": ['C-HR', ' c-hr', ' C-HR', ' C-H', ' c-h'],
    "AVENSIS": ['Avensis', ' AVENSIS', ' avensis', ' Avensis'],
    "PRIUS": ['Prius', ' PRIUS', ' prius', ' Prius'],
    "RAV4": ['RAV4', ' RAV', ' rav', ' rav4', ' RAV4', 'rav4', 'RAV'],
    "HILUX": ['Hilux', ' hilux', ' Hilu', ' Hilux', ' HILU'],
    "VERSO": ['Verso', ' verso', ' VERSO', ' Verso-S', ' Vers', ' Verso'],
    "SUPRA": ['Supra', ' Supra', 'Supra'],
    "LAND_CRUISER": ['Land Cruiser', ' Land Cruise', ' Land Cruiser', ' land cruiser'],
    "CAMRY": ['Camry', ' Camry', ' Camr'],
    "PROACE_VERSO": ['Proace Verso', ' PROACE VERSO', ' proace verso'],
    "URBAN_CRUISER": ['Urban Cruiser', ' Urban Cruiser', ' Urban Cruise'],
    "AURIS": ['Auris', ' auris', ' AURI', ' AURIS', ' Auri', ' Auris'],
    "GT86": [' GT86', 'GT86', ' gt86'], "I10": ['i10', ' i10', 'I10', ' I1', ' I10', ' I1'],
    "I20": ['i20', 'I20', ' I20', ' i20', ' I2'],
    "I30": ['i30', ' i30', 'I30', ' I30', 'I3', ' i3', 'i3', ' I3'],
    "I40": ['i40', 'I40', ' i40', ' I40', ' I40', ' I4'],
    "I800": ['i800', ' I80', 'I800', ' i800', ' I800', 'i8', ' I8', ' i8'],
    "IONIQ": ['Ioniq', ' ioniq', ' IONIQ', ' Ioni', ' Ioniq', ' IQ', 'IONIQ'],
    "KONA": ['Kona', ' KONA', ' kona', ' KON', ' Kona', ' Kon'],
    "TUCSON": ['Tucson', 'Tucso', ' tucson', ' Tucso', ' TUCSON', ' TUCSO', ' Tucso', ' TUCSO', ' TUCSON', ' Tucson'],
    "SANTA_FE": ['Santa Fe', ' santa fe', ' Santa Fe', ' SANTA FE', ' Santa F'],
    "GETZ": ['Getz', ' Getz'],
    "IX20": ['ix20', 'IX20', ' IX20', ' ix20'],
    "IX35": ['ix35', ' IX35', 'IX35', ' ix35'],
    "VELOSTER": ['Veloste', ' Veloste', ' Veloster'],
    "ACCENT": ['Accent', ' Accent'],
    "TERRACAN": ['Terracan', ' Terracan'],  "FABIA": ['Fabia', ' FABIA', ' fabia', ' Fabi', ' Fabia', 'FABIA'],
    "OCTAVIA": ['Octavia', ' OCTAVIA', ' octavia', 'octavia', 'Octavi', ' octavia', ' Octavi', ' Octavia', ' octavi', ' OCTAVI'],
    "SUPERB": ['Superb', ' superb', ' SUPERB', ' super', ' Super', ' Superb'],
    "KAROQ": ['Karoq', ' karoq', ' Karo', ' KAROQ', ' Karoq'],
    "KODIAQ": ['Kodiaq', 'kodiaq', ' kodiaq', ' KODIAQ', ' Kodia', ' Kodiaq'],
    "KAMIQ": ['Kamiq', ' KAMIQ', ' kamik', ' kamiq', ' KAMI', ' Kamiq'],
    "YETI": ['Yeti', ' yeti', ' Yeti Outdoo', ' yeti outdoor', ' Yeti Outdoor', 'Yeti Outdoor', ' YETI OUTDOOR', ' Yet', ' Yeti', ' yeti outdoo',' YETI'],
    "SCALA": ['Scala', ' scala', ' Scal', ' SCALA', ' scal', ' Scala'],
    "RAPID": ['Rapid', ' rapid', ' Rapi', ' Rapi', ' Rapid'],
    "CITIGO": ['Citigo', ' citigo', ' CITIGO', ' Citig', ' Citigo'],
    "ROOMSTER": ['Roomster', ' Roomste', ' Roomster'],  
    "LEON": ['Leon'],
    "ATECA": ['Ateca'],
    "TOLEDO": ['Toledo'],
    "ARONA": ['Arona'],
    "IBIZA": ['Ibiza'],
    "ALHAMBRA": ['Alhambra'], 
    "SHUTTLE": ['Shuttle', ' Shuttle', ' shuttle', ' SHUTTLE'],
    "KADJAR": ['Kadjar', ' Kadjar']
}


# Invert car_models once into a raw-spelling -> canonical-name table, then rewrite the
# column in a single pass instead of one full pass over the column per model.
model_lookup = {}
for model, typos in car_models.items():
    for typo in typos:
        # If a spelling is listed under two models, the model defined first in car_models wins.
        model_lookup.setdefault(typo, model)

# Spellings that are not listed (including NaN) are left unchanged.
df_test['model'] = df_test['model'].apply(lambda x: model_lookup.get(x, x))

In [126]:
# Check the result of the model clean-up: print the distinct values left in the column.
print(df_test["model"].unique())


['I30' 'TIGUAN' 'SERIES_2' 'GRANDLAND_X' 'SERIES_1' 'FIESTA' 'X1'
 'B_CLASS' 'FOCUS' 'SUPERB' 'SERIES_5' 'C_CLASS' 'UP' 'AYGO' 'GOLF'
 'M_CLASS' 'LAND_CRUISER' 'TT' 'ADAM' 'ZAFIRA' 'E_CLASS' 'SERIES_3' 'IX20'
 'A4' 'YARIS' 'PASSAT' 'I10' 'MOKKA' 'ECOSPORT' 'SERIES_4' 'A7' 'CORSA'
 'KUGA' 'GRAND_C_MAX' 'Q2' 'M4' 'A_CLASS' 'RAV4' 'FABIA' 'INSIGNIA' 'A1'
 'X6' 'MERIVA' 'CARAVELLE' 'OCTAVIA' 'AURIS' 'X_CLASS' 'ASTRA' 'V_CLASS'
 'POLO' 'KAROQ' 'SHUTTLE' 'Q5' 'TUCSON' 'A3' 'SL_CLASS' 'COROLLA'
 'KA_PLUS' 'X3' 'I40' 'I20' 'KAMIQ' nan 'IX35' 'CROSSLAND_X' 'Q3' 'VIVA'
 'GLA_CLASS' 'CLS_CLASS' 'KA' 'GALAXY' 'X2' 'KODIAQ' 'GLC_CLASS' 'VIVARO'
 'MONDEO' 'TOURAN' 'X5' 'VERSO' 'TOUAREG' 'T_ROC' 'A5' 'S_CLASS'
 'SCIROCCO' 'X7' 'B_MAX' 'SERIES_8' 'A6' 'SANTA_FE' 'GL_CLASS' 'GLE_CLASS'
 'BEETLE' 'X4' 'SHARAN' 'C_MAX' 'M6' 'A8' 'CLA_CLASS' 'KONA' 'CITIGO'
 'PRIUS' 'CL_CLASS' 'RAPID' 'FUSION' 'G_CLASS' 'SLK_CLASS' 'YETI' 'Q7'
 'C_HR' 'M5' 'T_CROSS' 'AVENSIS' 'IONIQ' 'AMAROK' 'Z4' 'CALIFORNIA' 'M2'
 'M3' 

In [128]:
# Typos in transmission: known misspellings of each canonical label.
semi_auto_typos = ["Semi_Aut", "Semi_Auto", "Semi-Aut", "SEMI-AUTO", "EMI-AUTO", "semi-auto", "semi-aut", "SEMI-AUT", "emi-Auto", "emi-Aut", "emi-auto", "Semi-aut"]
automatic_typos = ["AUTOMATIC", "Automati", "utomatic", "UTOMATIC", "automatic", "AUTOMATI", "automati", "utomati"]
# 'unknow' used to be in this list, which silently turned unknown transmissions into "Manual";
# it now sits with the other "unknown" spellings below.
manual_typos = ['anual', 'manual', 'Manua', 'MANUAL', ' Manual ', 'ANUAL', 'manua', 'anua', 'MANUA', ' manual ', ' MANUAL ', ' Manual', 'Manual ', 'manual ']

# Placeholder strings that mean "no usable value" and must become real missing values.
missing_markers = ["nan", "none", "null", "unknown", "UNKNOWN", "nknown", "nknow", "unknow", "Other"]

# One spelling -> label table so the column is corrected in a single pass.
transmission_lookup = {
    typo: label
    for label, typos_for_label in (
        ("Semi-Auto", semi_auto_typos),
        ("Automatic", automatic_typos),
        ("Manual", manual_typos),
    )
    for typo in typos_for_label
}
# Values that are not listed (including real NaN) are kept as they are.
df_test["transmission"] = df_test["transmission"].apply(lambda i: transmission_lookup.get(i, i))

# Fix the fake "nan" values: replace the placeholder strings with np.nan.
df_test.loc[df_test["transmission"].isin(missing_markers), "transmission"] = np.nan

# Show the final result.
df_test["transmission"].unique()

array(['Automatic', 'Semi-Auto', 'Manual', nan], dtype=object)