<a href="https://colab.research.google.com/github/AlcoholWolf/PY-SAVE/blob/main/EDA_And_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# | CODE SETTING

## | Import And Init

### | Data Import

In [146]:
import re 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
plt.rc("axes", unicode_minus=False) # 음수값 깨지는 폰트 설정

### | Data Input

In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
data_pile = '/content/drive/MyDrive/user_data'

In [39]:
train_data = pd.read_csv(f"{data_pile}/train.csv")
test_data = pd.read_csv(f"{data_pile}/test.csv")
submission = pd.read_csv(f"{data_pile}/sample_submission.csv")

# |  DATA SETTING

## | View

### | Data View

In [295]:
train_data.head()

Unnamed: 0,id,title,odometer,location,isimported,engine,transmission,fuel,paint,year,target
0,0,Toyota RAV 4,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Red,2016,13665000
1,1,Toyota Land Cruiser,10,Lagos,New,4-cylinder(I4),automatic,petrol,Black,2019,33015000
2,2,Land Rover Range Rover Evoque,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2012,9915000
3,3,Lexus ES 350,91524,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Gray,2007,3815000
4,4,Toyota Venza,94177,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2010,7385000


## | Data Bind

### | Train Data + Test Data

In [296]:
All_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
Set_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

### | Columns Replace

In [297]:
Col_list = ['ID', 'Company', 'RunTime', 'Location', 'UserUse', 'Engine', 'Mission', 'Fuel', 'Paint', 'Year', 'Each']
All_data.columns = Col_list
Set_data.columns = Col_list
All_data.columns

Index(['ID', 'Company', 'RunTime', 'Location', 'UserUse', 'Engine', 'Mission',
       'Fuel', 'Paint', 'Year', 'Each'],
      dtype='object')

## | DATA REPLACE

### | Company

In [298]:
All_data["Company"] = all_data["title"].map(lambda x: x.split()[0].upper())

In [299]:
All_data["Company"].replace("MERCEDES-BENZ/52","MERCEDES-BENZ",inplace=True)

### | log_RunTime

In [300]:
All_data["log_RunTime"]=np.log1p(All_data["RunTime"])

### | Use

In [301]:
All_data["Use"] = All_data["log_RunTime"].copy()
All_data["Use"][All_data["log_RunTime"] < 15 ] = 100
All_data["Use"][All_data["log_RunTime"] < 13 ] = 75
All_data["Use"][All_data["log_RunTime"] < 10 ] = 50
All_data["Use"][All_data["log_RunTime"] < 5 ] = 25
All_data["Use"][All_data["log_RunTime"] == 0 ] = 0

### | Year

In [302]:
All_data["OldYear"] = All_data["Year"]-2020
All_data["OldYear"][All_data["OldYear"]>0] = 0
All_data["OldYear"][All_data["OldYear"]<-100] = -10
All_data["OldYear"] = abs(All_data["OldYear"])

In [303]:
All_data["Year"].dropna()
Drop_Index = []

In [304]:
Drop_Mask = (All_data['Year']>2020) & (All_data['Year']<1900)
All_data["Year"][Drop_Mask] = float('nan')
All_data["Year"].dropna()

0       2016
1       2019
2       2012
3       2007
4       2010
        ... 
1446    2014
1447    2011
1448    2013
1449    2003
1450    2006
Name: Year, Length: 1451, dtype: int64

### | Paint

In [305]:
def clean_text(texts): 
    corpus = [] 
    for i in range(0, len(texts)): 
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>\<]', '',texts[i])
        review = re.sub(r'\d+','',review)
        review = review.lower()
        review = re.sub(r'\s+', ' ', review)
        review = re.sub(r'<[^>]+>','',review)
        review = re.sub(r'\s+', ' ', review)
        review = re.sub(r"^\s+", '', review)
        review = re.sub(r'\s+$', '', review)
        review = re.sub(r'_', ' ', review)

        corpus.append(review) 
        
    return corpus

In [306]:
All_data['Paint'] = clean_text(All_data['Paint'])

# | RGB Color Mask And Gloss Mask

In [309]:
Color_Mask = []
C_list = ['Red', 'Yellow', 'Green', 'Blue', 'White', 'Gray', 'Black', 'Silver', 'Gold', 'Wine']
E_list = [['red', 'brown', 'maroon'], ['orange', 'yellow'], ['green'], ['blue', 'purple', 'indigo ink pearl'], ['white', 'cream', 'milk', 'beige'], ['gray', 'grey', 'gary', 'gery', 'ash'], ['black', 'blac'], ['silver', 'sliver'], ['gold', 'golf'], ['wine', 'whine'] ]
for Index in range(len(C_list)):
    Color_Mask.append([])
    for Err in E_list[Index]:
        Color_Mask[Index] = All_data['Paint'].str.contains(f'{Err}')
        All_data['Paint'][Color_Mask[Index]] = C_list[Index]

In [292]:
All_data["Light"] = All_data["Paint"].copy()
Matt_Mask = []
G_list = ['Silver', 'Gold', 'Wine']
K_list = ['Gray', 'Yellow', 'Red']
for Index in range(len(G_list)):
    Matt_Mask.append([])
    for Rep in G_list:
        Matt_Mask[Index] = All_data['Paint'].str.contains(f'{Rep}')
        All_data['Paint'][Matt_Mask[Index]] = K_list[Index]
        All_data['Light'] = 'Matte'
        All_data['Light'][Matt_Mask[Index]] = 'Gloss'

In [293]:
All_data['Light'].unique()

array(['Matte'], dtype=object)

## | Data View

In [310]:
All_data

Unnamed: 0,ID,Company,RunTime,Location,UserUse,Engine,Mission,Fuel,Paint,Year,Each,log_RunTime,Use,OldYear
0,0,TOYOTA,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Red,2016,13665000.0,9.813453,50.0,4
1,1,TOYOTA,10,Lagos,New,4-cylinder(I4),automatic,petrol,Black,2019,33015000.0,2.397895,25.0,1
2,2,LAND,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2012,9915000.0,11.327704,75.0,8
3,3,LEXUS,91524,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Gray,2007,3815000.0,11.424367,75.0,13
4,4,TOYOTA,94177,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2010,7385000.0,11.452942,75.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1446,431,MERCEDES-BENZ,78175,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Blue,2014,,11.266718,75.0,6
1447,432,HONDA,129223,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2011,,11.769303,75.0,9
1448,433,MERCEDES-BENZ,100943,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Black,2013,,11.522321,75.0,7
1449,434,LEXUS,81463,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Green,2003,,11.307916,75.0,17
