In [1]:
import pandas as pd
from io import StringIO

In [2]:
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
9.0,10.0,11.0,
13.0,14.0,15.0,16.0
'''

In [3]:
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [4]:
type(df)

pandas.core.frame.DataFrame

In [5]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,False,False,False,False


In [6]:
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [7]:
df.isnull().sum(axis=1)

0    0
1    1
2    1
3    0
dtype: int64

In [8]:
df.dropna(axis = 0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [9]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,9.0,10.0
3,13.0,14.0


In [10]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [11]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [None]:
df.dropna(thresh = 4)

In [12]:
df.dropna(subset = ['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [13]:
df.dropna(subset = ['C','D'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [15]:
from sklearn.impute import SimpleImputer
import numpy as np
imr = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [17]:
imr = imr.fit(df.values)

In [19]:
imputed_data = imr.transform(df.values)

In [20]:
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  9.66666667,  8.        ],
       [ 9.        , 10.        , 11.        ,  9.33333333],
       [13.        , 14.        , 15.        , 16.        ]])

In [21]:
# 다른 STRATEGY 사용 
# fit + transform 
imr2 = SimpleImputer(missing_values = np.nan
                     , strategy = 'constant'
                     , fill_value = 100)
imputed_data2 = imr2.fit_transform(df.values)
imputed_data2

array([[  1.,   2.,   3.,   4.],
       [  5.,   6., 100.,   8.],
       [  9.,  10.,  11., 100.],
       [ 13.,  14.,  15.,  16.]])

In [23]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']
])

In [24]:
df.head()

Unnamed: 0,0,1,2,3
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [25]:
df.columns = ['color','size', 'price', 'label']
df


Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [26]:
size_mapping = {
    'XL':3,
    'L':2,
    'M':1
}

In [28]:
df['size'] = df['size'].map(size_mapping)

In [29]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [30]:
inv_size_mapping = {v:k for k ,v in size_mapping.items()}
inv_size_mapping

{1: 'M', 2: 'L', 3: 'XL'}

In [31]:
size_mapping.items()

dict_items([('XL', 3), ('L', 2), ('M', 1)])

In [32]:
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [33]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [35]:
unique_labels = np.unique(df['label'])
unique_labels

array(['class1', 'class2'], dtype=object)

In [36]:
class_mapping = {label : idx for idx, label in enumerate(unique_labels)}
class_mapping

{'class1': 0, 'class2': 1}

In [37]:
df['label'] = df['label'].map(class_mapping)
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [38]:
inv_class_mapping = {v:k for k,v in class_mapping.items()}

In [40]:
df['label'].map(inv_class_mapping)

0    class1
1    class2
2    class1
Name: label, dtype: object

In [41]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()

In [44]:
y = class_le.fit_transform(df['label'].values)
y

array([0, 1, 0])

In [46]:
class_le.inverse_transform(y)

array([0, 1, 0])

In [48]:
from sklearn.preprocessing import OneHotEncoder

In [49]:
df   

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [50]:
X = df[['color','size','price']].values

In [51]:
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [52]:
color_le = LabelEncoder()

In [53]:
X[:, 0 ] = color_le.fit_transform(X[:,0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [54]:
# 기존 코드 오류 발생
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()

TypeError: ignored

In [55]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ohe =ColumnTransformer([( "color",OneHotEncoder(),[0])], remainder = 'passthrough' ) 
ohe.fit_transform(X)

array([[0.0, 1.0, 0.0, 1, 10.1],
       [0.0, 0.0, 1.0, 2, 13.5],
       [1.0, 0.0, 0.0, 3, 15.3]], dtype=object)

In [57]:
pd.get_dummies(df[['color','size','price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


In [56]:
pd.get_dummies(df[['color','size','price']],drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1,0
1,2,13.5,0,1
2,3,15.3,0,0


In [None]:
# 최소한의 조건 만족 1. 숫자로만 이루어져 있다. 2. 모든 값이 있다. 



In [58]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' , header=None)

In [59]:
df_wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [60]:
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']

In [61]:
np.unique(df_wine['Class label'])

array([1, 2, 3])

In [62]:
df_wine.head(20)

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [64]:
df_wine.shape

(178, 14)

In [65]:
from sklearn.model_selection import train_test_split

In [66]:
X = df_wine.iloc[:,1:].values
y = df_wine.iloc[:,0].values

In [67]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.3, random_state = 0, stratify = y  ) 

In [68]:
X_train.shape

(124, 13)

In [69]:
X_test.shape

(54, 13)

In [70]:
y_train.shape

(124,)

In [71]:
y_test.shape

(54,)

In [None]:
123

In [72]:
ex = np.array([0,1,2,3,4,5])

In [73]:
(ex - ex.min())/(ex.max() - ex.min())

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [74]:
(ex - ex.mean()) / ex.std()

array([-1.46385011, -0.87831007, -0.29277002,  0.29277002,  0.87831007,
        1.46385011])

In [75]:
from sklearn.preprocessing import MinMaxScaler

In [76]:
data = [[-1, 2], 
        [-0.5, 6], 
        [0, 10], 
        [1, 18]]

In [77]:
mms = MinMaxScaler()

In [78]:
mms.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [79]:
from sklearn.preprocessing import StandardScaler

In [81]:
stds = StandardScaler()
stds.fit_transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])