In [1]:
import pandas as pd
import numpy as np
# Toy survey records: row 1 is missing every field and row 2 is missing both test scores.
raw_data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)
# Column order follows the insertion order of the dict keys.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [4]:
# Fraction of missing values per column (the mean of the boolean null mask).
df.isnull().mean()

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
dtype: float64

In [3]:
# Keep only rows with no missing value in any column.
df_no_missing = df.dropna(axis='index', how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [5]:
# Drop a row only when every one of its values is missing.
df_cleaned = df.dropna(axis='index', how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [7]:
# Add a column that is NaN in every row, used to demonstrate column-wise dropna.
df.loc[:, 'location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [8]:
# Drop columns whose values are all missing; the result is displayed, not assigned back.
df.dropna(axis='columns', how='all')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
# The previous dropna result was not assigned back, so df still carries the all-NaN location column.
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [13]:
# thresh=3 keeps only columns that have at least 3 non-NaN values;
# here that removes the all-NaN location column.
df.dropna(axis=1, thresh=3)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [14]:
# Keep only rows that have at least 5 non-NaN values.
df.dropna(axis='index', thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [15]:
# Replace every missing value with 0, in string columns as well; df itself is not modified.
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [16]:
# Mean of the observed pre-test scores; missing values are skipped.
df["preTestScore"].mean(skipna=True)

3.0

In [17]:
# Median of the observed post-test scores; missing values are skipped.
df["postTestScore"].median(skipna=True)

62.0

In [18]:
# Most frequent post-test score(s); every observed value ties here, so all are returned.
df["postTestScore"].mode(dropna=True)

0    25.0
1    62.0
2    70.0
dtype: float64

In [19]:
# Inspect the raw pre-test column: rows 1 and 2 are NaN.
df["preTestScore"]

0    4.0
1    NaN
2    NaN
3    2.0
4    3.0
Name: preTestScore, dtype: float64

In [20]:
# Impute missing pre-test scores with the column mean (NaNs are skipped when computing it).
# Assign the result back instead of calling fillna(inplace=True) on the df[...] selection:
# under pandas Copy-on-Write that selection is a temporary object, so the in-place form
# leaves df unchanged (and emits a chained-assignment warning on pandas 2.x).
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())

In [21]:
# The preTestScore NaNs now hold the column mean (3.0); the other columns are unchanged.
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [23]:
# Total post-test score per sex; rows whose sex is NaN belong to no group.
df.groupby("sex")["postTestScore"].agg("sum")

sex
f    70.0
m    87.0
Name: postTestScore, dtype: float64

In [24]:
# transform broadcasts each sex group's mean to every row of that group,
# whether or not the row's own score is missing.
df.groupby("sex")["postTestScore"].transform("mean")

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [25]:
# Fill only the missing post-test scores, using the mean of the row's sex group.
# A row whose sex is NaN has no group mean, so its score stays NaN.
# Assign the result back instead of calling fillna(inplace=True) on the df[...] selection:
# under pandas Copy-on-Write that selection is a temporary object, so the in-place form
# leaves df unchanged.
group_mean_post = df.groupby("sex")["postTestScore"].transform("mean")
df["postTestScore"] = df["postTestScore"].fillna(group_mean_post)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [26]:
# Keep rows where both age and sex are present.
df.dropna(subset=['age', 'sex'])

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [30]:
# Small edge list, one record per row: (source, target, weight, color).
edge_records = [
    (0, 2, 3, 'red'),
    (1, 2, 4, 'blue'),
    (2, 3, 5, 'blue'),
]
edges = pd.DataFrame(edge_records, columns=['source', 'target', 'weight', 'color'])
edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [31]:
# Select the single categorical column as a Series.
edges['color']

0     red
1    blue
2    blue
Name: color, dtype: object

In [34]:
# One-hot encode the frame: the string column color becomes color_blue / color_red,
# while the numeric columns pass through unchanged.
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [35]:
# One-hot encode a single Series: the indicator columns are named after the values, with no prefix.
pd.get_dummies(edges["color"])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [40]:
# Translate each numeric weight into a size label through a lookup table.
# NOTE(review): the recorded output also shows color_* / weight_sign_* dummy columns on
# `edges`, which no visible cell assigns; presumably left over from out-of-order
# execution. Confirm with Restart & Run All.
weight_dict = dict(zip((3, 4, 5), ("M", "L", "XL")))
edges["weight_sign"] = edges["weight"].map(weight_dict)
edges

Unnamed: 0,source,target,weight,color_blue,color_red,weight_sign_L,weight_sign_M,weight_sign_XL,weight_sign
0,0,2,3,0,1,0,1,0,M
1,1,2,4,1,0,1,0,0,L
2,2,3,5,1,0,0,0,1,XL


In [41]:
# Indicator columns (L, M, XL) for the size label derived from weight.
weight_sign=pd.get_dummies(edges["weight_sign"])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [42]:
# Place the indicator columns side by side with the original frame, aligned on the row index.
pd.concat([edges, weight_sign], axis='columns')

Unnamed: 0,source,target,weight,color_blue,color_red,weight_sign_L,weight_sign_M,weight_sign_XL,weight_sign,L,M,XL
0,0,2,3,0,1,0,1,0,M,0,1,0
1,1,2,4,1,0,1,0,0,L,1,0,0
2,2,3,5,1,0,0,0,1,XL,0,0,1


In [43]:
# One-hot encode the frame and hand back the underlying NumPy array.
pd.get_dummies(edges).to_numpy()

array([[0, 2, 3, 0, 1, 0, 1, 0, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1, 0, 0, 1]], dtype=int64)

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [45]:
# Twelve soldiers: three regiments of four, each split into a 1st and a 2nd company.
raw_data = dict(
    regiment=['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    company=['1st', '1st', '2nd', '2nd'] * 3,
    name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon',
          'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
    preTestScore=[4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    postTestScore=[25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
)
# Column order follows the insertion order of the dict keys.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [46]:
# Band edges 0, 25, 50, 75, 100; pd.cut intervals are right-closed by default, so 25 falls in 'Low'.
bins = list(range(0, 101, 25))
group_names = ['Low', 'Okay', 'Good', 'Great']
categories = pd.cut(x=df['postTestScore'], bins=bins, labels=group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [49]:
# Store the score band of each row as a new categorical column.
score_band = pd.cut(x=df['postTestScore'], bins=bins, labels=group_names)
df['categories'] = score_band
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,categories
0,Nighthawks,1st,Miller,4,25,Low
1,Nighthawks,1st,Jacobson,24,94,Great
2,Nighthawks,2nd,Ali,31,57,Good
3,Nighthawks,2nd,Milner,2,62,Good
4,Dragoons,1st,Cooze,3,70,Good
5,Dragoons,1st,Jacon,4,25,Low
6,Dragoons,2nd,Ryaner,24,94,Great
7,Dragoons,2nd,Sone,31,57,Good
8,Scouts,1st,Sloan,2,62,Good
9,Scouts,1st,Piger,3,70,Good


In [50]:
# Count the rows in each score band. The top-level pd.value_counts is deprecated,
# so use the Series method. Unused categories (here 'Okay') are still reported, with a count of 0.
df['categories'].value_counts()

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [51]:
# One-hot encode every non-numeric column (regiment, company, name, categories);
# the numeric score columns pass through unchanged.
pd.get_dummies(df)

Unnamed: 0,preTestScore,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scouts,company_1st,company_2nd,name_Ali,name_Cooze,name_Jacobson,...,name_Milner,name_Piger,name_Riani,name_Ryaner,name_Sloan,name_Sone,categories_Low,categories_Okay,categories_Good,categories_Great
0,4,25,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,24,94,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,31,57,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2,62,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,3,70,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,4,25,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,24,94,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,31,57,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,2,62,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,3,70,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [59]:
# Object-dtype ndarray of the frame's values, plus a separate working copy
# that later cells overwrite column by column.
raw_example = df.to_numpy()
data = raw_example.copy()
data

array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 'Milner', 2, 62, 'Good'],
       ['Dragoons', '1st', 'Cooze', 3, 70, 'Good'],
       ['Dragoons', '1st', 'Jacon', 4, 25, 'Low'],
       ['Dragoons', '2nd', 'Ryaner', 24, 94, 'Great'],
       ['Dragoons', '2nd', 'Sone', 31, 57, 'Good'],
       ['Scouts', '1st', 'Sloan', 2, 62, 'Good'],
       ['Scouts', '1st', 'Piger', 3, 70, 'Good'],
       ['Scouts', '2nd', 'Riani', 2, 62, 'Good'],
       ['Scouts', '2nd', 'Ali', 3, 70, 'Good']], dtype=object)

In [60]:
from sklearn import preprocessing
# Create the encoder and fit it on column 0 so it learns the label-to-integer mapping.
le = preprocessing.LabelEncoder().fit(raw_example[:, 0])
# Apply the learned mapping to turn the labels into integer codes.
le.transform(raw_example[:, 0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [63]:
# Positions of the string-valued columns in the array: regiment, company, name, categories.
label_column = [0, 1, 2, 5]
# One fitted encoder per entry of label_column, in the same order, kept so that
# the same mapping can be re-applied later.
label_encoder_list = []
for column_index in label_column:
    # A fresh encoder per column, so each keeps its own class-to-integer mapping.
    encoder = preprocessing.LabelEncoder()
    # Fit on the untouched raw values and write the integer codes into the working copy.
    data[:, column_index] = encoder.fit_transform(raw_example[:, column_index])
    label_encoder_list.append(encoder)
data[:3]

LabelEncoder()
LabelEncoder()
LabelEncoder()
LabelEncoder()


array([[1, 0, 4, 4, 25, 2],
       [1, 0, 2, 24, 94, 1],
       [1, 1, 0, 31, 57, 0]], dtype=object)

In [62]:
# Re-apply the stored encoder for column 0 (regiment) to the first ten raw rows.
label_encoder_list[0].transform(raw_example[:10,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2])

In [64]:
one_hot_enc = preprocessing.OneHotEncoder()
# Slicing with 0:1 keeps the column axis, so the result is two-dimensional with a single column.
data[:, 0:1]

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [65]:
# Fit on the first column, reshaped to two dimensions, so the encoder learns its set of categories.
one_hot_enc.fit(data[:,0].reshape(-1,1))

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [73]:
# Produce the one-hot matrix for the first column. The old `active_features_` attribute
# has been removed from scikit-learn and never held the encoded rows. transform() returns
# a sparse matrix by default, so convert it to a dense array for display.
one_hot_enc.transform(data[:, 0].reshape(-1, 1)).toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [74]:
# Two numeric columns to rescale, plus one categorical column.
df = pd.DataFrame(
    dict(
        A=[14.00, 90.20, 90.95, 96.27, 91.21],
        B=[103.02, 107.26, 110.35, 114.23, 114.68],
        C=['big', 'small', 'big', 'small', 'small'],
    )
)
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [76]:
# Min-max rescale column A so its smallest value maps to 1 and its largest to 5.
a_min, a_max = df["A"].min(), df["A"].max()
df["A"] = (df["A"] - a_min) / (a_max - a_min) * (5 - 1) + 1
df

Unnamed: 0,A,B,C
0,1.0,103.02,big
1,4.704874,107.26,small
2,4.741339,110.35,big
3,5.0,114.23,small
4,4.753981,114.68,small


In [77]:
# Standardise column B to zero mean and unit spread.
# Series.std() uses the sample standard deviation (ddof=1) by default.
b_mean = df["B"].mean()
b_std = df["B"].std()
df["B"] = (df["B"] - b_mean) / b_std
df

Unnamed: 0,A,B,C
0,1.0,-1.40525,big
1,4.704874,-0.54023,small
2,4.741339,0.090174,big
3,5.0,0.881749,small
4,4.753981,0.973556,small


In [78]:
# Read the first three columns of the wine data set through the public pd.read_csv
# entry point rather than the internal pd.io.parsers path.
# The file is read with header=None, so the column names are assigned afterwards.
WINE_DATA_URL = 'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv'
df = pd.read_csv(WINE_DATA_URL, header=None, usecols=[0, 1, 2])
df.columns = ['Class label', 'Alcohol', 'Malic acid']
df

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.20,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59
...,...,...,...
173,3,13.71,5.65
174,3,13.40,3.91
175,3,13.27,4.28
176,3,13.17,2.59


In [83]:
# Standardise the two feature columns; the fitted scaler is kept in std_scale for reuse.
wine_features = df[['Alcohol', 'Malic acid']]
std_scale = preprocessing.StandardScaler()
df_std = std_scale.fit_transform(wine_features)
df_std[:5]

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125],
       [ 1.69154964, -0.34681064],
       [ 0.29570023,  0.22769377]])

In [84]:
# Rescale the two feature columns with MinMaxScaler; the fitted scaler is kept in minmax_scale.
wine_features = df[['Alcohol', 'Malic acid']]
minmax_scale = preprocessing.MinMaxScaler()
df_minmax = minmax_scale.fit_transform(wine_features)
df_minmax[:3]

array([[0.84210526, 0.1916996 ],
       [0.57105263, 0.2055336 ],
       [0.56052632, 0.3201581 ]])