In [2]:
import numpy as np
import pandas as pd

In [3]:
# Build a 3x4 frame holding the integers 0..11, with state names as row labels
# and ordinal words as column labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'New York'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [4]:
def transform(x):
    """Return the first four characters of ``x``, upper-cased.

    Strings shorter than four characters are upper-cased whole; whitespace
    inside the first four characters is kept (e.g. 'New York' -> 'NEW ').
    """
    return x[:4].upper()
# Index.map() returns a new Index; data.index is not modified by this call.
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [5]:
# Still the original labels, since the mapped Index above was not assigned back.
data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [6]:
# Assign the mapped Index back to relabel the rows of data.
# Note: 'New York'[:4] is 'New ', so that label becomes 'NEW ' with a trailing space.
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [7]:
# rename() with functions applies them to every label and returns a new frame;
# data keeps its own labels.
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [9]:
# With dicts, only the listed labels are replaced; all others are kept as they are.
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [10]:
# inplace=True modifies data itself instead of returning a renamed copy.
data.rename(index={'OHIO':'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [11]:
# Sample ages, grouped into bins in the following cells.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [12]:
# Age groups produced by these bin edges (each interval includes its right edge):
#   (18, 25], (25, 35], (35, 60], (60, 100]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [13]:
cats.codes # each age replaced by the integer code (0-3) of its bin

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [14]:
# The interval categories themselves.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [15]:
# Count the ages per bin, most frequent bin first. Series.value_counts() is used
# instead of the top-level pd.value_counts() helper, which newer pandas
# releases deprecate.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [17]:
 # right=False: each bin includes its left edge and excludes its right edge.
pd.cut(ages,[18,26,36,61,100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [46]:
# One label per bin, in bin order.
group_names= ['Youth','YoungAdult','MiddleAged', 'Senior']

In [47]:
# labels= names the bins with group_names instead of interval notation.
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [48]:
# NOTE: rebinds data (previously the DataFrame) to a NumPy array.
data = np.random.rand(20) # 20 random values between 0 and 1
data

array([0.44370579, 0.79193043, 0.37366376, 0.65866502, 0.19319518,
       0.49824657, 0.25413583, 0.36344019, 0.42416064, 0.62143686,
       0.37434839, 0.63845651, 0.8191683 , 0.37090163, 0.16235125,
       0.79681173, 0.83989097, 0.55233413, 0.96538765, 0.28737309])

In [49]:
pd.cut(data, 4, precision=2) # split into 4 bins; bin edges shown to 2 decimal places

[(0.36, 0.56], (0.76, 0.97], (0.36, 0.56], (0.56, 0.76], (0.16, 0.36], ..., (0.76, 0.97], (0.76, 0.97], (0.36, 0.56], (0.76, 0.97], (0.16, 0.36]]
Length: 20
Categories (4, interval[float64]): [(0.16, 0.36] < (0.36, 0.56] < (0.56, 0.76] < (0.76, 0.97]]

In [65]:
# 1000 x 4 frame of draws from np.random.randn; rebinds data once more.
data = pd.DataFrame(np.random.randn(1000,4))
data

Unnamed: 0,0,1,2,3
0,-1.257157,2.284930,-1.547336,0.944234
1,-0.070452,-0.648378,-0.746303,-2.022010
2,1.246583,0.596281,-1.049777,-0.714953
3,-1.486662,-0.445741,-0.427190,-0.437817
4,0.820851,-0.030738,-1.287146,-0.294115
...,...,...,...,...
995,0.096511,-0.362473,0.491337,0.698800
996,1.942020,1.116874,-0.826199,0.881202
997,0.259118,-1.784964,-0.240366,0.222332
998,-1.290383,0.215625,0.586637,0.792792


In [66]:
# Summary statistics for each column.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.032263,0.016289,-0.010505,0.025621
std,0.961885,0.978959,0.981468,0.986428
min,-2.869024,-3.285945,-3.813668,-3.194433
25%,-0.670792,-0.650967,-0.63138,-0.66228
50%,-0.025082,-0.016781,-0.045631,0.018268
75%,0.592749,0.679888,0.621278,0.67339
max,2.65689,3.581579,3.778719,3.767112


In [67]:
# Select the column labelled 2 as a Series.
data[2]

0     -1.547336
1     -0.746303
2     -1.049777
3     -0.427190
4     -1.287146
         ...   
995    0.491337
996   -0.826199
997   -0.240366
998    0.586637
999    0.399457
Name: 2, Length: 1000, dtype: float64

In [68]:
# Values in column 2 whose absolute value exceeds 3.
col = data[2]
col[np.abs(col)>3]

219   -3.753118
338    3.303766
346    3.060958
593    3.778719
638    3.023022
700   -3.813668
Name: 2, dtype: float64

In [69]:
# Rows where at least one column exceeds 3 in absolute value.
# axis is passed by keyword: pandas >= 2.0 rejects the positional form any(1).
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
6,1.888899,-1.238685,-0.477588,3.16048
60,-0.840642,1.628622,1.438913,3.039066
155,0.338253,-0.134715,0.219476,-3.194433
219,-1.212295,0.475752,-3.753118,-1.545686
278,-1.087007,3.581579,-0.185899,0.742013
338,0.142786,-0.266471,3.303766,-1.202795
346,0.101731,0.369705,3.060958,-0.049217
547,-0.046255,-0.850875,-0.141968,3.767112
564,-0.644403,-3.285945,0.866651,-0.735484
593,-0.020816,0.024498,3.778719,-0.967765


In [96]:
# Cap values outside [-3, 3]: entries with |value| > 3 are replaced by
# sign(value) * 3, i.e. -3 or 3.
data[np.abs(data)>3]= np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.032263,0.015993,-0.010105,0.024841
std,0.961885,0.976082,0.972079,0.982549
min,-2.869024,-3.0,-3.0,-3.0
25%,-0.670792,-0.650967,-0.63138,-0.66228
50%,-0.025082,-0.016781,-0.045631,0.018268
75%,0.592749,0.679888,0.621278,0.67339
max,2.65689,3.0,3.0,3.0


In [97]:
# np.sign() maps each value to its sign (shown here as -1.0 / 1.0).
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0
4,1.0,-1.0,-1.0,-1.0


In [98]:
# First five rows of data, for comparison with the signs above.
data.head()

Unnamed: 0,0,1,2,3
0,-1.257157,2.28493,-1.547336,0.944234
1,-0.070452,-0.648378,-0.746303,-2.02201
2,1.246583,0.596281,-1.049777,-0.714953
3,-1.486662,-0.445741,-0.42719,-0.437817
4,0.820851,-0.030738,-1.287146,-0.294115


In [110]:
# 5x4 frame of the integers 0..19 with default integer row and column labels.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [111]:
# A random ordering of the integers 0..4, used below as row positions.
sampler = np.random.permutation(5)
sampler

array([3, 2, 1, 0, 4])

In [182]:
# Reorder the rows of df by the positions in sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


In [115]:
# Pick 3 rows at random.
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [121]:
# Draw 10 values from a 5-element Series, sampling with replacement so that
# labels repeat in the result.
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(replace=True, n=10)
draws

0    5
4    4
0    5
3    6
2   -1
1    7
1    7
1    7
0    5
2   -1
dtype: int64

In [136]:
# Small frame with a categorical-like 'key' column and a running counter 'data1'.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [137]:
dummies=pd.get_dummies(df['key']) # one-hot encoding: one indicator column per distinct key
dummies

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [183]:
# prefix= prepends 'key_' to each indicator column name.
dummies=pd.get_dummies(df['key'], prefix='key' ) 
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [184]:
# Join the indicator columns onto the data1 column.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [140]:
# Column names for the header-less movies file.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator is not supported by the default C parser, so the
# python engine is requested explicitly instead of relying on the fallback
# (which emits a ParserWarning).
movies = pd.read_table('data/movies.dat', sep='::', header=None, names=mnames,
                       engine='python')
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [191]:
# Flatten every '|'-separated genre string into one list of genre names.
all_genres = [genre for entry in movies.genres for genre in entry.split('|')]
# Distinct genre names in order of first appearance. The list is wrapped in a
# Series because handing a plain list to pd.unique() is deprecated in newer pandas.
genres = pd.Series(all_genres, dtype=object).unique()
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [192]:
# Start from an all-zero frame: one row per movie, one column per genre.
zero_matrix = np.zeros((len(movies),len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
# Worked example for the first movie: get_indexer() turns its genre names into
# the integer positions of the matching dummies columns.
gen = movies.genres[0]
gen.split('|')  # result is discarded; only the last expression of the cell is displayed
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [194]:
# For each movie, set the columns of its genres to 1 in that movie's row.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [197]:
pd.set_option('display.max_columns',12) # show at most 12 columns: the first 6 and last 6, with '...' in between
# pd.set_option('display.max_row',500)
# Prefix the indicator columns with 'Genre_' and join them onto movies.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.head(3)

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,...,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
# Seed the global NumPy generator before drawing the 10 random values.
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [165]:
# Bin the values into five intervals of width 0.2, then one-hot encode the bins.
# NOTE: rebinds bins, which held the age bin edges earlier.
bins = [0,0.2,0.4,0.6,0.8,1.0]
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [170]:
# split() keeps the surrounding whitespace in each piece.
val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

In [171]:
# strip() each piece to remove its leading/trailing whitespace.
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [172]:
# Unpack the three pieces and concatenate them with '::' as the delimiter.
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [174]:
# Substring membership test.
'guido' in val

True

In [176]:
# Position of the first ','.
val.index(',')

1

In [177]:
# find() returns -1 when the substring is not present.
val.find(':')

-1

In [178]:
# val contains no ':' character, so str.index() raises ValueError (str.find()
# returns -1 instead). The error is caught and printed so the notebook still
# runs top to bottom.
try:
    val.index(':')
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: substring not found

In [179]:
# Number of occurrences of ','.
val.count(',')

2

In [180]:
# Replace every ',' with '::'.
val.replace(',','::')

'a::b::  guido'

In [181]:
# Replacing with an empty string removes every ','.
val.replace(',','')

'ab  guido'