### [참고] <a href="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</a>

#### https://pandas.pydata.org/docs/user_guide/text.html

In [None]:
# Plain-Python string handling: str methods are called directly on the value.
s = "ABCDE"

s.lower()

**str 메소드**
- pandas.core.strings.StringMethods 의 별칭
- `str` 자체는 호출하는 메소드가 아니라 접근자(accessor) 속성이므로 `str` 뒤에는 괄호를 붙이지 않음 (예: `series.str.lower()`)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Sample data for the .str accessor demos: three strings and one missing value.
series = pd.Series(
    ["Suho", "AA", np.nan, "Rabbit"],
)
series

0      Suho
1        AA
2       NaN
3    Rabbit
dtype: object

#### 1) lower() / upper() / len()

In [None]:
# In pandas, string methods are reached through the .str accessor.
# Lower-case every string element; the NaN entry stays NaN.

series.str.lower()

0      suho
1        aa
2       NaN
3    rabbit
dtype: object

In [None]:
# Upper-case every string element; the NaN entry stays NaN.
series.str.upper()

0      SUHO
1        AA
2       NaN
3    RABBIT
dtype: object

In [None]:
# Length of each string; the missing entry yields NaN, so the result is float64.
series.str.len()

0    4.0
1    2.0
2    NaN
3    6.0
dtype: float64

#### 2) strip()

In [None]:
# Same sample data, but with trailing / leading spaces to demonstrate strip().
series = pd.Series(
    ["Suho    ", "AA", np.nan, "    Rabbit"],
)
series

0      Suho    
1            AA
2           NaN
3        Rabbit
dtype: object

In [None]:
# Remove leading and trailing whitespace from every string element.
series.str.strip()

0      Suho
1        AA
2       NaN
3    Rabbit
dtype: object

In [None]:
# Build a 2x2 frame of standard-normal samples whose column labels carry
# leading/trailing spaces (the following cells clean those labels up).
# A seeded generator keeps the values identical on every Restart & Run All.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((2, 2)),
    columns=[" Column A ", " Column B "],
)
df

Unnamed: 0,Column A,Column B
0,0.610187,-0.92801
1,-2.773809,0.957417


In [None]:
# Inspect the column labels (note the surrounding spaces in each label).

df.columns

Index([' Column A ', ' Column B '], dtype='object')

In [None]:
# df["Column A"] raises an error here, because the stored label still has its spaces.
# The lookup only works with the exact label, spaces included.
df[' Column A ']

0    0.610187
1   -2.773809
Name:  Column A , dtype: float64

In [None]:
# Strip surrounding whitespace from the column labels.
# The result is only displayed, not assigned back, so df keeps its original labels.

df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

In [None]:
# Lower-case the column labels.
# The spaces are still present because the earlier strip() was never assigned back.

df.columns.str.lower()

Index([' column a ', ' column b '], dtype='object')

In [None]:
# Normalize the column labels: trim surrounding whitespace, lower-case everything,
# then turn the remaining inner spaces into underscores.
# regex=False states explicitly that " " is a literal, so the result does not
# depend on the installed pandas version's default for `regex`.

df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
df.columns

Index(['column_a', 'column_b'], dtype='object')

In [None]:
# Display the frame again to confirm the cleaned-up column labels.
df

Unnamed: 0,column_a,column_b
0,0.610187,-0.92801
1,-2.773809,0.957417


#### 3) split()

In [None]:
# Underscore-delimited sample strings (plus one missing value) for the split() demos.
ser = pd.Series(
    ["ha_a_b", "hi_c_d", np.nan, "ho_e_f"],
)
ser

0    ha_a_b
1    hi_c_d
2       NaN
3    ho_e_f
dtype: object

In [None]:
# Split every string on "_"; each row becomes a list of pieces and the NaN row stays NaN.
list1 = ser.str.split("_")
list1

0    [ha, a, b]
1    [hi, c, d]
2           NaN
3    [ho, e, f]
dtype: object

In [None]:
# Iterate over the split results: list rows print as lists, the missing row prints as nan.
for item in list1:
  print(item)

['ha', 'a', 'b']
['hi', 'c', 'd']
nan
['ho', 'e', 'f']


In [None]:
# Entry at index 0 of the split result: the list of pieces from the first row.
list1[0]

['ha', 'a', 'b']

In [None]:
# Same lookup without the intermediate variable: split, then take the entry at index 0.
ser.str.split("_")[0]

['ha', 'a', 'b']

In [None]:
# Split, then take the entry at index 1 (the pieces of the second row).
ser.str.split("_")[1]

['hi', 'c', 'd']

* **expand=True** 결과를 데이터 프레임으로 돌려줌

In [None]:
# expand=True returns the pieces as separate DataFrame columns instead of one list per row.
ser.str.split("_",expand=True)

Unnamed: 0,0,1,2
0,ha,a,b
1,hi,c,d
2,,,
3,ho,e,f


#### 4) replace()

In [None]:
# Sample strings (plus one missing value) for the replace() demo.
ser = pd.Series(
    ["Suho", "bAAa", np.nan, "cute_dog"],
)
ser

0        Suho
1        bAAa
2         NaN
3    cute_dog
dtype: object

In [None]:
# replace(pattern, replacement)
# Regular expressions can be used for the pattern.

# bAAa : ^.a  -> with case=False the leading "bA" matches, giving "***Aa"
ser.str.replace("^.a|dog","***",case=False,regex=True)

0        Suho
1       ***Aa
2         NaN
3    cute_***
dtype: object

- ^ 는 문자열의 시작
- .a 는 임의의 한 문자 뒤에 a 가 오는 부분을 매칭 (따라서 ^.a 는 문자열 맨 앞의 "임의의 한 문자 + a")
- | or 의 의미
- dog에 해당하는 부분 매칭
- case=False 대소문자 구분 안함
- regex=True 전달된 패턴이 정규식

#### 5) cat : 텍스트 이어 붙이기

In [None]:
# Three short strings used to demonstrate str.cat (text concatenation).
ser = pd.Series(
    ["ha", "hi", "ho"],
)
ser

0    ha
1    hi
2    ho
dtype: object

In [None]:
# Join all elements into one string, separated by commas.
ser.str.cat(sep=",")

'ha,hi,ho'

In [None]:
# With no separator given, the elements are concatenated directly.
ser.str.cat()

'hahiho'

* **결측값(NaN)은 기본적으로 건너뛰고 연결하며, na_rep 을 지정하면 그 문자열로 대체되어 연결됨**

In [None]:
# Same idea, but with a missing value in the middle to show how str.cat treats NaN.
ser = pd.Series(
    ["ha", np.nan, "ho"],
)
ser

0     ha
1    NaN
2     ho
dtype: object

In [None]:
# By default the missing value is skipped, so only the non-missing elements are joined.
ser.str.cat(sep=",")

'ha,ho'

In [None]:
# na_rep supplies a placeholder that stands in for each missing value in the joined string.
ser.str.cat(sep=",",na_rep="*")

'ha,*,ho'

### [실습2] 일기 형식의 데이터 가공

In [None]:
# Practice data: seven free-text diary entries, stored one per row in a
# single "schedule" column.
day_plan = [
    "1st_seq : getting up at 05:45am",
    "2nd_seq : swimming from 06:00 to 07:00am",
    "3rd_seq : My morning food is American style",
    "4th_seq : Writing some proposal from 02:00pm to 06:00pm",
    "5th_seq : Arriving at JongGak at 07:00pm",
    "6th_seq : Fun with friends enjoy beer till 09:30pm",
    "7th_seq : My house at 10:30pm and sleeping by 12:00pm",
]
df = pd.DataFrame({"schedule": day_plan})
df

Unnamed: 0,schedule
0,1st_seq : getting up at 05:45am
1,2nd_seq : swimming from 06:00 to 07:00am
2,3rd_seq : My morning food is American style
3,4th_seq : Writing some proposal from 02:00pm t...
4,5th_seq : Arriving at JongGak at 07:00pm
5,6th_seq : Fun with friends enjoy beer till 09:...
6,7th_seq : My house at 10:30pm and sleeping by ...


In [None]:
# split() with no argument splits each entry on whitespace.

df['schedule'].str.split()

0               [1st_seq, :, getting, up, at, 05:45am]
1     [2nd_seq, :, swimming, from, 06:00, to, 07:00am]
2    [3rd_seq, :, My, morning, food, is, American, ...
3    [4th_seq, :, Writing, some, proposal, from, 02...
4     [5th_seq, :, Arriving, at, JongGak, at, 07:00pm]
5    [6th_seq, :, Fun, with, friends, enjoy, beer, ...
6    [7th_seq, :, My, house, at, 10:30pm, and, slee...
Name: schedule, dtype: object

In [None]:
# Count the whitespace-separated pieces in each entry (length of each split list).

df['schedule'].str.split().str.len()

0     6
1     7
2     8
3     9
4     7
5     9
6    10
Name: schedule, dtype: int64

In [None]:
# Check, row by row, whether the entry contains "My" (returns a boolean Series).

df['schedule'].str.contains("My")

0    False
1    False
2     True
3    False
4    False
5    False
6     True
Name: schedule, dtype: bool

In [None]:
# Use the boolean result as a mask to keep only the rows whose schedule contains "My".
df[df['schedule'].str.contains("My")]

Unnamed: 0,schedule
2,3rd_seq : My morning food is American style
6,7th_seq : My house at 10:30pm and sleeping by ...


In [None]:
# Count how many digit characters each entry contains, using str.count().
# The pattern is a raw string so that "\d" reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape.

# Character-class alternative for this ASCII data: df['schedule'].str.count("[0-9]")
df['schedule'].str.count(r"\d")

0    5
1    9
2    1
3    9
4    5
5    5
6    9
Name: schedule, dtype: int64

In [None]:
# Extract every individual digit character from each entry.
# The pattern is a raw string so that "\d" is not parsed as an (invalid) Python string escape.

df['schedule'].str.findall(r"\d")

0                [1, 0, 5, 4, 5]
1    [2, 0, 6, 0, 0, 0, 7, 0, 0]
2                            [3]
3    [4, 0, 2, 0, 0, 0, 6, 0, 0]
4                [5, 0, 7, 0, 0]
5                [6, 0, 9, 3, 0]
6    [7, 1, 0, 3, 0, 1, 2, 0, 0]
Name: schedule, dtype: object

In [None]:
# Find every match of a regular-expression pattern in each entry.
# Here the pattern is an hh:mm time such as 09:00; the two capture groups make
# each match a (hours, minutes) tuple.
# Raw strings keep "\d" from being parsed as an (invalid) Python string escape.

# Equivalent spelling without a quantifier: df['schedule'].str.findall(r"(\d\d):(\d\d)")
df['schedule'].str.findall(r"(\d{2}):(\d{2})")

0              [(05, 45)]
1    [(06, 00), (07, 00)]
2                      []
3    [(02, 00), (06, 00)]
4              [(07, 00)]
5              [(09, 30)]
6    [(10, 30), (12, 00)]
Name: schedule, dtype: object