# Ch06 텍스트 데이터 다루기

In [1]:
import pandas as pd

## 6.1 대소문자 변형과 공백

In [2]:
# Read the inspections CSV (path is relative to this notebook) into a DataFrame
# and display it as the cell output.
inspections = pd.read_csv(
    filepath_or_buffer='../../DATA/chicago_food_inspections.csv',
)
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


In [3]:
# Summarize the DataFrame: index range, per-column non-null counts, dtypes and memory usage.
inspections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153810 entries, 0 to 153809
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    153810 non-null  object
 1   Risk    153744 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [4]:
# The `.str` attribute of a Series returns a StringMethods accessor object,
# which exposes vectorized string methods for the column's values.
inspections['Name'].str

<pandas.core.strings.accessor.StringMethods at 0x190895764c0>

In [5]:
# Display the raw 'Name' column; the values still carry extra surrounding whitespace.
inspections['Name']

0                 MARRIOT MARQUIS CHICAGO   
1                                JETS PIZZA 
2                                 ROOM 1520 
3                  MARRIOT MARQUIS CHICAGO  
4                              CHARTWELLS   
                         ...                
153805                           WOLCOTT'S  
153806       DUNKIN DONUTS/BASKIN-ROBBINS   
153807                             Cafe 608 
153808                          mr.daniel's 
153809                           TEMPO CAFE 
Name: Name, Length: 153810, dtype: object

In [6]:
# str.strip() returns a NEW Series with leading/trailing whitespace removed;
# the DataFrame itself is unchanged until the result is assigned back.
inspections['Name'].str.strip()

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [7]:
# Overwrite the 'Name' column with its whitespace-stripped version.
inspections['Name'] = inspections['Name'].str.strip()

In [8]:
# Display 'Name' again to confirm the surrounding whitespace is gone.
inspections['Name']

0              MARRIOT MARQUIS CHICAGO
1                           JETS PIZZA
2                            ROOM 1520
3              MARRIOT MARQUIS CHICAGO
4                           CHARTWELLS
                      ...             
153805                       WOLCOTT'S
153806    DUNKIN DONUTS/BASKIN-ROBBINS
153807                        Cafe 608
153808                     mr.daniel's
153809                      TEMPO CAFE
Name: Name, Length: 153810, dtype: object

In [9]:
# Strip leading/trailing whitespace from every text column.
# The `.str` accessor raises AttributeError on columns that do not hold strings
# (e.g. numeric columns), so columns whose dtype is not object/string are skipped.
# The dtype (not the Series) is passed so the check does not depend on the
# column's contents, such as the presence of NaN.
for column in inspections.columns:
    if pd.api.types.is_string_dtype(inspections[column].dtype):
        inspections[column] = inspections[column].str.strip()

In [10]:
# Re-check the DataFrame summary (non-null counts, dtypes) after stripping whitespace.
inspections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153810 entries, 0 to 153809
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    153810 non-null  object
 1   Risk    153744 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [11]:
# str.title() capitalizes the first letter of each word and lower-cases the rest.
# Note that a letter following an apostrophe is also capitalized (e.g. "Wolcott'S").
inspections['Name'].str.title()

0              Marriot Marquis Chicago
1                           Jets Pizza
2                            Room 1520
3              Marriot Marquis Chicago
4                           Chartwells
                      ...             
153805                       Wolcott'S
153806    Dunkin Donuts/Baskin-Robbins
153807                        Cafe 608
153808                     Mr.Daniel'S
153809                      Tempo Cafe
Name: Name, Length: 153810, dtype: object

## 6.2 문자열 슬라이싱

In [14]:
# Count the distinct values in each column (NaN is not counted by default).
inspections.nunique()

Name    24685
Risk        4
dtype: int64

In [15]:
# List the distinct 'Risk' values; unlike nunique(), unique() includes NaN.
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [16]:
# Discard the rows whose 'Risk' value is missing (NaN).
inspections = inspections.dropna(
    axis='index',
    subset=['Risk'],
)

In [17]:
# Verify that NaN no longer appears among the distinct 'Risk' values.
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [18]:
# Replace the 'All' label with 'Risk 4 (Extreme)' in the 'Risk' column only.
# The nested-dict form {column: {old: new}} scopes the replacement to that column;
# a bare to_replace='All' is applied to every column and would also rewrite a
# 'Name' value that is exactly 'All'.
inspections = inspections.replace(to_replace={'Risk': {'All': 'Risk 4 (Extreme)'}})

In [19]:
# Verify that 'All' now appears as 'Risk 4 (Extreme)' among the distinct 'Risk' values.
inspections['Risk'].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

## 6.3 문자열 슬라이싱과 문자 치환

In [20]:
# Extract the single character at index 5 (stop index 6 is exclusive), i.e. the
# risk-level digit in strings such as 'Risk 1 (High)'; preview the first rows.
inspections['Risk'].str.slice(5, 6).head()

0    1
1    2
2    3
3    1
4    1
Name: Risk, dtype: object

In [22]:
# Same extraction as str.slice(5, 6), written with Python slice syntax on the .str accessor.
inspections['Risk'].str[5:6].head()

0    1
1    2
2    3
3    1
4    1
Name: Risk, dtype: object

In [23]:
# Take everything from index 8 onward: the risk label, still followed by ')'.
inspections['Risk'].str[8:].head()

0      High)
1    Medium)
2       Low)
3      High)
4      High)
Name: Risk, dtype: object

In [24]:
# Stop at -1 to exclude the last character, dropping the closing parenthesis
# and leaving only the label (e.g. 'High').
inspections['Risk'].str[8:-1].head()

0      High
1    Medium
2       Low
3      High
4      High
Name: Risk, dtype: object

## 6.4 불리언 메서드

In [25]:
# Python's `in` operator checks for a substring; here the casing matches, so it is True.
'Pizza' in 'Jets Pizza'

True

In [26]:
# The substring check is case-sensitive: 'Pizza' is not found in the all-caps string.
'Pizza' in 'JETS PIZZA'

False

In [27]:
# Lower-case the names first so the substring match for "pizza" ignores the
# original casing; preview the first boolean results.
inspections['Name'].str.lower().str.contains("pizza").head()

0    False
1     True
2    False
3    False
4    False
Name: Name, dtype: bool

In [28]:
# Boolean mask: True where the lower-cased name contains "pizza".
has_pizza = (
    inspections['Name']
    .str.lower()
    .str.contains("pizza")
)
# Keep only the rows flagged by the mask.
inspections.loc[has_pizza]

Unnamed: 0,Name,Risk
1,JETS PIZZA,Risk 2 (Medium)
19,NANCY'S HOME OF STUFFED PIZZA,Risk 1 (High)
27,"NARY'S GRILL & PIZZA ,INC.",Risk 1 (High)
29,NARYS GRILL & PIZZA,Risk 1 (High)
68,COLUTAS PIZZA,Risk 1 (High)
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,Risk 1 (High)
153764,COCHIAROS PIZZA #2,Risk 1 (High)
153772,FERNANDO'S MEXICAN GRILL & PIZZA,Risk 1 (High)
153788,REGGIO'S PIZZA EXPRESS,Risk 1 (High)


In [30]:
# Filter the restaurants whose name starts with 'tacos'
# (names are lower-cased first, so the original casing does not matter).
starts_with_tacos = (
    inspections['Name']
    .str.lower()
    .str.startswith('tacos')
)
inspections.loc[starts_with_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,Risk 1 (High)
556,TACOS EL TIO 2 INC.,Risk 1 (High)
675,TACOS DON GABINO,Risk 1 (High)
958,TACOS EL TIO 2 INC.,Risk 1 (High)
1036,TACOS EL TIO 2 INC.,Risk 1 (High)
...,...,...
143587,TACOS DE LUNA,Risk 1 (High)
144026,TACOS GARCIA,Risk 1 (High)
146174,Tacos Place's 1,Risk 1 (High)
147810,TACOS MARIO'S LIMITED,Risk 1 (High)


In [31]:
# Filter the restaurants whose name ends with 'tacos'
# (names are lower-cased first, so the original casing does not matter).
ends_with_tacos = (
    inspections['Name']
    .str.lower()
    .str.endswith('tacos')
)
inspections.loc[ends_with_tacos]

Unnamed: 0,Name,Risk
382,LAZO'S TACOS,Risk 1 (High)
569,LAZO'S TACOS,Risk 1 (High)
2652,FLYING TACOS,Risk 3 (Low)
3250,JONY'S TACOS,Risk 1 (High)
3812,PACO'S TACOS,Risk 1 (High)
...,...,...
151121,REYES TACOS,Risk 1 (High)
151318,EL MACHO TACOS,Risk 1 (High)
151801,EL MACHO TACOS,Risk 1 (High)
153087,RAYMOND'S TACOS,Risk 1 (High)
