In [None]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

### The "str" accessor always works on a Series, but it can be applied to DataFrame columns to do many things

In [None]:
# Read the CSV and discard rows in which every field is missing.
chicago = (
    pd.read_csv("original_datasets/chicago.csv")
    .dropna(how="all")
)
chicago.head()
chicago.info()

In [None]:
# Number of distinct values per column: columns with few distinct values
# are candidates for the "category" dtype.
chicago.nunique()

In [None]:
# Convert the three text columns in a single pass: "Department" becomes
# categorical, while "Name" and "Position Title" use the "string" dtype.
chicago = chicago.astype(
    {
        "Department": "category",
        "Name": "string",
        "Position Title": "string",
    }
)
chicago.info()

## Common String Methods: .lower(), .upper(), .title(), .len() and the str accessor

In [None]:
"hello world".title() # Title case: capitalizes the first letter of each word -> "Hello World"

In [None]:
# Returns a new lowercase Series; the "Name" column itself is not modified.
chicago["Name"].str.lower()

In [None]:
# Overwrite the column with its title-cased version; the result of
# .str.title() has to be assigned back to take effect.
chicago["Position Title"] = chicago["Position Title"].str.title()

In [None]:
# Add a column holding the character count of each department name.
chicago = chicago.assign(Num_Letters=chicago["Department"].str.len())
chicago.head()

In [None]:
# chicago["Num_Letters"].str.upper() # This raises an AttributeError: the .str accessor only works on string values, and "Num_Letters" holds numbers

## The .str.replace() Method

In [None]:
# Returns a new Series with the substring replaced; the result is not
# assigned, so chicago["Department"] is left unchanged.
chicago["Department"].str.replace("MGMNT","MANAGEMENT")

In [None]:
# Remove the literal "$" character and convert the salaries to floats.
# regex=False is passed explicitly because "$" is a regex metacharacter
# (end-of-string anchor): pandas < 2.0 defaults to regex=True, so relying on
# the default makes the outcome depend on the installed pandas version.
# Neither .str.replace() nor .astype() works in place, hence the reassignment.
chicago["Employee Annual Salary"] = (
    chicago["Employee Annual Salary"]
    .str.replace("$", "", regex=False)
    .astype(float)
)
chicago["Employee Annual Salary"].sum()
chicago["Employee Annual Salary"].nlargest()

## Filtering with string methods (str methods returning booleans)

In [None]:
# Rows whose title contains "water" anywhere, ignoring letter case.
mask = chicago["Position Title"].str.contains("water", case=False)
chicago.loc[mask]

In [None]:
# Lowercase first so that the prefix comparison is case-insensitive.
mask = (
    chicago["Position Title"]
    .str.lower()
    .str.startswith("water")
)
chicago.loc[mask]

In [None]:
# Lowercase first so that the suffix comparison is case-insensitive.
mask = (
    chicago["Position Title"]
    .str.lower()
    .str.endswith("ist")
)
chicago.loc[mask]

## More string Methods: .strip() - .lstrip() - .rstrip()

In [None]:
# Remove leading and trailing whitespace; returns a new Series (not assigned back).
chicago["Name"].str.strip()

In [None]:
# Remove leading whitespace only; returns a new Series (not assigned back).
chicago["Position Title"].str.lstrip()

## String Methods on Index and Columns of the DF (Index class object also contains str)

In [None]:
# Use the "Name" column as the row index (reassigning the returned frame).
chicago = chicago.set_index("Name")

In [None]:
# The index exposes the same .str accessor as a Series: strip whitespace,
# title-case every label, and assign the cleaned labels back.
chicago.index = chicago.index.str.strip().str.title()
chicago.head(3)

In [None]:
# Upper-case every column label.
chicago = chicago.rename(columns=str.upper)
chicago.head(3)

## Split string by characters with .str.split() Method

In [None]:
# Take the text before the first comma of each index label, title-case it,
# and count how often each value occurs.
chicago.index.str.split(",").str.get(0).str.title().value_counts()

In [None]:
# Find the most common first word in the position titles:
# split on spaces, keep element 0 of each list, then count occurrences.
chicago["POSITION TITLE"].str.split(" ").str.get(0).value_counts()

In [None]:
# Find the most common name: take the part after the comma, trim it, and keep
# only its first word so that the initial letter some names carry is dropped.
(
    chicago.index
    .str.split(",").str.get(1)
    .str.strip()
    .str.split(" ").str.get(0)
    .value_counts()
)

In [None]:
# Using the expand and n parameters of str.split()
# On an Index, expand=True returns the pieces as a MultiIndex instead of lists.
chicago.index.str.split(",",expand=True)

In [None]:
# Split each title once, at the first space only (n=1), into two columns:
# the first word and the remainder. Compute it once, display it, then store it.
title_parts = chicago["POSITION TITLE"].str.split(" ", n=1, expand=True)
title_parts
chicago[["First Position Word", "Remain Words"]] = title_parts
chicago.head()