In [1]:
import numpy as np
import pandas as pd

# https://pandas.pydata.org/docs/user_guide/text.html

In [2]:
# Plain-Python baseline before moving to pandas: split a string on '@'.
# NOTE(review): this literal contains '=' rather than '@', so no separator is
# found and split() returns a one-element list holding the whole string.
# Possibly a typo for 'xyz@email.com' — confirm intent.
email = 'xyz=email.com'
email.split('@')

['xyz=email.com']

In [3]:
# Small Series of lowercase first names, used to demonstrate the .str accessor.
names = pd.Series(['andrew', 'bobo', 'carl', 'david'])
names

0    andrew
1      bobo
2      carl
3     david
dtype: object

In [4]:
# Vectorised upper-casing via the .str accessor; `names` is not reassigned.
names.str.upper()

0    ANDREW
1      BOBO
2      CARL
3     DAVID
dtype: object

In [5]:
# Built-in str method on a single string: yields one bool.
email.isdigit()

False

In [6]:
# The same check applied element-wise through .str: one bool per entry.
names.str.isdigit()

0    False
1    False
2    False
3    False
dtype: bool

In [7]:
# Two strings, each holding three comma-separated ticker symbols.
# NOTE(review): 'APPL' may be a typo for Apple's ticker 'AAPL' — confirm.
tech_finance = ['GOOG,APPL,AMZN', 'JPM,BAC,WFC']
len(tech_finance)

2

In [8]:
# Wrap the list in a Series so the .str accessor can be applied to it.
tickers = pd.Series(tech_finance)
tickers

0    GOOG,APPL,AMZN
1       JPM,BAC,WFC
dtype: object

In [10]:
# Split every string on commas; each element becomes a list of symbols.
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1       [JPM, BAC, WFC]
dtype: object

In [13]:
# expand=True spreads the split pieces across columns 0, 1, 2, producing a
# DataFrame instead of a Series of lists.
tech = tickers.str.split(',', expand=True)
tech

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,WFC


In [15]:
# Column 0 holds the first symbol from each original string.
tech[0]

0    GOOG
1     JPM
Name: 0, dtype: object

In [16]:
# Single label-based lookup (row 0, column 0) for the first symbol.
tech.loc[0, 0]

'GOOG'

In [20]:
# Names with stray whitespace and a stray ';' to clean up in the next step.
messy_names = pd.Series([
    'andrew ',
    'bo;bo',
    '    carl   ',
])
messy_names

0        andrew 
1          bo;bo
2        carl   
dtype: object

In [24]:
# Clean the messy names in one chain:
#   1. drop the stray ';'  2. trim surrounding whitespace  3. capitalise.
# regex=False is passed explicitly so ';' is always treated as a literal
# string, independent of the pandas version's default for `regex`.
names = (
    messy_names
    .str.replace(';', '', regex=False)
    .str.strip()
    .str.capitalize()
)
names

0    Andrew
1      Bobo
2      Carl
dtype: object

Date and Time Methods in pandas

In [25]:
from datetime import datetime

In [26]:
# Components of an example timestamp: 2015-02-17 04:45:30.
myyear, mymonth, myday = 2015, 2, 17
myhour, mymin, mysec = 4, 45, 30

# Only the date parts are supplied here, so the time defaults to midnight.
mydate = datetime(year=myyear, month=mymonth, day=myday)
mydate

datetime.datetime(2015, 2, 17, 0, 0)

In [27]:
# Supplying hour, minute and second as well gives a full timestamp rather than
# one fixed at midnight.
mydatetime = datetime(myyear, mymonth, myday, myhour, mymin, mysec)
mydatetime

datetime.datetime(2015, 2, 17, 4, 45, 30)

In [33]:
# Render the date as day-month-year text.
datestr = mydate.strftime("%d-%m-%Y")
datestr

'17-02-2015'

In [37]:
# Mixed input: a month-name date, a day-first slash date, and a missing value.
myser = pd.Series(['Mar 21, 2002', '19/11/2022', None])
myser

0    Mar 21, 2002
1      19/11/2022
2            None
dtype: object

In [42]:
# Before conversion the element is still a plain string.
myser[0]

'Mar 21, 2002'

In [44]:
# Convert the strings to datetimes. dayfirst=True makes '19/11/2022' read as
# 19 November, and the None entry becomes NaT.
# NOTE(review): pandas 2.x infers one format from the first value, so mixed
# layouts like these may need format='mixed' there — confirm against the
# installed version.
timser = pd.to_datetime(myser, dayfirst=True)
timser

0   2002-03-21
1   2022-11-19
2          NaT
dtype: datetime64[ns]

In [45]:
# After conversion the element is a pandas Timestamp.
timser[0]

Timestamp('2002-03-21 00:00:00')

In [46]:
# Timestamps expose calendar fields such as .year directly.
timser[0].year

2002

In [51]:
# Ambiguous numeric date: with no hints it is read month-first (3 April 2002).
euro_date = '04-03-2002'
pd.to_datetime(euro_date)

Timestamp('2002-04-03 00:00:00')

In [52]:
# dayfirst=True switches the interpretation to 4 March 2002.
pd.to_datetime(euro_date, dayfirst=True)

Timestamp('2002-03-04 00:00:00')

In [53]:
# Unusual '--' separators: spell out the layout with an explicit strftime-style
# format (%d day, %b abbreviated month name, %Y four-digit year).
style_date = '25--Dec--2011'
pd.to_datetime(style_date, format='%d--%b--%Y')

Timestamp('2011-12-25 00:00:00')

In [54]:
# Free-form text with no format given; the recorded output shows it parsed as
# 11 October 1991.
custom_date = '11th of Oct 1991'
pd.to_datetime(custom_date)

Timestamp('1991-10-11 00:00:00')

In [56]:
# Load the raw CSV (path is relative to the notebook's working directory).
# With no parse options, DATE is read as strings (dtype object).
sales = pd.read_csv('RetailSales_BeerWineLiquor.csv')

In [57]:
# Display the frame: a DATE column and the MRTSSM4453USN sales column.
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [58]:
# Inspect the DATE column as loaded; dtype object means plain strings.
sales['DATE']

0      1992-01-01
1      1992-02-01
2      1992-03-01
3      1992-04-01
4      1992-05-01
          ...    
335    2019-12-01
336    2020-01-01
337    2020-02-01
338    2020-03-01
339    2020-04-01
Name: DATE, Length: 340, dtype: object

In [61]:
# Replace the string column with parsed datetimes (dtype datetime64[ns]).
sales['DATE'] = pd.to_datetime(sales['DATE'])
sales['DATE']

0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]

In [62]:
# Year of the first DATE entry, fetched with a single .loc lookup.
sales.loc[0, 'DATE'].year

1992

In [63]:
# Alternative to converting afterwards: ask read_csv to parse column 0 (DATE)
# as datetimes while loading.
sales = pd.read_csv('RetailSales_BeerWineLiquor.csv', parse_dates=[0])
sales['DATE']

0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]

In [64]:
# Yearly mean of the sales column. resample needs a datetime index, so DATE is
# moved into the index only for this expression; `sales` itself is not rebound.
# This keeps the cell idempotent: re-running it no longer fails with a
# KeyError on 'DATE' because the column was already consumed.
# rule='A' buckets by calendar year; labels fall on 31 December.
# NOTE(review): newer pandas releases spell this alias 'YE' — confirm against
# the installed version.
sales.set_index('DATE').resample(rule='A').mean()

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-12-31,1807.25
1993-12-31,1794.833333
1994-12-31,1841.75
1995-12-31,1833.916667
1996-12-31,1929.75
1997-12-31,2006.75
1998-12-31,2115.166667
1999-12-31,2206.333333
2000-12-31,2375.583333
2001-12-31,2468.416667


In [65]:
# Fresh load with DATE parsed on read; info() confirms the datetime64[ns]
# dtype and shows no missing values.
sales = pd.read_csv('RetailSales_BeerWineLiquor.csv', parse_dates=[0])
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           340 non-null    datetime64[ns]
 1   MRTSSM4453USN  340 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 5.4 KB


In [66]:
# .dt is the datetime counterpart of .str: element-wise access to fields such
# as year across the whole column.
sales['DATE'].dt.year

0      1992
1      1992
2      1992
3      1992
4      1992
       ... 
335    2019
336    2020
337    2020
338    2020
339    2020
Name: DATE, Length: 340, dtype: int64