In [1]:
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta

# Pandas Datetime [25 exercises with solution]

In [32]:
df = pd.read_csv('data/ufo_sighting_data.csv', 
                 parse_dates=[0, 8], 
                 dtype={'length_of_encounter_seconds': 'str',
                        'latitude': 'str'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head(5)

Unnamed: 0,date_time,city,state/province,country,ufo_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_time                        80332 non-null  object        
 1   city                             80332 non-null  object        
 2   state/province                   74535 non-null  object        
 3   country                          70662 non-null  object        
 4   ufo_shape                        78400 non-null  object        
 5   length_of_encounter_seconds      80332 non-null  object        
 6   described_duration_of_encounter  80332 non-null  object        
 7   description                      80317 non-null  object        
 8   date_documented                  80332 non-null  datetime64[ns]
 9   latitude                         80332 non-null  object        
 10  longitude                        80332 non-null  float64  

Наблюдаем проблемы: 
1. Варнинг по колонкам 5, 9. Нужно корректно разобраться в ситуации.
2. Поле date_time не преобразовалось в тип даты. Нужно исследовать в чем проблема.

**Кейс2**
Попробуем явно преобразовать поле date_time. Получаем ошибку, из которой следует, что есть записи, у которых часы указаны некорректно: 24:00. Способ решения: выбрать записи с некорректным значением поля и преобразовать у них время на 0 часов следующего дня.

In [4]:
df.date_time.apply(lambda x: pd.to_datetime(x))

ParserError: hour must be in 0..23: 10/11/2006 24:00 present at position 0

Находим 694 строки с некорректным временем.

In [5]:
df.query('date_time.str.contains("24:00")')

Unnamed: 0,date_time,city,state/province,country,ufo_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
388,10/11/2006 24:00,rome,ny,us,oval,120,a min or two,I was walking from the garage to the house&#44...,2007-02-01,43.2127778,-75.456111
693,10/1/2001 24:00,chulucanas-piura la vieja (peru),,,other,6312000,2 years,go to: http://www.24horas.com.pe/data/videos/...,2003-03-04,-5.129547,-80.120569
962,10/1/2012 24:00,novi,mi,us,triangle,300,5 minutes,V shaped and 8 big and very brite lights&#44mo...,2012-10-30,42.4805556,-83.475556
1067,10/12/2003 24:00,salatiga (indonesia),,,disk,22,22 seconds,UFO in Salatiga&#44Indonesia,2003-10-31,-7.33683,110.498817
1221,10/12/2013 24:00,cincinnati,oh,us,fireball,300,3-5 minutes,A bright orange light split into four&#44 did ...,2013-10-14,39.1619444,-84.456944
...,...,...,...,...,...,...,...,...,...,...,...
79136,9/4/2005 24:00,boonville,nc,us,cigar,10800.0,3+ hours,Unusual activity in our small community&#44 cr...,2013-09-09,36.2325,-80.708333
79137,9/4/2005 24:00,redondo beach,ca,us,triangle,300.0,5 minutes,Triangulat UFO seen over Redondo Beach&#44 Cal...,2005-12-16,33.849167,-118.387500
79740,9/7/2002 24:00,portsmouth,oh,us,changing,900.0,15 minutes,A bright red light hoovering in the sky.,2002-09-13,38.731667,-82.997778
79759,9/7/2004 24:00,montrose,co,us,unknown,1200.0,20 minutes,Stationary object with three pulsating lights ...,2004-09-09,38.478333,-107.875556


Дополнительно проверим в каком формате указана дата в этом поле. Получаем неудобный для восприятия формат: m/d/Y.

In [6]:
df[['date', 'time']] = df.date_time.str.split(' ', expand=True)
df[['date1', 'date2', 'year']] = df.date.str.split('/', expand=True)

In [7]:
df[['date1', 'date2']].describe()

Unnamed: 0,date1,date2
count,80332,80332
unique,12,31
top,7,15
freq,9520,5968


Преобразуем поле data в тип даты в более удобном формате.

In [8]:
df['date'] = df.date.apply(lambda x: dt.strptime(x, '%m/%d/%Y'))
df.date

0       1949-10-10
1       1949-10-10
2       1955-10-10
3       1956-10-10
4       1960-10-10
           ...    
80327   2013-09-09
80328   2013-09-09
80329   2013-09-09
80330   2013-09-09
80331   2013-09-09
Name: date, Length: 80332, dtype: datetime64[ns]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_time                        80332 non-null  object        
 1   city                             80332 non-null  object        
 2   state/province                   74535 non-null  object        
 3   country                          70662 non-null  object        
 4   ufo_shape                        78400 non-null  object        
 5   length_of_encounter_seconds      80332 non-null  object        
 6   described_duration_of_encounter  80332 non-null  object        
 7   description                      80317 non-null  object        
 8   date_documented                  80332 non-null  datetime64[ns]
 9   latitude                         80332 non-null  object        
 10  longitude                        80332 non-null  float64  

Выполним необходимое преобразование для строк с некорректным временем (24:00): к дате добавим один день, а время заменим на 0:00.

In [10]:
idx = df.query('date_time.str.contains("24:00")').index.to_list()
df.iloc[idx] = df.loc[idx].assign(
                            date=lambda x: x.date + timedelta(days=1),
                            time='00:00')

In [11]:
df.query('date_time.str.contains("24:00")')[['date_time', 'date', 'time']]

Unnamed: 0,date_time,date,time
388,10/11/2006 24:00,2006-10-12,00:00
693,10/1/2001 24:00,2001-10-02,00:00
962,10/1/2012 24:00,2012-10-02,00:00
1067,10/12/2003 24:00,2003-10-13,00:00
1221,10/12/2013 24:00,2013-10-13,00:00
...,...,...,...
79136,9/4/2005 24:00,2005-09-05,00:00
79137,9/4/2005 24:00,2005-09-05,00:00
79740,9/7/2002 24:00,2002-09-08,00:00
79759,9/7/2004 24:00,2004-09-08,00:00


Добавим новую колонку datetime, объединяющую дату и время записи и сделаем ее индексом.

In [12]:
df = df.assign(datetime = lambda x: x.date + pd.to_timedelta(x.time + ':00'))
df.set_index('datetime', inplace=True)
df

Unnamed: 0_level_0,date_time,city,state/province,country,ufo_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude,date,time,date1,date2,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1949-10-10 20:30:00,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111,1949-10-10,20:30,10,10,1949
1949-10-10 21:00:00,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082,1949-10-10,21:00,10,10,1949
1955-10-10 17:00:00,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667,1955-10-10,17:00,10,10,1955
1956-10-10 21:00:00,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833,1956-10-10,21:00,10,10,1956
1960-10-10 20:00:00,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611,1960-10-10,20:00,10,10,1960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-09-09 21:15:00,9/9/2013 21:15,nashville,tn,us,light,600.0,10 minutes,Round from the distance/slowly changing colors...,2013-09-30,36.165833,-86.784444,2013-09-09,21:15,9,9,2013
2013-09-09 22:00:00,9/9/2013 22:00,boise,id,us,circle,1200.0,20 minutes,Boise&#44 ID&#44 spherical&#44 20 min&#44 10 r...,2013-09-30,43.613611,-116.202500,2013-09-09,22:00,9,9,2013
2013-09-09 22:00:00,9/9/2013 22:00,napa,ca,us,other,1200.0,hour,Napa UFO&#44,2013-09-30,38.297222,-122.284444,2013-09-09,22:00,9,9,2013
2013-09-09 22:20:00,9/9/2013 22:20,vienna,va,us,circle,5.0,5 seconds,Saw a five gold lit cicular craft moving fastl...,2013-09-30,38.901111,-77.265556,2013-09-09,22:20,9,9,2013


Удалим вспомогательные колонки.

In [13]:
df.drop(['date', 'time', 'date1', 'date2', 'year'], axis=1, inplace=True)
df

Unnamed: 0_level_0,date_time,city,state/province,country,ufo_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1949-10-10 20:30:00,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111
1949-10-10 21:00:00,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082
1955-10-10 17:00:00,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667
1956-10-10 21:00:00,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833
1960-10-10 20:00:00,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611
...,...,...,...,...,...,...,...,...,...,...,...
2013-09-09 21:15:00,9/9/2013 21:15,nashville,tn,us,light,600.0,10 minutes,Round from the distance/slowly changing colors...,2013-09-30,36.165833,-86.784444
2013-09-09 22:00:00,9/9/2013 22:00,boise,id,us,circle,1200.0,20 minutes,Boise&#44 ID&#44 spherical&#44 20 min&#44 10 r...,2013-09-30,43.613611,-116.202500
2013-09-09 22:00:00,9/9/2013 22:00,napa,ca,us,other,1200.0,hour,Napa UFO&#44,2013-09-30,38.297222,-122.284444
2013-09-09 22:20:00,9/9/2013 22:20,vienna,va,us,circle,5.0,5 seconds,Saw a five gold lit cicular craft moving fastl...,2013-09-30,38.901111,-77.265556


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80332 entries, 1949-10-10 20:30:00 to 2013-09-09 23:00:00
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   date_time                        80332 non-null  object        
 1   city                             80332 non-null  object        
 2   state/province                   74535 non-null  object        
 3   country                          70662 non-null  object        
 4   ufo_shape                        78400 non-null  object        
 5   length_of_encounter_seconds      80332 non-null  object        
 6   described_duration_of_encounter  80332 non-null  object        
 7   description                      80317 non-null  object        
 8   date_documented                  80332 non-null  datetime64[ns]
 9   latitude                         80332 non-null  object        
 10  longitude              

In [33]:
df.length_of_encounter_seconds.describe()

count     80332
unique      536
top         300
freq       8635
Name: length_of_encounter_seconds, dtype: object

In [34]:
pd.to_numeric(df.latitude)

ValueError: Unable to parse string "33q.200088" at position 43782

In [41]:
pd.to_numeric(df.length_of_encounter_seconds)

ValueError: Unable to parse string "2`" at position 27822

In [43]:
df.length_of_encounter_seconds.replace(to_replace=r'\.*(`)', value='', inplace=True)

In [44]:
df.query('length_of_encounter_seconds.str.contains("`")')

Unnamed: 0,date_time,city,state/province,country,ufo_shape,length_of_encounter_seconds,described_duration_of_encounter,description,date_documented,latitude,longitude
27822,2/2/2000 19:33,bouse,az,us,,2`,each a few seconds,Driving through Plomosa Pass towards Bouse Loo...,2000-02-16,33.9325,-114.005
35692,4/10/2005 22:52,santa cruz,ca,us,,8`,eight seconds,2 red lights moving together and apart with a ...,2005-04-16,36.9741667,-122.029722
58591,7/21/2006 13:00,ibague (colombia),,,circle,0.5`,1/2 segundo,Viajaba a 27.000 pies en un avion comercial ve...,2006-10-30,4.440663,-75.244141
