In [16]:
import pandas as pd
import numpy as np
import datetime as dt

# 1. Setup

### 1.1 Import Data

In [9]:
# Load the raw UFO sightings CSV into `df`.
# Rows that the parser cannot tokenise are skipped (on_bad_lines='skip')
# instead of aborting the load.
# TODO: need to remove empty column at end
df = pd.read_csv(
    'Resources/ufo_data.csv',
    on_bad_lines='skip',
    low_memory=False,
)
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [10]:
# Inspect column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88679 entries, 0 to 88678
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              88679 non-null  object 
 1   city                  88679 non-null  object 
 2   state                 81270 non-null  object 
 3   country               76314 non-null  object 
 4   shape                 85757 non-null  object 
 5   duration (seconds)    88677 non-null  object 
 6   duration (hours/min)  85660 non-null  object 
 7   comments              88644 non-null  object 
 8   date posted           88679 non-null  object 
 9   latitude              88679 non-null  object 
 10  longitude             88679 non-null  float64
dtypes: float64(1), object(10)
memory usage: 7.4+ MB


# 2. Data Cleaning

### 2.1 Remove Unwanted Columns

In [44]:
# Remove the columns that are not needed downstream, then discard every row
# that still has a missing value in any of the remaining columns.
unwanted_columns = ['duration (hours/min)', 'comments', 'date posted']
df2 = df.drop(columns=unwanted_columns).dropna()
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,29.8830556,-97.941111
3,10/10/1956 21:00,edna,tx,us,circle,20,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,21.4180556,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,41.1175,-73.408333


### 2.2 Update DataTypes

In [45]:
# Parse the sighting timestamp strings. Values that cannot be parsed become
# NaT (errors='coerce') rather than raising; they are removed by a later dropna.
df2['datetime'] = pd.to_datetime(df2['datetime'], errors='coerce')

In [46]:
# 'duration (seconds)' was read as object dtype, so normalise it to integers:
#   1. view every value as text,
#   2. keep the first run of digits found in that text,
#   3. cast the result to int (raises if a value contained no digits at all).
# The pattern is a raw string so that `\d` is a regex token and not an invalid
# Python string escape, and expand=False makes `extract` return a Series
# (one capture group) instead of a one-column DataFrame.
# NOTE(review): a value such as "0.5" would be reduced to 0 by this pattern -
# confirm that truncating fractional durations is intended.
duration_text = df2['duration (seconds)'].astype(str)
duration_digits = duration_text.str.extract(r'(\d+)', expand=False)
df2['duration (seconds)'] = duration_digits.astype(int)
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700,29.8830556,-97.941111
3,1956-10-10 21:00:00,edna,tx,us,circle,20,28.9783333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900,21.4180556,-157.803611
5,1961-10-10 19:00:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,norwalk,ct,us,disk,1200,41.1175,-73.408333


In [47]:
# 'latitude' was read as object dtype; convert it to float.
# The cast raises if any remaining value is not a valid number.
df2 = df2.assign(latitude=df2['latitude'].astype(float))
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700,29.883056,-97.941111
3,1956-10-10 21:00:00,edna,tx,us,circle,20,28.978333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900,21.418056,-157.803611
5,1961-10-10 19:00:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,norwalk,ct,us,disk,1200,41.1175,-73.408333


In [48]:
# Confirm the dtypes after conversion and see how many datetimes failed to parse.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71278 entries, 0 to 88678
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            70615 non-null  datetime64[ns]
 1   city                71278 non-null  object        
 2   state               71278 non-null  object        
 3   country             71278 non-null  object        
 4   shape               71278 non-null  object        
 5   duration (seconds)  71278 non-null  int64         
 6   latitude            71278 non-null  float64       
 7   longitude           71278 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 4.9+ MB


### 2.3 Clean Data

In [52]:
# Discard rows with any missing value, which now includes rows whose
# datetime was coerced to NaT during parsing.
df3 = df2.dropna(how='any')
print(df3.shape)

(70615, 8)


In [54]:
# Remove fully identical rows, keeping the first occurrence of each.
df3 = df3.drop_duplicates(keep='first')
print(df3.shape)

(70219, 8)


In [56]:
# Normalise text casing: state and country codes upper-cased,
# city names title-cased.
df3 = df3.assign(
    state=df3['state'].str.upper(),
    country=df3['country'].str.upper(),
    city=df3['city'].str.title(),
)
df3.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,1949-10-10 20:30:00,San Marcos,TX,US,cylinder,2700,29.883056,-97.941111
3,1956-10-10 21:00:00,Edna,TX,US,circle,20,28.978333,-96.645833
4,1960-10-10 20:00:00,Kaneohe,HI,US,light,900,21.418056,-157.803611
5,1961-10-10 19:00:00,Bristol,TN,US,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,Norwalk,CT,US,disk,1200,41.1175,-73.408333


### 2.4 Only US Locations

In [57]:
# List the distinct country codes present after cleaning.
df3['country'].unique()

array(['US', 'CA', 'AU', 'GB'], dtype=object)

In [60]:
# Keep only the sightings whose country code is 'US' and renumber the rows.
# NOTE(review): reset_index() without drop=True keeps the previous index as a
# new 'index' column, which is then written to the exported JSON -
# confirm that this extra column is wanted.
is_us = df3['country'].eq('US')
df4 = df3.loc[is_us].reset_index()
print(df4.shape)

(67053, 9)


In [61]:
# List the distinct state codes present among the US rows.
df4['state'].unique()

array(['TX', 'HI', 'TN', 'CT', 'AL', 'FL', 'CA', 'NC', 'NY', 'KY', 'MI',
       'MA', 'KS', 'SC', 'WA', 'CO', 'NH', 'WI', 'ME', 'GA', 'PA', 'IL',
       'AR', 'MO', 'OH', 'IN', 'AZ', 'MN', 'NV', 'NE', 'OR', 'IA', 'VA',
       'ID', 'NM', 'NJ', 'WV', 'OK', 'AK', 'RI', 'VT', 'LA', 'ND', 'PR',
       'MS', 'UT', 'MD', 'MT', 'WY', 'SD', 'DE', 'DC'], dtype=object)

### 2.5 Convert to JSON

In [62]:
# Write the cleaned US sightings to JSON, keyed by row label (orient='index').
df4.to_json(
    path_or_buf='Resources/ufo_data.json',
    orient='index',
)