In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# 1. Setup

### 1.1 Import Data

In [2]:
#need to remove empty column at end
df = pd.read_csv('Resources/ufo_data.csv', on_bad_lines='skip', low_memory=False)
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88679 entries, 0 to 88678
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              88679 non-null  object 
 1   city                  88679 non-null  object 
 2   state                 81270 non-null  object 
 3   country               76314 non-null  object 
 4   shape                 85757 non-null  object 
 5   duration (seconds)    88677 non-null  object 
 6   duration (hours/min)  85660 non-null  object 
 7   comments              88644 non-null  object 
 8   date posted           88679 non-null  object 
 9   latitude              88679 non-null  object 
 10  longitude             88679 non-null  float64
dtypes: float64(1), object(10)
memory usage: 7.4+ MB


# 2. Data Cleaning

### 2.1 Remove Unwanted Columns

In [4]:
df2 = df.drop(['duration (hours/min)', 'comments', 'date posted'], axis=1)
df2 = df2.dropna()
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,29.8830556,-97.941111
3,10/10/1956 21:00,edna,tx,us,circle,20,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,21.4180556,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,10/10/1965 23:45,norwalk,ct,us,disk,1200,41.1175,-73.408333


### 2.2 Update DataTypes

In [5]:
df2['datetime'] = pd.to_datetime(df2.datetime, errors='coerce')

In [6]:
df2 = df2.astype({'duration (seconds)': str}, errors='raise')
df2['duration (seconds)'] = df2['duration (seconds)'].str.extract('(\d+)')
df2 = df2.astype({'duration (seconds)': int}, errors='raise')
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700,29.8830556,-97.941111
3,1956-10-10 21:00:00,edna,tx,us,circle,20,28.9783333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900,21.4180556,-157.803611
5,1961-10-10 19:00:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,norwalk,ct,us,disk,1200,41.1175,-73.408333


In [7]:
df2 = df2.astype({'latitude': float}, errors='raise')
df2.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700,29.883056,-97.941111
3,1956-10-10 21:00:00,edna,tx,us,circle,20,28.978333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900,21.418056,-157.803611
5,1961-10-10 19:00:00,bristol,tn,us,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,norwalk,ct,us,disk,1200,41.1175,-73.408333


In [8]:
df2 = df2.rename(columns={'latitude':'ufo_latitude', 'longitude':'ufo_longitude'})

In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71278 entries, 0 to 88678
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            70615 non-null  datetime64[ns]
 1   city                71278 non-null  object        
 2   state               71278 non-null  object        
 3   country             71278 non-null  object        
 4   shape               71278 non-null  object        
 5   duration (seconds)  71278 non-null  int64         
 6   ufo_latitude        71278 non-null  float64       
 7   ufo_longitude       71278 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 4.9+ MB


### 2.3 Clean Data

In [10]:
df3 = df2.dropna()
print(df3.shape)

(70615, 8)


In [11]:
df3 = df3.drop_duplicates()
print(df3.shape)

(70219, 8)


In [12]:
df3['state'] = df3['state'].str.upper()
df3['country'] = df3['country'].str.upper()
df3['city'] = df3['city'].str.title()
df3.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),ufo_latitude,ufo_longitude
0,1949-10-10 20:30:00,San Marcos,TX,US,cylinder,2700,29.883056,-97.941111
3,1956-10-10 21:00:00,Edna,TX,US,circle,20,28.978333,-96.645833
4,1960-10-10 20:00:00,Kaneohe,HI,US,light,900,21.418056,-157.803611
5,1961-10-10 19:00:00,Bristol,TN,US,sphere,300,36.595,-82.188889
7,1965-10-10 23:45:00,Norwalk,CT,US,disk,1200,41.1175,-73.408333


### 2.4 Only US Locations

In [13]:
df3['country'].unique()

array(['US', 'CA', 'AU', 'GB'], dtype=object)

In [29]:
df4 = df3.loc[df3['country']=='US']
df4 = df4.reset_index(drop=True)
print(df4.shape)

(67053, 8)


In [30]:
df4['state'].unique()

array(['TX', 'HI', 'TN', 'CT', 'AL', 'FL', 'CA', 'NC', 'NY', 'KY', 'MI',
       'MA', 'KS', 'SC', 'WA', 'CO', 'NH', 'WI', 'ME', 'GA', 'PA', 'IL',
       'AR', 'MO', 'OH', 'IN', 'AZ', 'MN', 'NV', 'NE', 'OR', 'IA', 'VA',
       'ID', 'NM', 'NJ', 'WV', 'OK', 'AK', 'RI', 'VT', 'LA', 'ND', 'PR',
       'MS', 'UT', 'MD', 'MT', 'WY', 'SD', 'DE', 'DC'], dtype=object)

In [31]:
df4['country'] = 'United States'
df4.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),ufo_latitude,ufo_longitude
0,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111
1,1956-10-10 21:00:00,Edna,TX,United States,circle,20,28.978333,-96.645833
2,1960-10-10 20:00:00,Kaneohe,HI,United States,light,900,21.418056,-157.803611
3,1961-10-10 19:00:00,Bristol,TN,United States,sphere,300,36.595,-82.188889
4,1965-10-10 23:45:00,Norwalk,CT,United States,disk,1200,41.1175,-73.408333


In [32]:
df5 = df4.copy()
df5['year'] = pd.DatetimeIndex(df5['datetime']).year
df5.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),ufo_latitude,ufo_longitude,year
0,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949
1,1956-10-10 21:00:00,Edna,TX,United States,circle,20,28.978333,-96.645833,1956
2,1960-10-10 20:00:00,Kaneohe,HI,United States,light,900,21.418056,-157.803611,1960
3,1961-10-10 19:00:00,Bristol,TN,United States,sphere,300,36.595,-82.188889,1961
4,1965-10-10 23:45:00,Norwalk,CT,United States,disk,1200,41.1175,-73.408333,1965


### 2.5 Convert to JSON and CSV

In [33]:
df5.to_json('Resources/ufo_data.json', orient='index')

In [34]:
df5.to_csv('Resources/ufo_data.csv')

# 3. Military Base Info

### 3.1 Load Data

In [35]:
df_mb = pd.read_json('Resources/military-bases.json')
df_mb.head()

Unnamed: 0,geo_point_2d,geo_shape,objectid_1,objectid,component,site_name,joint_base,state_terr,country,oper_stat,perimeter,area,shape_leng,shape_area
0,"{'lon': -85.6506347178, 'lat': 31.2309993833}","{'type': 'Feature', 'geometry': {'coordinates'...",26,65,Army Active,Allen Stagefield AL,,Alabama,United States,Active,1.641383,0.176575,3170.633316,627424.0
1,"{'lon': -85.6497984957, 'lat': 31.8157331822}","{'type': 'Feature', 'geometry': {'coordinates'...",33,73,Army Active,Louisville Stagefield AL,,Alabama,United States,Active,1.72338,0.162357,3357.487241,584096.8
2,"{'lon': -106.425696182, 'lat': 33.1594636742}","{'type': 'Feature', 'geometry': {'coordinates'...",66,261,Army Active,White Sands Missile Range NM,,New Mexico,United States,Active,332.133189,3548.570164,648984.100372,13150790000.0
3,"{'lon': -76.3043760544, 'lat': 37.0130203962}","{'type': 'Feature', 'geometry': {'coordinates'...",114,899,Army Active,Fort Monroe,,Virginia,United States,Inactive,10.209688,0.877233,21033.880109,3570033.0
4,"{'lon': -157.905641308, 'lat': 21.3866284869}","{'type': 'Feature', 'geometry': {'coordinates'...",161,1237,MC Active,MCB Camp Smith,,Hawaii,United States,Active,2.931885,0.331246,5098.778436,994400.4


In [36]:
df_mb2 = df_mb[['geo_point_2d', 'geo_shape', 'component', 'site_name', 'state_terr', 'country', 'oper_stat']]

In [37]:
df_mb2 = df_mb2.copy()
df_mb2 = df_mb2.rename(columns={'state_terr':'state'})

In [38]:
df_mb2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   geo_point_2d  776 non-null    object
 1   geo_shape     776 non-null    object
 2   component     776 non-null    object
 3   site_name     776 non-null    object
 4   state         776 non-null    object
 5   country       776 non-null    object
 6   oper_stat     776 non-null    object
dtypes: object(7)
memory usage: 42.6+ KB


### 3.2 Extract Coordinate Data

In [39]:
df_mb2['geo_point_2d'][0]

{'lon': -85.6506347178, 'lat': 31.2309993833}

In [40]:
df_mb2['geo_shape'][0]['geometry']['coordinates']

[[[-85.65462565497243, 31.234178331412515],
  [-85.65280405303592, 31.2350202819558],
  [-85.65101108759404, 31.233844887000856],
  [-85.64624105229998, 31.231687174187147],
  [-85.6461603431633, 31.227908115185613],
  [-85.64979400602967, 31.22793346599952],
  [-85.65055293422273, 31.22819845388161],
  [-85.65114885861591, 31.228358922650713],
  [-85.65448605767558, 31.229254906742018],
  [-85.65462565497243, 31.234178331412515]]]

In [41]:
x = 0
df_mb2['longitude'] = ''
df_mb2['latitude'] = ''
df_mb2['full_area'] = ''
while x < len(df_mb2):
    df_mb2['longitude'][x] = df_mb2['geo_point_2d'][x]['lon']
    df_mb2['latitude'][x] = df_mb2['geo_point_2d'][x]['lat']
    df_mb2['full_area'][x] = df_mb2['geo_shape'][x]['geometry']['coordinates']
    x+=1

In [42]:
df_mb2.head()

Unnamed: 0,geo_point_2d,geo_shape,component,site_name,state,country,oper_stat,longitude,latitude,full_area
0,"{'lon': -85.6506347178, 'lat': 31.2309993833}","{'type': 'Feature', 'geometry': {'coordinates'...",Army Active,Allen Stagefield AL,Alabama,United States,Active,-85.650635,31.230999,"[[[-85.65462565497243, 31.234178331412515], [-..."
1,"{'lon': -85.6497984957, 'lat': 31.8157331822}","{'type': 'Feature', 'geometry': {'coordinates'...",Army Active,Louisville Stagefield AL,Alabama,United States,Active,-85.649798,31.815733,"[[[-85.65268851262239, 31.812802192409293], [-..."
2,"{'lon': -106.425696182, 'lat': 33.1594636742}","{'type': 'Feature', 'geometry': {'coordinates'...",Army Active,White Sands Missile Range NM,New Mexico,United States,Active,-106.425696,33.159464,"[[[-106.27973443186896, 33.91098413413733], [-..."
3,"{'lon': -76.3043760544, 'lat': 37.0130203962}","{'type': 'Feature', 'geometry': {'coordinates'...",Army Active,Fort Monroe,Virginia,United States,Inactive,-76.304376,37.01302,"[[[[-76.29312038151345, 37.032907829086405], [..."
4,"{'lon': -157.905641308, 'lat': 21.3866284869}","{'type': 'Feature', 'geometry': {'coordinates'...",MC Active,MCB Camp Smith,Hawaii,United States,Active,-157.905641,21.386628,"[[[-157.8989279212737, 21.392721693901013], [-..."


### 3.3 Drop Unwanted Columns

In [43]:
df_mb3 = df_mb2.drop(['geo_point_2d', 'geo_shape'], axis=1)
df_mb3.head()

Unnamed: 0,component,site_name,state,country,oper_stat,longitude,latitude,full_area
0,Army Active,Allen Stagefield AL,Alabama,United States,Active,-85.650635,31.230999,"[[[-85.65462565497243, 31.234178331412515], [-..."
1,Army Active,Louisville Stagefield AL,Alabama,United States,Active,-85.649798,31.815733,"[[[-85.65268851262239, 31.812802192409293], [-..."
2,Army Active,White Sands Missile Range NM,New Mexico,United States,Active,-106.425696,33.159464,"[[[-106.27973443186896, 33.91098413413733], [-..."
3,Army Active,Fort Monroe,Virginia,United States,Inactive,-76.304376,37.01302,"[[[[-76.29312038151345, 37.032907829086405], [..."
4,MC Active,MCB Camp Smith,Hawaii,United States,Active,-157.905641,21.386628,"[[[-157.8989279212737, 21.392721693901013], [-..."


### 3.4 Change Data Types

In [44]:
df_mb3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   component  776 non-null    object
 1   site_name  776 non-null    object
 2   state      776 non-null    object
 3   country    776 non-null    object
 4   oper_stat  776 non-null    object
 5   longitude  776 non-null    object
 6   latitude   776 non-null    object
 7   full_area  776 non-null    object
dtypes: object(8)
memory usage: 48.6+ KB


In [45]:
df_mb3 = df_mb3.astype({'latitude': float}, errors='raise')
df_mb3 = df_mb3.astype({'longitude': float}, errors='raise')
df_mb3.head()

Unnamed: 0,component,site_name,state,country,oper_stat,longitude,latitude,full_area
0,Army Active,Allen Stagefield AL,Alabama,United States,Active,-85.650635,31.230999,"[[[-85.65462565497243, 31.234178331412515], [-..."
1,Army Active,Louisville Stagefield AL,Alabama,United States,Active,-85.649798,31.815733,"[[[-85.65268851262239, 31.812802192409293], [-..."
2,Army Active,White Sands Missile Range NM,New Mexico,United States,Active,-106.425696,33.159464,"[[[-106.27973443186896, 33.91098413413733], [-..."
3,Army Active,Fort Monroe,Virginia,United States,Inactive,-76.304376,37.01302,"[[[[-76.29312038151345, 37.032907829086405], [..."
4,MC Active,MCB Camp Smith,Hawaii,United States,Active,-157.905641,21.386628,"[[[-157.8989279212737, 21.392721693901013], [-..."


In [46]:
df_mb3 = df_mb3.rename(columns={'latitude':'base_latitude', 'longitude':'base_longitude'})

In [47]:
df_mb3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   component       776 non-null    object 
 1   site_name       776 non-null    object 
 2   state           776 non-null    object 
 3   country         776 non-null    object 
 4   oper_stat       776 non-null    object 
 5   base_longitude  776 non-null    float64
 6   base_latitude   776 non-null    float64
 7   full_area       776 non-null    object 
dtypes: float64(2), object(6)
memory usage: 48.6+ KB


# 4. Distance Between UFO Sighting and Military Base

### 4.1 Join DataFrames

In [52]:
df_full = pd.merge(df5, df_mb3, on='country')
df_full.head()

Unnamed: 0,datetime,city,state_x,country,shape,duration (seconds),ufo_latitude,ufo_longitude,year,component,site_name,state_y,oper_stat,base_longitude,base_latitude,full_area
0,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Allen Stagefield AL,Alabama,Active,-85.650635,31.230999,"[[[-85.65462565497243, 31.234178331412515], [-..."
1,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Louisville Stagefield AL,Alabama,Active,-85.649798,31.815733,"[[[-85.65268851262239, 31.812802192409293], [-..."
2,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,White Sands Missile Range NM,New Mexico,Active,-106.425696,33.159464,"[[[-106.27973443186896, 33.91098413413733], [-..."
3,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Fort Monroe,Virginia,Inactive,-76.304376,37.01302,"[[[[-76.29312038151345, 37.032907829086405], [..."
4,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,MC Active,MCB Camp Smith,Hawaii,Active,-157.905641,21.386628,"[[[-157.8989279212737, 21.392721693901013], [-..."


### 4.2 Calculate Distance

In [53]:
#source: https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas/29546836#29546836
#source: https://dadoverflow.com/2022/01/07/pandas-and-distance-calculations/
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    miles = 3958 * c
    return miles

In [54]:
# distance is in miles
df_full['dist'] = haversine_np(df_full['ufo_latitude'], df_full['ufo_longitude'], df_full['base_latitude'],
                               df_full['base_longitude'])
df_full.head()

Unnamed: 0,datetime,city,state_x,country,shape,duration (seconds),ufo_latitude,ufo_longitude,year,component,site_name,state_y,oper_stat,base_longitude,base_latitude,full_area,dist
0,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Allen Stagefield AL,Alabama,Active,-85.650635,31.230999,"[[[-85.65462565497243, 31.234178331412515], [-...",848.973817
1,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Louisville Stagefield AL,Alabama,Active,-85.649798,31.815733,"[[[-85.65268851262239, 31.812802192409293], [-...",848.974653
2,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,White Sands Missile Range NM,New Mexico,Active,-106.425696,33.159464,"[[[-106.27973443186896, 33.91098413413733], [-...",587.826795
3,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,Army Active,Fort Monroe,Virginia,Inactive,-76.304376,37.01302,"[[[[-76.29312038151345, 37.032907829086405], [...",1491.950966
4,1949-10-10 20:30:00,San Marcos,TX,United States,cylinder,2700,29.883056,-97.941111,1949,MC Active,MCB Camp Smith,Hawaii,Active,-157.905641,21.386628,"[[[-157.8989279212737, 21.392721693901013], [-...",4148.777753


In [55]:
print(df_full.shape)

(49887432, 17)


In [56]:
base_trim = df_full[['component', 'site_name', 'state_y', 'oper_stat', 'base_longitude', 'base_latitude', 'dist']]

### 4.3 Closest Base

In [57]:
df_full2 = df_full.groupby(['datetime', 'city', 'state_x', 'country', 'shape', 'duration (seconds)',
                            'ufo_latitude', 'ufo_longitude']).\
                   agg(dist=('dist', np.min)).reset_index()
df_full2.head()

Unnamed: 0,datetime,city,state_x,country,shape,duration (seconds),ufo_latitude,ufo_longitude,dist
0,1910-05-28 21:00:00,Solon,ME,United States,unknown,0,44.949444,-69.858889,35.644882
1,1910-06-01 15:00:00,Wills Point,TX,United States,cigar,120,32.709167,-96.008056,26.714517
2,1920-06-11 21:00:00,Cicero,IN,United States,unknown,60,40.123889,-86.013333,1.454394
3,1925-12-28 18:00:00,Atkinson (6 Miles North Of),IL,United States,disk,60,41.420833,-90.015,0.474506
4,1929-07-05 14:00:00,Buchanan (Or Burns),OR,United States,disk,60,43.6425,-118.6275,92.580839


In [58]:
print(df_full2.shape)

(67053, 9)


In [59]:
df_full2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67053 entries, 0 to 67052
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetime            67053 non-null  datetime64[ns]
 1   city                67053 non-null  object        
 2   state_x             67053 non-null  object        
 3   country             67053 non-null  object        
 4   shape               67053 non-null  object        
 5   duration (seconds)  67053 non-null  int64         
 6   ufo_latitude        67053 non-null  float64       
 7   ufo_longitude       67053 non-null  float64       
 8   dist                67053 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 4.6+ MB


### 4.4 Re-add Base info

In [60]:
df_full3 = pd.merge(df_full2, base_trim, on='dist')
df_full3.head()

Unnamed: 0,datetime,city,state_x,country,shape,duration (seconds),ufo_latitude,ufo_longitude,dist,component,site_name,state_y,oper_stat,base_longitude,base_latitude
0,1910-05-28 21:00:00,Solon,ME,United States,unknown,0,44.949444,-69.858889,35.644882,Army Guard,NG Auburn Training Site,Maine,Active,-70.282798,44.086293
1,1910-05-28 21:00:00,Solon,ME,United States,unknown,0,44.949444,-69.858889,35.644882,Army Guard,NG Auburn Training Site,Maine,Active,-70.282798,44.086293
2,2013-03-22 19:30:00,Solon,ME,United States,fireball,10,44.949444,-69.858889,35.644882,Army Guard,NG Auburn Training Site,Maine,Active,-70.282798,44.086293
3,2013-03-22 19:30:00,Solon,ME,United States,fireball,10,44.949444,-69.858889,35.644882,Army Guard,NG Auburn Training Site,Maine,Active,-70.282798,44.086293
4,1910-06-01 15:00:00,Wills Point,TX,United States,cigar,120,32.709167,-96.008056,26.714517,AF Guard,Tulsa IAP,Oklahoma,Active,-95.874775,36.217024


In [61]:
df_full3 = df_full3.rename(columns={'state_x':'ufo_state', 'state_y':'base_state'})

In [62]:
df_full3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3652865 entries, 0 to 3652864
Data columns (total 15 columns):
 #   Column              Dtype         
---  ------              -----         
 0   datetime            datetime64[ns]
 1   city                object        
 2   ufo_state           object        
 3   country             object        
 4   shape               object        
 5   duration (seconds)  int64         
 6   ufo_latitude        float64       
 7   ufo_longitude       float64       
 8   dist                float64       
 9   component           object        
 10  site_name           object        
 11  base_state          object        
 12  oper_stat           object        
 13  base_longitude      float64       
 14  base_latitude       float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(8)
memory usage: 445.9+ MB


In [63]:
df_full4 = df_full3.drop_duplicates()
print(df_full4.shape)

(67053, 15)


In [64]:
df_full4.to_json('Resources/ufo_to_bases.json', orient='index')