In [89]:
# import dependencies
import pandas as pd
import os
import json
from sqlalchemy import create_engine
from collections import defaultdict

In [90]:
# create a connection to the database
engine = create_engine('postgresql://postgres:Analytics20@localhost:5432/satellite_db')
con = engine.connect()

# if table exists replace
if_exists_param = 'replace'

In [91]:
# Extract CSVs into DataFrames
file = "original_dataset/UCS-Satellite-Database-4-1-2020.xls"
path = "../data"

satellite_unedited = pd.read_excel(os.path.join(path, file), encoding='utf-8')

In [42]:
satellite_df = satellite_unedited[["Name of Satellite, Alternate Names", "Country of Operator/Owner",
                                  "Operator/Owner", "Users", "Purpose", "Class of Orbit", "Date of Launch",
                                  "Contractor", "Country of Contractor", "Launch Site", "Launch Vehicle",
                                  "NORAD Number"]]

In [43]:
renamed_columns = { "Name of Satellite, Alternate Names": "satellite_name", "Country of Operator/Owner": "country_of_owner",
                   "Operator/Owner": "owner_operator", "Users": "user", "Class of Orbit": "class_of_orbit", "Purpose": "purpose",
                   "Date of Launch": "launch_date", "Country of Contractor": "country_of_contractor", "Launch Vehicle": "launch_vehicle",
         "Launch Site": "launch_location", "Launch Vehicle": "launch_vehicle", "NORAD Number": "NORAD_number",
                   "Contractor": "contractor"
}

In [44]:
satellite_df = satellite_df.rename(columns=renamed_columns)

In [45]:
satellite_df = satellite_df.dropna()
satellite_df.dtypes

satellite_name                   object
country_of_owner                 object
owner_operator                   object
user                             object
purpose                          object
class_of_orbit                   object
launch_date              datetime64[ns]
contractor                       object
country_of_contractor            object
launch_location                  object
launch_vehicle                   object
NORAD_number                    float64
dtype: object

In [60]:

satellite_df['NORAD_number'] = (satellite_df['NORAD_number']).astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
satellite_df = satellite_df.drop_duplicates(subset=['NORAD_number'], keep='first')


In [48]:
NORAD = satellite_df['NORAD_number']
satellite_df.drop(labels=['NORAD_number'], axis=1,inplace = True)
satellite_df.insert(0, 'NORAD_number', NORAD)


In [55]:
satellite_df['user_codes'] = satellite_df['user'].astype('category').cat.codes
satellite_df['purpose_codes'] = satellite_df['purpose'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:
u_code = satellite_df['user'].astype('category')
p_code = satellite_df['purpose'].astype('category')

user_legend = dict(enumerate(u_code.cat.categories))
purpose_legend = dict(enumerate(p_code.cat.categories))
print(user_legend)
print(purpose_legend)

{0: 'Civil', 1: 'Civil/Government', 2: 'Civil/Military', 3: 'Commercial', 4: 'Commercial ', 5: 'Commercial/Civil', 6: 'Commercial/Government', 7: 'Commercial/Military', 8: 'Government', 9: 'Government/Civil', 10: 'Government/Commercial', 11: 'Government/Commercial/Military', 12: 'Government/Military', 13: 'Military', 14: 'Military ', 15: 'Military/Civil', 16: 'Military/Commercial', 17: 'Military/Government'}
{0: 'Communications', 1: 'Communications/Maritime Tracking', 2: 'Communications/Navigation', 3: 'Communications/Technology Development', 4: 'Earth Observation', 5: 'Earth Observation ', 6: 'Earth Observation/Communications', 7: 'Earth Observation/Communications/Space Science', 8: 'Earth Observation/Earth Science', 9: 'Earth Observation/Space Science', 10: 'Earth Observation/Technology Development', 11: 'Earth Science', 12: 'Earth Science/Earth Observation', 13: 'Earth/Space Observation', 14: 'Educational', 15: 'Mission Extension Technology', 16: 'Navigation/Global Positioning', 17:

In [54]:

satellite_df['user'].value_counts()


Commercial                        1432
Government                         435
Military                           337
Civil                              131
Government/Commercial              114
Military/Commercial                 77
Military/Government                 53
Government/Civil                    43
Commercial/Civil                    11
Military/Civil                       6
Government/Military                  5
Commercial/Military                  2
Civil/Government                     2
Commercial                           1
Military                             1
Government/Commercial/Military       1
Commercial/Government                1
Civil/Military                       1
Name: user, dtype: int64

In [57]:
satellite_df['purpose'].value_counts()

Communications                                    1193
Earth Observation                                  862
Technology Development                             291
Navigation/Global Positioning                      134
Space Science                                       84
Earth Science                                       15
Navigation/Regional Positioning                     12
Technology Demonstration                             9
Communications/Technology Development                8
Space Observation                                    8
Earth Observation/Technology Development             7
Earth Observation                                    5
Communications/Maritime Tracking                     5
Surveillance                                         3
Earth/Space Observation                              2
Educational                                          2
Technology Development/Education                     2
Earth Observation/Communications                     2
Earth Obse

In [61]:
satellite_df['launch_date'] = pd.to_datetime(satellite_df['launch_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [76]:
satellite_df['year'] = pd.DatetimeIndex(satellite_df['launch_date']).year


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [120]:


user_by_year = satellite_df.groupby(['year', 'user']).size()
purpose_by_year = satellite_df.groupby(['year', 'purpose']).size()



year  purpose                      
1974  Communications                    1
1988  Communications                    1
1989  Communications                    1
1990  Communications                    1
      Space Science                     1
                                       ..
2020  Earth Observation                 8
      Navigation/Global Positioning     2
      Space Science                     1
      Technology Developmen             1
      Technology Development           15
Length: 191, dtype: int64

In [125]:
idx = pd.MultiIndex.from_product((satellite_df.year.unique(), satellite_df.user.unique()))
idx2 = pd.MultiIndex.from_product((satellite_df.year.unique(), satellite_df.purpose.unique()))

In [129]:
r = user_by_year.reindex(idx).fillna(0)
r2 = purpose_by_year.reindex(idx2).fillna(0)

In [130]:
user_year = r.reset_index() \
          .groupby('level_0').apply(lambda x: dict(zip(x['level_1'],x[0]))) \
          .to_dict()

purpose_year = r2.reset_index() \
          .groupby('level_0').apply(lambda x: dict(zip(x['level_1'],x[0]))) \
          .to_dict()

In [118]:
with open('user_year.json', 'w') as fp:
    json.dump(user_year, fp)

In [131]:
with open('purpose_year.json', 'w') as fp:
    json.dump(purpose_year, fp)

In [80]:
satellite_df.to_json('file.json', orient='records', lines=True)
user_by_year.to_json('user_year.json', orient='records')

In [55]:
satellite_df.to_csv('satellite.csv', index=False)

In [38]:
satellite = satellite_df[["NORAD_number", "satellite_name", "user", "purpose", "class_of_orbit"]]
owner = satellite_df[["NORAD_number", "owner_operator", "country_of_owner", "contractor", "country_of_contractor"]]
launch = satellite_df[["NORAD_number", "launch_location", "launch_date", "launch_vehicle"]]

In [39]:
satellite = satellite.set_index("NORAD_number")

LOAD TABLES INTO Postgres

In [41]:
satellite.to_sql(name='satellite', con=engine, if_exists=if_exists_param, index=True)


In [42]:
owner.to_sql(name='owner', con=engine, if_exists=if_exists_param, index=False)
launch.to_sql(name='launch', con=engine, if_exists=if_exists_param, index=False)