# Dataset: Distributions Data 2016

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import date

In [2]:
#Load the data
distributions_csv= pd.read_csv ("Resources/distributions_data_2016.csv")

In [3]:
distributions_df=pd.DataFrame(distributions_csv)
distributions_df.head()

Unnamed: 0,company,percentage,demographics,job_category
0,anonymous,0.0,Hispanic_or_Latino,Professionals
1,anonymous,0.0,Hispanic_or_Latino,Professionals
2,anonymous,0.8,Hispanic_or_Latino,Professionals
3,anonymous,1.3,Hispanic_or_Latino,Professionals
4,anonymous,1.6,Hispanic_or_Latino,Professionals


# Step 1) Inspect Data

In [4]:
#Identify how many rows and columns in this dataset
distributions_df.shape

(16042, 4)

In [5]:
#Verify Data type in this dataset
distributions_df.dtypes
distributions_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16042 entries, 0 to 16041
Data columns (total 4 columns):
company         16042 non-null object
percentage      16042 non-null float64
demographics    16042 non-null object
job_category    16042 non-null object
dtypes: float64(1), object(3)
memory usage: 3.5 MB


In [6]:
#Identify any N/A or missing data, if 
# diversity_df.isnull().any().any()
distributions_df.isna().sum()

company         0
percentage      0
demographics    0
job_category    0
dtype: int64

# Step 2) Clean and Transform data 

In [7]:
#drop rows with 0.00 percentage, these data doesn't provide meaningful insight 
distributions_df.dropna(axis = 0, subset=['percentage'], how='any', inplace=True)

In [8]:
new_distributions_df= distributions_df[distributions_df['percentage'] != 0]

new_distributions_df.shape

(14692, 4)

# Spot Check each column in detail level

In [9]:
#Columns Names
new_distributions_df.columns

Index(['company', 'percentage', 'demographics', 'job_category'], dtype='object')

In [10]:
#Rename Column Names
new_distributions_df=new_distributions_df.rename(columns={'percentage':'ethnicity_pcn',
                                                  'demographics':'ethnicity'})
new_distributions_df.columns

Index(['company', 'ethnicity_pcn', 'ethnicity', 'job_category'], dtype='object')

In [11]:
#spot check detail values under 'ethnicity' column
new_distributions_df['ethnicity'].unique()

array(['Hispanic_or_Latino', 'White', 'Black_or_African_American',
       'Asian', 'Female_total', 'White_female', 'Asian_female',
       'Black_or_African_American_female', 'Hispanic_or_Latino_female',
       'Women_of_color', 'Underrepresented_minorities_female',
       'People_of_color', 'Underrepresented_minorities'], dtype=object)

In [12]:
# #pivot dataset with unclear ethinicity then inspect
# new_distributions_df[new_distributions_df.ethnicity.isin (['Female_total'
# #                                                   ,'Women_of_color', 
# #                                                   ,'Underrepresented_minorities_female'
# #                                                   ,'People_of_color'
#                                                    ,'Underrepresented_minorities'
#                                                   ])]

In [13]:
#Spot check dataset by company
new_distributions_df['company'].unique()

array(['anonymous', 'eBay', 'Nvidia', 'Uber', 'Square', 'Pinterest',
       'Salesforce', 'PayPal', 'MobileIron', 'Facebook', 'NetApp',
       'Twitter', 'Adobe', 'Google', 'Cisco', 'LinkedIn', 'Apple',
       'Airbnb', 'HPE', 'Intuit', 'Intel', 'HP Inc.', 'Sanmina',
       '23andMe', 'Lyft', 'View'], dtype=object)

In [14]:
new_distributions_df[new_distributions_df.company.isin (['Airbnb'])].head(3)

Unnamed: 0,company,ethnicity_pcn,ethnicity,job_category
107,Airbnb,5.2,Hispanic_or_Latino,Professionals
255,Airbnb,49.9,White,Professionals
483,Airbnb,2.7,Black_or_African_American,Professionals


In [15]:
#Rename ethnicity
new_distributions_df['ethnicity']=new_distributions_df['ethnicity'].replace({'Hispanic_or_Latino': 'Hispanic',
                                                                     'Black_or_African_American': 'African American',
                                                                     'Female_total':'Unknown',
                                                                     'White_female':'White',
                                                                     'Asian_femal':'Asian',
                                                                     'Black_or_African_American_female':'African American',
                                                                     'Hispanic_or_Latino_female':'Hispanic',
                                                                     'Women_of_color':'Others', 
                                                                     'Underrepresented_minorities_female':'Others',
                                                                      'People_of_color':'Others',
                                                                     'Underrepresented_minorities':'Others'})
new_distributions_df['ethnicity'].unique()

array(['Hispanic', 'White', 'African American', 'Asian', 'Unknown',
       'Asian_female', 'Others'], dtype=object)

In [16]:
#Spot check dataset by job_category
new_distributions_df['job_category'].unique()

array(['Professionals', 'Executives', 'All Workers',
       'Executives-Managers-Professionals', 'Executives and Managers',
       'Managers', 'Sales workers/admin support/technicians and others'],
      dtype=object)

In [17]:
#Rename job_category
new_distributions_df['job_category']=new_distributions_df['job_category'].replace({'Executives-Managers-Professionals':'Executives'
                                                                           ,'Executives and Managers':'Executives'
                                                                           ,'Sales workers/admin support/technicians and others':'All Workers'
                                                                          })
new_distributions_df['job_category'].unique()

array(['Professionals', 'Executives', 'All Workers', 'Managers'],
      dtype=object)

In [18]:
new_distributions_df[new_distributions_df.job_category.isin (['All Workers'])].head()

Unnamed: 0,company,ethnicity_pcn,ethnicity,job_category
4602,anonymous,1.0,Hispanic,All Workers
4603,anonymous,1.4,Hispanic,All Workers
4604,anonymous,1.9,Hispanic,All Workers
4605,anonymous,2.4,Hispanic,All Workers
4606,anonymous,2.5,Hispanic,All Workers


# Final Validation


In [19]:
new_distributions_df['company'].unique()

array(['anonymous', 'eBay', 'Nvidia', 'Uber', 'Square', 'Pinterest',
       'Salesforce', 'PayPal', 'MobileIron', 'Facebook', 'NetApp',
       'Twitter', 'Adobe', 'Google', 'Cisco', 'LinkedIn', 'Apple',
       'Airbnb', 'HPE', 'Intuit', 'Intel', 'HP Inc.', 'Sanmina',
       '23andMe', 'Lyft', 'View'], dtype=object)

In [20]:
new_distributions_df['ethnicity'].unique()

array(['Hispanic', 'White', 'African American', 'Asian', 'Unknown',
       'Asian_female', 'Others'], dtype=object)

In [21]:
new_distributions_df['job_category'].unique()

array(['Professionals', 'Executives', 'All Workers', 'Managers'],
      dtype=object)

In [22]:
new_distributions_df.dtypes

company           object
ethnicity_pcn    float64
ethnicity         object
job_category      object
dtype: object

# Import Dataset to PostgreSQL

In [23]:
import sqlalchemy as sa
from sqlalchemy import create_engine
from password import password

In [24]:
# engine = create_engine('postgresql+psycopg2://user:password@hostname/database_name')
engine = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/ETL_Project2')
connection = engine.connect()

In [25]:

#Get all table names from the database: Pewlett_Hackard
print (engine.table_names())

['diversity']


In [26]:
# new_diversity.to_sql(name=‘diversity’, con=engine, if_exists=‘append’, index=True)
new_distributions_df.to_sql('distributions', engine)

In [27]:
distributions=engine.execute('Select * from distributions')
distributions

<sqlalchemy.engine.result.ResultProxy at 0x25a23587a20>

In [28]:
print (engine.table_names())

['diversity', 'distributions']
