# Dataset: Tech Sector Diversity 2016 

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import date

In [2]:
#Load the data
diversity_demo= pd.read_csv ("Resources/tech_sector_diversity_demographics_2016.csv")

In [3]:
diversity_df=pd.DataFrame(diversity_demo)
diversity_df.head(3)

Unnamed: 0,job_category,race_ethnicity,gender,count,percentage
0,All workers,White,Male,268883,41.257252
1,All workers,White,Female,105560,16.197065
2,All workers,Black_or_African American,Male,17508,2.686417


# Step 1) Inspect Data

In [4]:
#Identify how many rows and columns in this dataset
diversity_df.shape

(44, 5)

In [5]:
#Verify Data type in this dataset
diversity_df.dtypes
diversity_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 5 columns):
job_category      44 non-null object
race_ethnicity    44 non-null object
gender            44 non-null object
count             44 non-null int64
percentage        44 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 9.2 KB


In [6]:
#Identify any N/A or missing data, if 
# diversity_df.isnull().any().any()
diversity_df.isna().sum()

job_category      0
race_ethnicity    0
gender            0
count             0
percentage        0
dtype: int64

# Step 2) Clean and Transform data

In [7]:
diversity_df.columns

Index(['job_category', 'race_ethnicity', 'gender', 'count', 'percentage'], dtype='object')

In [8]:
#rename columns names
diversity_df=diversity_df.rename(columns={'race_ethnicity':'ethnicity',
                            'count': 'headcount',
                            'percentage': 'headcount_pcn'})

In [9]:
diversity_df.columns

Index(['job_category', 'ethnicity', 'gender', 'headcount', 'headcount_pcn'], dtype='object')

In [10]:
#Verify and ensure job_category is properly assigned
diversity_df['job_category'].unique()

array(['All workers', 'Executives', 'Managers', 'Professionals'],
      dtype=object)

In [11]:
#spot check any values that is not ethnicity related then re-assigned to type: unknown 
diversity_df['ethnicity'].unique()

array(['White', 'Black_or_African American', 'Asian',
       'Hispanic_or_Latino', 'All', 'Totals'], dtype=object)

Replace values

In [12]:
# Replace values for "race_ethnicity" and "gender" columns

diversity_df['ethnicity']=diversity_df['ethnicity'].replace({'Black_or_African American': 'African American',
                                                             'Hispanic_or_Latino': 'Hispanic',
                                                                    'All': 'Others',
                                                                    'Totals': 'Others'})
diversity_df['ethnicity'].unique()

array(['White', 'African American', 'Asian', 'Hispanic', 'Others'],
      dtype=object)

In [13]:
#spot check detail values under gender column
diversity_df['gender'].unique()

array(['Male', 'Female', 'Both'], dtype=object)

In [14]:
#replace 'Both' to "Unknown"
diversity_df['gender']=diversity_df['gender'].replace({'Both': 'Unknown'})
diversity_df.head(2)

Unnamed: 0,job_category,ethnicity,gender,headcount,headcount_pcn
0,All workers,White,Male,268883,41.257252
1,All workers,White,Female,105560,16.197065


# Final Validation 

In [15]:
diversity_df['ethnicity'].unique()

array(['White', 'African American', 'Asian', 'Hispanic', 'Others'],
      dtype=object)

In [16]:
diversity_df['gender'].unique()

array(['Male', 'Female', 'Unknown'], dtype=object)

In [17]:
# Add timeline to the dataset
diversity_df.loc[:,'year']=2016
diversity_df.head(2)

Unnamed: 0,job_category,ethnicity,gender,headcount,headcount_pcn,year
0,All workers,White,Male,268883,41.257252,2016
1,All workers,White,Female,105560,16.197065,2016


In [18]:
#Re-arrange column order in dataframe
new_diversity=diversity_df.loc[:, ::-1].head()

In [19]:
new_diversity

Unnamed: 0,year,headcount_pcn,headcount,gender,ethnicity,job_category
0,2016,41.257252,268883,Male,White,All workers
1,2016,16.197065,105560,Female,White,All workers
2,2016,2.686417,17508,Male,African American,All workers
3,2016,1.761331,11479,Female,African American,All workers
4,2016,19.233171,125347,Male,Asian,All workers


In [20]:
new_diversity.dtypes

year               int64
headcount_pcn    float64
headcount          int64
gender            object
ethnicity         object
job_category      object
dtype: object

# Import Dataset to PostgreSQL

In [21]:
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import create_engine

from password import password

In [22]:
# engine = create_engine('postgresql+psycopg2://user:password@hostname/database_name')
engine = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/ETL_Project2')
connection = engine.connect()

In [23]:

#Get all table names from the database: Pewlett_Hackard
print (engine.table_names())

[]


In [24]:
# new_diversity.to_sql(name=‘diversity’, con=engine, if_exists=‘append’, index=True)
new_diversity.to_sql('diversity', engine)

In [25]:
diversity=engine.execute('Select * from diversity')
diversity

<sqlalchemy.engine.result.ResultProxy at 0x1febd20c358>

In [26]:
print (engine.table_names())

['diversity']
