In [2]:
#This is a test notebook. 

import pandas as pd
import csv

# Clear any stale output of this cell so that whatever is displayed below is
# known to come from the current run. One call is enough.
from IPython.display import clear_output
clear_output()

# Location of the right-to-work-by-state table.
RTW_BY_STATE_URL = 'https://raw.githubusercontent.com/EvanLih/PUBPOL599_Right_To_Work/master/Dataset/Right%20to%20Work%20by%20State.csv'

# Import the FIPS / right-to-work dataset and trim it in one pass:
#   * keep only the first column (state name) and the third column
#     (right-to-work flag); the second column and everything after the third
#     are not needed;
#   * skip the first row, which holds column labels rather than values;
#   * give the two remaining columns readable names.
fipdata = (
    pd.read_csv(RTW_BY_STATE_URL)
    .iloc[1:, [0, 2]]
    .rename(columns={'Right to Work by State': 'State', 'Unnamed: 2': 'Right_to_Work'})
)

# Add District of Columbia, which is not present in the original table.
# DataFrame.append no longer exists in pandas 2.x, so the extra row is added
# with pd.concat; ignore_index=True renumbers the rows from 0.
# NOTE(review): DC is hard-coded as Right_to_Work = 1 -- confirm this is the
# intended coding. The appended value is an int; the values read from the CSV
# may be strings because the label row forces a text column -- TODO confirm.
dc_row = pd.DataFrame([{'State': 'District of Columbia', 'Right_to_Work': 1}])
fipdata = pd.concat([fipdata, dc_row], ignore_index=True)

# Display the dataframe to make sure it looks OK.
fipdata


Unnamed: 0,State,Right_to_Work
0,Alabama,1
1,Alaska,0
2,Arizona,1
3,Arkansas,1
4,California,0
5,Colorado,0
6,Connecticut,0
7,Delaware,0
8,Florida,1
9,Georgia,1


In [5]:
# Clear any stale output of this cell so that whatever is displayed below is
# known to come from the current run. One call is enough.
from IPython.display import clear_output
clear_output()

# Location of the IPUMS extract (Stata format).
IPUMS_URL = 'https://raw.githubusercontent.com/EvanLih/PUBPOL599_Right_To_Work/master/ipums.dta'

# Read the IPUMS dataset and, in preparation for the merge, rename its state
# column so that it matches the 'State' column of fipdata. The rename is
# chained rather than done in place so the cell can be re-run safely.
ipumsdata = pd.read_stata(IPUMS_URL).rename(columns={'statefip': 'State'})

# List the distinct states to double check that the load and rename worked.
test = ipumsdata['State'].unique()
list(test)


['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [52]:
# Preview the first ten rows of the IPUMS data as a quick sanity check.
ipumsdata.head(n=10)

Unnamed: 0,serial,State,sex,age,school,inctot,incwage
0,1.0,Alabama,Male,73,"No, not in school",10000,0
1,2.0,Alabama,Female,31,"No, not in school",38500,38500
2,3.0,Alabama,Male,41,"No, not in school",82000,72000
3,3.0,Alabama,Female,48,"No, not in school",8700,0
4,3.0,Alabama,Male,16,"Yes, in school",0,0
5,4.0,Alabama,Female,37,"No, not in school",18300,18000
6,4.0,Alabama,Female,18,"Yes, in school",0,0
7,4.0,Alabama,Male,17,"Yes, in school",8800,0
8,4.0,Alabama,Female,7,"Yes, in school",9999999,999999
9,4.0,Alabama,Female,3,"No, not in school",9999999,999999


In [7]:
# Clear any stale output of this cell so that whatever is displayed below is
# known to come from the current run. One call is enough.
from IPython.display import clear_output
clear_output()

# Values used by the cleaning steps below.
INCWAGE_NA = 999999                             # incwage code treated as N/A
AGE_TOPCODE_LABEL = "90 (90+ in 1980 and 1990)"  # non-numeric age label
MIN_WORKING_AGE = 25
MAX_WORKING_AGE = 65

# Merge the two dataframes into combinedResults and clean the result.
# how='right' keeps every IPUMS row, attaching the state's Right_to_Work value.
combinedResults = (
    pd.merge(fipdata, ipumsdata, on='State', how='right')
    # Drop the total income column; only wage income is analysed. Dropping by
    # name (instead of by position) fails loudly if the layout ever changes.
    .drop(columns='inctot')
    # Drop rows whose incwage is the N/A code.
    .loc[lambda df: df['incwage'] != INCWAGE_NA]
    # Drop the age label that cannot be converted to a number.
    .loc[lambda df: df['age'] != AGE_TOPCODE_LABEL]
    # Convert age to numeric. assign() returns a new frame, which avoids
    # writing into a filtered slice of the merged data.
    .assign(age=lambda df: pd.to_numeric(df['age']))
    # Limit results to the working population aged 25-65. between() is
    # inclusive on both ends, so ages 25 and 65 themselves are kept.
    # NOTE(review): strict comparisons (> 25, < 65) would keep only 26-64;
    # confirm the inclusive range is the intended one.
    .loc[lambda df: df['age'].between(MIN_WORKING_AGE, MAX_WORKING_AGE)]
)

# Replace the state name with the state you want to see. In this case, it is California.
# A simple test to see if the Right to Work column is outputting the correct value.
# Only the first ten matching rows are shown to keep the output readable.
combinedResults[combinedResults['State'].str.contains("California")].head(10)

# The cleaned dataset keeps State, Right_to_Work, serial, sex, age, school and
# incwage. inctot (total income) is left out, as we are primarily looking at
# wage data.

Unnamed: 0,State,Right_to_Work,serial,sex,age,school,incwage
153275,California,0,68211.0,Male,45,"No, not in school",40000
153276,California,0,68211.0,Female,39,"Yes, in school",4000
153278,California,0,68212.0,Male,34,"No, not in school",130000
153279,California,0,68212.0,Female,33,"No, not in school",0
153283,California,0,68214.0,Male,35,"No, not in school",58000
153284,California,0,68214.0,Female,31,"No, not in school",38000
153288,California,0,68216.0,Male,47,"No, not in school",0
153289,California,0,68216.0,Female,41,"No, not in school",70000
153290,California,0,68217.0,Female,31,"No, not in school",76000
153291,California,0,68217.0,Male,34,"No, not in school",81000


In [17]:
# With the finalized dataset, write the dataframe out as a CSV for
# R data visualization / statistical analysis.
import os

# Single definition of the output path so the write and the existence check
# always refer to the same file (relative to the current working directory).
OUTPUT_PATH = 'Cleaned_RTW_Dataset'
combinedResults.to_csv(OUTPUT_PATH)

# Check that the file exists and report the outcome.
test = os.path.isfile(OUTPUT_PATH)

if test:
    print('Cleaned_RTW_Dataset was created. Please see "dataset" file to find')
else:
    print('Dataset was not created properly. Please retry!')



this works


In [9]:
# Preview the first ten rows of the cleaned, merged dataset.
combinedResults.head(n=10)

Unnamed: 0,State,Right_to_Work,serial,sex,age,school,incwage
1,Alabama,1,2.0,Female,31,"No, not in school",38500
2,Alabama,1,3.0,Male,41,"No, not in school",72000
3,Alabama,1,3.0,Female,48,"No, not in school",0
5,Alabama,1,4.0,Female,37,"No, not in school",18000
10,Alabama,1,5.0,Male,32,"No, not in school",65000
11,Alabama,1,5.0,Female,54,"No, not in school",57000
19,Alabama,1,9.0,Male,32,"No, not in school",43500
20,Alabama,1,9.0,Male,60,"No, not in school",0
21,Alabama,1,9.0,Female,58,"No, not in school",0
22,Alabama,1,10.0,Male,61,"No, not in school",160000
