# Process PSID dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import xmltodict

## Parse the .xml codebook

In [3]:
with open("../data/psid/J322088_codebook.xml", "r", encoding = "cp1252") as file:
    xml_text = file.read()
file.close()

In [4]:
codebook_dict = xmltodict.parse(xml_text)
codebook_dict = codebook_dict["CODEXML"]['LIST_JOBID']['JOBID']['LIST_VARIABLE']['VARIABLE']

In [5]:
codebook_df = pd.DataFrame(codebook_dict)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE
0,1968,0,ER30000,RELEASE NUMBER,Release Number,,"{'CODE': {'VALUE': '1', 'TEXT': 'Release numbe..."
1,1968,0,ER30001,1968 INTERVIEW NUMBER,1968 Interview Number (1968 ID Number),This variable is the 1968 family ID number. Th...,"{'CODE': [{'VALUE': '1 - 2,930', 'TEXT': 'Memb..."
2,1968,0,ER30002,PERSON NUMBER 68,Person Number,,"{'CODE': [{'VALUE': '1 - 19', 'TEXT': 'Individ..."
3,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb..."
4,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter..."
...,...,...,...,...,...,...,...
428,2021,1,ER81836,IMP WEALTH W/O EQUITY (WEALTH1) 2021,"Constructed Wealth Variable, Excluding Equity","Constructed wealth variable, excluding equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'..."
429,2021,1,ER81838,IMP WEALTH W/ EQUITY (WEALTH2) 2021,"Constructed Wealth Variable, Including Equity.","Constructed wealth variable, including equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'..."
430,2021,2,ER34901,2021 INTERVIEW NUMBER,2021 Interview Number (2021 ID Number),The values for this variable represent the 202...,"{'CODE': [{'VALUE': '1 - 9,614', 'TEXT': '2021..."
431,2021,2,ER34902,SEQUENCE NUMBER 21,2021 Sequence Number,This variable provides a means of identifying ...,"{'CODE': [{'VALUE': '1 - 20', 'TEXT': 'Individ..."


In [6]:
# standardize the column/label names
def label_reorg_func(label:str):
    if "FAMILY INTERVIEW (ID) NUMBER" in label:
        return "family_interview_id"
    if "TOTAL FAMILY INCOME" in label:
        return "total_family_income"
    if "WTR DONATED TO ORGANIZATN FOR HEALTH" in label:
        return "wtr_donated_to_health_org"
    if "DOLLAR AMT OF HEALTH DONATIONS" in label:
        return "amt_of_health_donations"
    if "SEQUENCE NUMBER" in label:
        return "sequence_number"
    if "SEX OF" in label:
        return "sex"
    if "RELATION TO" in label:
        return "relation_to_rp"
    if "CHILDREN" in label:
        return "number_of_children_in_fu"
    if ("HEALTH" in label) and ("STATUS" in label):
        return "health_status"
    if "RELIGIOUS PREF" in label:
        return "religion"
    if "MARITAL STATUS" in label:
        return "marital_status"
    if "RACE OF" in label:
        return "race"
    if "AGE OF" in label:
        return "age"
    if "IMP WEALTH W/O EQUITY" in label:
        return "wealth_wo_equity"
    if "IMP WEALTH W/ EQUITY" in label:
        return "wealth_with_equity"
    if "WTR DONATION&gt;25" in label:
        return "wtr_donated" 
    if "WTR DONATED TO RELIGIOUS ORGANIZATION" in label:
        return "wtr_donated_to_religious_org"
    if "DOLLAR AMT OF RELIGIOUS DONATIONS" in label:
        return "amt_of_religious_donations"
    if "WTR DONATD TO COMBO PURPOSE ORGANIZTN" in label:
        return "wtr_donated_to_combo_purpose_org"
    if "DOLLAR AMT OF COMBO DONATIONS" in label:
        return "amt_of_combo_donations"
    if "WTR DONATED TO ORGANIZATION FOR NEEDY" in label:
        return "wtr_donated_to_org_for_needy"
    if "DOLLAR AMT OF NEEDY DONATIONS" in label:
        return "amt_of_needy_donations"
    if "WTR DONATED TO ORGANZTION FOR EDUCATN" in label:
        return "wtr_donated_to_edu_org"
    if "DOLLAR AMT OF EDUCATION DONATIONS" in label:
        return "amt_of_edu_donations"
    if "WTR DONATED TO YOUTH ORGANIZATIONS" in label:
        return "wtr_donated_to_youth_org"
    if "DOLLAR AMT OF YOUTH ORG DONATIONS" in label:
        return "amt_of_youth_donations"
    if "WTR DONATED TO CULTURAL ORGS" in label:
        return "wtr_donated_to_cultural_org"
    if "DOLLAR AMT OF CULTURAL DONATIONS" in label:
        return "amt_of_cultural_donations"
    if "WTR DONATED TO COMMUNITY ORGS" in label:
        return "wtr_donated_to_community_org"
    if "DOLLAR AMT OF COMMUNITY DONATIONS" in label:
        return "amt_of_community_donations"
    if "WTR DONATED TO ENVIRONMENT ORGS" in label:
        return "wtr_donated_to_env_org"
    if "DOLLAR AMT OF ENVIRONMENT DONATION" in label:
        return "amt_of_env_donations"
    if "WTR DONATED TO OTHER ORGANIZATIONS" in label:
        return "wtr_donated_to_other"
    if ("AMT DONATED TO ORGS IN T7A-F" in label) or ("DOLLAR AMT OF OTHER DONATIONS" in label):
        return "amt_of_other_donations"
    if "WTR DONATED TO INTERNATL/PEACE ORG" in label:
        return "wtr_donated_intl/peace_org"
    if "DOLLAR AMT OF INTERNATL/PEACE DNTN" in label:
        return "amt_of_intl/peace_donations"
    if "PERSON NUMBER" in label:
        return "person_number"
    return label.replace(" ", "_").lower()

codebook_df["STANDARDIZED_LABEL"] = codebook_df.LABEL.apply(label_reorg_func)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE,STANDARDIZED_LABEL
0,1968,0,ER30000,RELEASE NUMBER,Release Number,,"{'CODE': {'VALUE': '1', 'TEXT': 'Release numbe...",release_number
1,1968,0,ER30001,1968 INTERVIEW NUMBER,1968 Interview Number (1968 ID Number),This variable is the 1968 family ID number. Th...,"{'CODE': [{'VALUE': '1 - 2,930', 'TEXT': 'Memb...",1968_interview_number
2,1968,0,ER30002,PERSON NUMBER 68,Person Number,,"{'CODE': [{'VALUE': '1 - 19', 'TEXT': 'Individ...",person_number
3,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb...",release_number
4,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter...",family_interview_id
...,...,...,...,...,...,...,...,...
428,2021,1,ER81836,IMP WEALTH W/O EQUITY (WEALTH1) 2021,"Constructed Wealth Variable, Excluding Equity","Constructed wealth variable, excluding equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'...",wealth_wo_equity
429,2021,1,ER81838,IMP WEALTH W/ EQUITY (WEALTH2) 2021,"Constructed Wealth Variable, Including Equity.","Constructed wealth variable, including equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'...",wealth_with_equity
430,2021,2,ER34901,2021 INTERVIEW NUMBER,2021 Interview Number (2021 ID Number),The values for this variable represent the 202...,"{'CODE': [{'VALUE': '1 - 9,614', 'TEXT': '2021...",2021_interview_number
431,2021,2,ER34902,SEQUENCE NUMBER 21,2021 Sequence Number,This variable provides a means of identifying ...,"{'CODE': [{'VALUE': '1 - 20', 'TEXT': 'Individ...",sequence_number


In [7]:
codebook_df.STANDARDIZED_LABEL.unique()

array(['release_number', '1968_interview_number', 'person_number',
       'family_interview_id', 'psid_state_of_residence_code', 'age',
       'sex', 'number_of_children_in_fu', '1968_family_identifier',
       'marital_status', 'health_status', 'race', 'religion',
       'wtr_donated', 'wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'wtr_donated_to_edu_org',
       'amt_of_edu_donations', 't7a_wtr_donated_to_youth/family_organztn',
       't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
       't7c_wtr_donatd_to_neighborhood/community',
       't7d_wtr_donated_to_environmental_org',
       't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
       't7f_wtr_donated_to_other_charity', 'amt_of_other_donations',
       'total_family_social_security_income-2000', 'total_family_income',
  

In [8]:
code_mapping_dict_year = codebook_df.set_index("NAME").YEAR.to_dict()
code_mapping_dict_stdlab = codebook_df.set_index("NAME").STANDARDIZED_LABEL.to_dict()

# Wrangle .csv for analysis

In [9]:
# This line does NOT work because the .xlsx is encoded with "synchVertical" rather than "syncVertical",
# and oepnpyxl does not support synchVertical.
# Current solution: use local MS Excel to export data manually to a .csv

# psid_raw = pd.read_excel("../data/psid/J321540.xlsx", sheet_name="Data")

psid_raw = pd.read_csv("../data/psid/J322088.csv")
psid_raw

Unnamed: 0,ER30000,ER30001,ER30002,ER17001,ER17002,ER17004,ER17013,ER17014,ER17016,ER17022,...,ER81406,ER81407,ER81414,ER81416,ER81775,ER81836,ER81838,ER34901,ER34902,ER34903
0,1,4,3,7.0,96.0,41.0,49.0,1.0,1.0,4.0,...,,,,,,,,0,0,0
1,1,4,4,7.0,5987.0,41.0,47.0,2.0,0.0,4.0,...,,,,,,,,0,0,0
2,1,4,6,7.0,5599.0,15.0,43.0,1.0,0.0,4.0,...,0.0,0.0,5.0,0.0,51560.0,2000.0,2000.0,7137,81,10
3,1,4,7,7.0,7091.0,41.0,41.0,2.0,1.0,4.0,...,0.0,0.0,5.0,0.0,11720.0,50000.0,230000.0,4890,1,10
4,1,4,8,7.0,5964.0,41.0,36.0,1.0,3.0,4.0,...,,,,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,1,6872,172,7.0,5477.0,12.0,39.0,1.0,4.0,6872.0,...,,,,,,,,0,0,0
17803,1,6872,174,7.0,5477.0,12.0,39.0,1.0,4.0,6872.0,...,,,,,,,,0,0,0
17804,1,6872,180,7.0,5477.0,12.0,39.0,1.0,4.0,6872.0,...,,,,,,,,0,0,0
17805,1,6872,182,7.0,3616.0,12.0,44.0,1.0,2.0,6872.0,...,,,,,,,,0,0,0


In [10]:
# stash each year's income, contribution, etc. into same columns.
# Strategy: map each var's year and label in codebook and do a groupby

psid_year_grouped = psid_raw.groupby(by = code_mapping_dict_year, axis=1)
#psid_processed = pd.concat([psid_year.rename(columns=code_mapping_dict_stdlab).assign(YEAR = year) for year, psid_year in psid_year_grouped], axis=0)

# this cannot be used as is because families missing certain
# years of interviews are still kept in the original .csv with
# a value of nan in the year's variables
psid_year_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3947b6e640>

In [11]:
psid_processed = pd.concat([psid_year.rename(columns=code_mapping_dict_stdlab).assign(year = year) for year, psid_year in psid_year_grouped], axis=1)

In [12]:
psid_processed

Unnamed: 0,release_number,1968_interview_number,person_number,year,release_number.1,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,...,mcovid4_dollar_amt_pandemic_donations,wtr_donated_to_other,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,2021_interview_number,sequence_number,relation_to_rp,year.1
0,1,4,3,1968,7.0,96.0,41.0,49.0,1.0,1.0,...,,,,,,,0,0,0,2021
1,1,4,4,1968,7.0,5987.0,41.0,47.0,2.0,0.0,...,,,,,,,0,0,0,2021
2,1,4,6,1968,7.0,5599.0,15.0,43.0,1.0,0.0,...,0.0,5.0,0.0,51560.0,2000.0,2000.0,7137,81,10,2021
3,1,4,7,1968,7.0,7091.0,41.0,41.0,2.0,1.0,...,0.0,5.0,0.0,11720.0,50000.0,230000.0,4890,1,10,2021
4,1,4,8,1968,7.0,5964.0,41.0,36.0,1.0,3.0,...,,,,,,,0,0,0,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,1,6872,172,1968,7.0,5477.0,12.0,39.0,1.0,4.0,...,,,,,,,0,0,0,2021
17803,1,6872,174,1968,7.0,5477.0,12.0,39.0,1.0,4.0,...,,,,,,,0,0,0,2021
17804,1,6872,180,1968,7.0,5477.0,12.0,39.0,1.0,4.0,...,,,,,,,0,0,0,2021
17805,1,6872,182,1968,7.0,3616.0,12.0,44.0,1.0,2.0,...,,,,,,,0,0,0,2021


In [13]:
psid_processed["individual_id"] = psid_processed["1968_interview_number"] * 1000 + psid_processed["person_number"]

In [15]:
psid_processed.columns.unique()

Index(['release_number', '1968_interview_number', 'person_number', 'year',
       'family_interview_id', 'psid_state_of_residence_code', 'age', 'sex',
       'number_of_children_in_fu', '1968_family_identifier', 'marital_status',
       'health_status', 'race', 'religion', 'wtr_donated',
       'wtr_donated_to_religious_org', 'amt_of_religious_donations',
       'wtr_donated_to_combo_purpose_org', 'amt_of_combo_donations',
       'wtr_donated_to_org_for_needy', 'amt_of_needy_donations',
       'wtr_donated_to_health_org', 'amt_of_health_donations',
       'wtr_donated_to_edu_org', 'amt_of_edu_donations',
       't7a_wtr_donated_to_youth/family_organztn',
       't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
       't7c_wtr_donatd_to_neighborhood/community',
       't7d_wtr_donated_to_environmental_org',
       't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
       't7f_wtr_donated_to_other_charity', 'amt_of_other_donations',
       'total_family_social_security_income-2000', 'total_family_inc

In [16]:
psid_sub = psid_processed.drop(columns = ['release_number', '1968_interview_number', 'person_number','2003_interview_number', '2005_interview_number',
       '2007_interview_number', '2009_interview_number',
       '2011_interview_number', '2013_interview_number',
       '2015_interview_number', '2017_interview_number',
       '2019_interview_number','2001_interview_number','2021_interview_number'])

In [17]:
psid_sub

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,mcovid4_dollar_amt_pandemic_donations,wtr_donated_to_other,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,sequence_number,relation_to_rp,year.1,individual_id
0,1968,96.0,41.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,,,,,,,0,0,2021,4003
1,1968,5987.0,41.0,47.0,2.0,0.0,4.0,4.0,3.0,1.0,...,,,,,,,0,0,2021,4004
2,1968,5599.0,15.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,0.0,5.0,0.0,51560.0,2000.0,2000.0,81,10,2021,4006
3,1968,7091.0,41.0,41.0,2.0,1.0,4.0,4.0,2.0,1.0,...,0.0,5.0,0.0,11720.0,50000.0,230000.0,1,10,2021,4007
4,1968,5964.0,41.0,36.0,1.0,3.0,4.0,1.0,1.0,1.0,...,,,,,,,0,0,2021,4008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,,0,0,2021,6872172
17803,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,,0,0,2021,6872174
17804,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,,0,0,2021,6872180
17805,1968,3616.0,12.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,,,,,,,0,0,2021,6872182


### Clean the 2021 data

In [26]:
psid_sub['wtr_donated_2021'] = np.where( ((psid_sub['mgroup1_wtr_donate_combo/needy/med/intl'] == 1) | (psid_sub['mgroup2_wtr_donate_educ/youth/cultr/envr'] == 1) | (psid_sub['mcovid1_wtr_donate_to_help_re_pandemic'] == 1)), 1, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  psid_sub['wtr_donated_2021'] = np.where( ((psid_sub['mgroup1_wtr_donate_combo/needy/med/intl'] == 1) | (psid_sub['mgroup2_wtr_donate_educ/youth/cultr/envr'] == 1) | (psid_sub['mcovid1_wtr_donate_to_help_re_pandemic'] == 1)), 1, 5)


In [27]:
psid_sub

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,wtr_donated_to_other,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,sequence_number,relation_to_rp,year.1,individual_id,wtr_donated_2021
0,1968,96.0,41.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,,,,,,0,0,2021,4003,5
1,1968,5987.0,41.0,47.0,2.0,0.0,4.0,4.0,3.0,1.0,...,,,,,,0,0,2021,4004,5
2,1968,5599.0,15.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,5.0,0.0,51560.0,2000.0,2000.0,81,10,2021,4006,5
3,1968,7091.0,41.0,41.0,2.0,1.0,4.0,4.0,2.0,1.0,...,5.0,0.0,11720.0,50000.0,230000.0,1,10,2021,4007,5
4,1968,5964.0,41.0,36.0,1.0,3.0,4.0,1.0,1.0,1.0,...,,,,,,0,0,2021,4008,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,0,0,2021,6872172,5
17803,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,0,0,2021,6872174,5
17804,1968,5477.0,12.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,,,,,,0,0,2021,6872180,5
17805,1968,3616.0,12.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,,,,,,0,0,2021,6872182,5


In [28]:
psid_sub = psid_sub.drop(columns = ['mgroup1_wtr_donate_combo/needy/med/intl', 'mgroup2_wtr_donate_educ/youth/cultr/envr', 'total_family_social_security_income-2000', 'm52k2_wtr_donated_$25_or_more'])

In [125]:
reg_raw = psid_processed.drop(columns = ['release_number', '1968_interview_number', 'person_number','wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations','wtr_donated_to_edu_org',
       'amt_of_edu_donations', 'amt_of_other_donations','2003_interview_number', '2005_interview_number',
       '2007_interview_number', '2009_interview_number',
       '2011_interview_number', '2013_interview_number',
       '2015_interview_number', '2017_interview_number',
       '2019_interview_number','2001_interview_number','wtr_donated_to_youth_org',
       'amt_of_youth_donations', 'wtr_donated_to_cultural_org',
       'amt_of_cultural_donations', 'wtr_donated_to_community_org',
       'amt_of_community_donations', 'wtr_donated_to_env_org',
       'amt_of_env_donations', 'wtr_donated_intl/peace_org',
       'amt_of_intl/peace_donations', 'wtr_donated_to_other'])

In [126]:
reg_raw.columns.unique()

Index(['year', 'family_interview_id', 'psid_state_of_residence_code',
       'family_composition_change', 'age', 'sex', 'number_of_children_in_fu',
       '1968_family_identifier', 'marital_status', 'health_status', 'race',
       'religion', 'wtr_donated', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'total_family_income', 'wealth_wo_equity',
       'wealth_with_equity', 'sequence_number', 'relation_to_rp',
       'individual_id'],
      dtype='object')

In [18]:
for i in psid_processed_sub.columns.unique():
    print(i)

NameError: name 'psid_processed_sub' is not defined

## Wide to Long

In [143]:
reg_raw.to_csv("reg_raw.csv", index = False)

In [127]:
#test = psid_processed_sub.dropna()
test = reg_raw.dropna()

In [128]:
test

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,...,wtr_donated,wtr_donated_to_health_org,amt_of_health_donations,total_family_income,wealth_wo_equity,wealth_with_equity,sequence_number,relation_to_rp,year.1,individual_id
2,1968,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,...,1.0,5.0,0.0,11028.0,1200.0,1200.0,1,10,2019,4006
5,1968,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,...,5.0,0.0,0.0,9380.0,0.0,0.0,1,10,2019,4031
11,1968,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,...,5.0,0.0,0.0,31000.0,14000.0,14000.0,1,10,2019,4039
13,1968,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,...,5.0,0.0,0.0,65000.0,0.0,20000.0,2,22,2019,4041
15,1968,5964.0,41.0,1.0,36.0,1.0,3.0,4.0,1.0,1.0,...,5.0,0.0,0.0,72200.0,15200.0,15200.0,2,20,2019,4180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17030,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,5.0,0.0,0.0,7518.0,1000.0,1000.0,1,10,2019,6872002
17031,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,5.0,0.0,0.0,103080.0,3900.0,78900.0,1,10,2019,6872003
17032,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,5.0,0.0,0.0,38000.0,34000.0,34000.0,1,10,2019,6872031
17035,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,5.0,0.0,0.0,97000.0,225600.0,255600.0,1,10,2019,6872174


In [86]:
test = test.drop(columns = ["release_number", "1968_interview_number", "person_number", '2001_interview_number', '2003_interview_number',
       '2005_interview_number', '2007_interview_number',
       '2009_interview_number', '2011_interview_number',
       '2013_interview_number', '2015_interview_number',
       '2017_interview_number', '2019_interview_number'])

In [129]:
test.columns.unique()

Index(['year', 'family_interview_id', 'psid_state_of_residence_code',
       'family_composition_change', 'age', 'sex', 'number_of_children_in_fu',
       '1968_family_identifier', 'marital_status', 'health_status', 'race',
       'religion', 'wtr_donated', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'total_family_income', 'wealth_wo_equity',
       'wealth_with_equity', 'sequence_number', 'relation_to_rp',
       'individual_id'],
      dtype='object')

In [131]:
cols = list(test.columns)
len(cols)
y = 2001
for i in range(1, len(cols) - 1, 20):
    for j in range(i, i + 20):
        cols[j] += '-' + str(y)
    y += 2

In [132]:
cols

['year',
 'family_interview_id-2001',
 'psid_state_of_residence_code-2001',
 'family_composition_change-2001',
 'age-2001',
 'sex-2001',
 'number_of_children_in_fu-2001',
 '1968_family_identifier-2001',
 'marital_status-2001',
 'health_status-2001',
 'race-2001',
 'religion-2001',
 'wtr_donated-2001',
 'wtr_donated_to_health_org-2001',
 'amt_of_health_donations-2001',
 'total_family_income-2001',
 'wealth_wo_equity-2001',
 'wealth_with_equity-2001',
 'sequence_number-2001',
 'relation_to_rp-2001',
 'year-2001',
 'family_interview_id-2003',
 'psid_state_of_residence_code-2003',
 'family_composition_change-2003',
 '1968_family_identifier-2003',
 'age-2003',
 'sex-2003',
 'number_of_children_in_fu-2003',
 'marital_status-2003',
 'health_status-2003',
 'race-2003',
 'religion-2003',
 'wtr_donated-2003',
 'wtr_donated_to_health_org-2003',
 'amt_of_health_donations-2003',
 'total_family_income-2003',
 'wealth_wo_equity-2003',
 'wealth_with_equity-2003',
 'sequence_number-2003',
 'relation_to

In [133]:
test.columns = cols

In [134]:
del test["year"]

In [144]:
test

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,wtr_donated-2019,wtr_donated_to_health_org-2019,amt_of_health_donations-2019,total_family_income-2019,wealth_wo_equity-2019,wealth_with_equity-2019,sequence_number-2019,relation_to_rp-2019,year-2019,individual_id
2,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,1.0,5.0,0.0,11028.0,1200.0,1200.0,1,10,2019,4006
5,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,5.0,0.0,0.0,9380.0,0.0,0.0,1,10,2019,4031
11,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,5.0,0.0,0.0,31000.0,14000.0,14000.0,1,10,2019,4039
13,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,5.0,0.0,0.0,65000.0,0.0,20000.0,2,22,2019,4041
15,5964.0,41.0,1.0,36.0,1.0,3.0,4.0,1.0,1.0,1.0,...,5.0,0.0,0.0,72200.0,15200.0,15200.0,2,20,2019,4180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17030,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,5.0,0.0,0.0,7518.0,1000.0,1000.0,1,10,2019,6872002
17031,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,5.0,0.0,0.0,103080.0,3900.0,78900.0,1,10,2019,6872003
17032,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,5.0,0.0,0.0,38000.0,34000.0,34000.0,1,10,2019,6872031
17035,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,2.0,...,5.0,0.0,0.0,97000.0,225600.0,255600.0,1,10,2019,6872174


In [154]:
tutorial = test[(test['sequence_number-2001'] >= 1) & (test['sequence_number-2001'] <= 20) & (test['relation_to_rp-2001'] == 10) & (test['sequence_number-2001'] == 1) &
                (test['sequence_number-2003'] >= 1) & (test['sequence_number-2003'] <= 20) & (test['relation_to_rp-2003'] == 10) & (test['sequence_number-2003'] == 1) &
                (test['sequence_number-2005'] >= 1) & (test['sequence_number-2005'] <= 20) & (test['relation_to_rp-2005'] == 10) & (test['sequence_number-2005'] == 1) &
                (test['sequence_number-2007'] >= 1) & (test['sequence_number-2007'] <= 20) & (test['relation_to_rp-2007'] == 10) & (test['sequence_number-2007'] == 1) &
                (test['sequence_number-2009'] >= 1) & (test['sequence_number-2009'] <= 20) & (test['relation_to_rp-2009'] == 10) & (test['sequence_number-2009'] == 1) &
                (test['sequence_number-2011'] >= 1) & (test['sequence_number-2011'] <= 20) & (test['relation_to_rp-2011'] == 10) & (test['sequence_number-2011'] == 1) &
                (test['sequence_number-2013'] >= 1) & (test['sequence_number-2013'] <= 20) & (test['relation_to_rp-2013'] == 10) & (test['sequence_number-2013'] == 1) &
                (test['sequence_number-2015'] >= 1) & (test['sequence_number-2015'] <= 20) & (test['relation_to_rp-2015'] == 10) & (test['sequence_number-2015'] == 1) &
                (test['sequence_number-2017'] >= 1) & (test['sequence_number-2017'] <= 20) & (test['relation_to_rp-2017'] == 10) & (test['sequence_number-2017'] == 1) &
                (test['sequence_number-2019'] >= 1) & (test['sequence_number-2019'] <= 20) & (test['relation_to_rp-2019'] == 10) & (test['sequence_number-2019'] == 1)]

In [155]:
tutorial

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,wtr_donated-2019,wtr_donated_to_health_org-2019,amt_of_health_donations-2019,total_family_income-2019,wealth_wo_equity-2019,wealth_with_equity-2019,sequence_number-2019,relation_to_rp-2019,year-2019,individual_id
32,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,1.0,5.0,0.0,67000.0,124000.0,324000.0,1,10,2019,5003
47,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,1.0,5.0,0.0,243000.0,2550000.0,2700000.0,1,10,2019,6006
52,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,5.0,0.0,0.0,29100.0,40000.0,155000.0,1,10,2019,6170
55,3133.0,34.0,1.0,47.0,1.0,0.0,7.0,1.0,5.0,1.0,...,5.0,0.0,0.0,7596.0,7020.0,7020.0,1,10,2019,7003
56,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,5.0,0.0,0.0,23928.0,0.0,0.0,1,10,2019,7004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16987,4711.0,4.0,0.0,42.0,1.0,1.0,6854.0,2.0,3.0,2.0,...,5.0,0.0,0.0,46772.0,5000.0,5000.0,1,10,2019,6854004
17002,4961.0,19.0,1.0,42.0,2.0,1.0,6862.0,1.0,3.0,2.0,...,1.0,5.0,0.0,65000.0,40000.0,40000.0,1,10,2019,6862008
17008,5956.0,42.0,0.0,47.0,2.0,1.0,6864.0,4.0,2.0,2.0,...,5.0,0.0,0.0,29000.0,9000.0,9000.0,1,10,2019,6864002
17016,5955.0,42.0,4.0,24.0,1.0,0.0,6864.0,2.0,2.0,2.0,...,5.0,0.0,0.0,56000.0,100000.0,140000.0,1,10,2019,6864177


In [156]:
tutorial.to_csv("tutorial.csv", index = False)

In [150]:
same = test[test['family_composition_change-2003'].isin([0, 1, 2]) &
            test['family_composition_change-2005'].isin([0, 1, 2]) &
            test['family_composition_change-2007'].isin([0, 1, 2]) &
            test['family_composition_change-2009'].isin([0, 1, 2]) &
            test['family_composition_change-2011'].isin([0, 1, 2]) &
            test['family_composition_change-2013'].isin([0, 1, 2]) &
            test['family_composition_change-2015'].isin([0, 1, 2]) &
            test['family_composition_change-2017'].isin([0, 1, 2]) &
            test['family_composition_change-2019'].isin([0, 1, 2])]
same

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,wtr_donated-2019,wtr_donated_to_health_org-2019,amt_of_health_donations-2019,total_family_income-2019,wealth_wo_equity-2019,wealth_with_equity-2019,sequence_number-2019,relation_to_rp-2019,year-2019,individual_id
32,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,1.0,5.0,0.0,67000.0,124000.0,324000.0,1,10,2019,5003
47,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,1.0,5.0,0.0,243000.0,2550000.0,2700000.0,1,10,2019,6006
52,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,5.0,0.0,0.0,29100.0,40000.0,155000.0,1,10,2019,6170
55,3133.0,34.0,1.0,47.0,1.0,0.0,7.0,1.0,5.0,1.0,...,5.0,0.0,0.0,7596.0,7020.0,7020.0,1,10,2019,7003
56,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,5.0,0.0,0.0,23928.0,0.0,0.0,1,10,2019,7004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16987,4711.0,4.0,0.0,42.0,1.0,1.0,6854.0,2.0,3.0,2.0,...,5.0,0.0,0.0,46772.0,5000.0,5000.0,1,10,2019,6854004
17002,4961.0,19.0,1.0,42.0,2.0,1.0,6862.0,1.0,3.0,2.0,...,1.0,5.0,0.0,65000.0,40000.0,40000.0,1,10,2019,6862008
17008,5956.0,42.0,0.0,47.0,2.0,1.0,6864.0,4.0,2.0,2.0,...,5.0,0.0,0.0,29000.0,9000.0,9000.0,1,10,2019,6864002
17016,5955.0,42.0,4.0,24.0,1.0,0.0,6864.0,2.0,2.0,2.0,...,5.0,0.0,0.0,56000.0,100000.0,140000.0,1,10,2019,6864177


In [151]:
same.to_csv("same_raw.csv", index = False)

In [135]:
test.to_csv("wide_raw.csv", index = False)

In [136]:
wide_raw = pd.read_csv("wide_raw.csv")

In [106]:
cols = list(wide_raw.columns)

tobeinserted = cols[390: 401]

tobeinserted

cols_sub = cols[0:389]

for i in range(len(tobeinserted) - 1, -1, -1):
    cols_sub.insert(22, tobeinserted[i])

cols_sub

test = wide_raw[cols_sub]

test["individual_id"] = wide_raw["individual_id"]

test

In [141]:
long_raw = pd.wide_to_long(test, ['family_interview_id',
       'psid_state_of_residence_code', 'family_composition_change', 'age',
       'sex', 'number_of_children_in_fu', '1968_family_identifier',
       'marital_status', 'health_status', 'race', 'religion', 'wtr_donated',
       'wtr_donated_to_health_org', 'amt_of_health_donations',
       'total_family_income', 'wealth_wo_equity',
       'wealth_with_equity', 'sequence_number',
       'relation_to_rp',  'year'],  i = ["individual_id"], j = "YEAR", sep = '-')