# Process PSID dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import xmltodict

## Parse the .xml codebook

In [72]:
# Read the raw codebook XML as text (the file is cp1252-encoded).
# The `with` block closes the handle on exit, so no explicit close() is needed.
with open("../data/psid/J322214_codebook.xml", "r", encoding = "cp1252") as file:
    xml_text = file.read()

In [73]:
# Parse the codebook XML and keep only the per-variable entries nested under
# CODEXML > LIST_JOBID > JOBID > LIST_VARIABLE > VARIABLE.
parsed_codebook = xmltodict.parse(xml_text)
variable_list = parsed_codebook["CODEXML"]['LIST_JOBID']['JOBID']['LIST_VARIABLE']
codebook_dict = variable_list['VARIABLE']

In [74]:
# Tabulate the codebook entries: one row per PSID variable, with the entry's
# fields (YEAR, TYPE_ID, NAME, LABEL, ...) as columns.
codebook_df = pd.DataFrame(codebook_dict)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE
0,1968,0,ER30000,RELEASE NUMBER,Release Number,,"{'CODE': {'VALUE': '1', 'TEXT': 'Release numbe..."
1,1968,0,ER30001,1968 INTERVIEW NUMBER,1968 Interview Number (1968 ID Number),This variable is the 1968 family ID number. Th...,"{'CODE': [{'VALUE': '1 - 2,930', 'TEXT': 'Memb..."
2,1968,0,ER30002,PERSON NUMBER 68,Person Number,,"{'CODE': [{'VALUE': '1 - 19', 'TEXT': 'Individ..."
3,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb..."
4,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter..."
...,...,...,...,...,...,...,...
450,2021,1,ER81838,IMP WEALTH W/ EQUITY (WEALTH2) 2021,"Constructed Wealth Variable, Including Equity.","Constructed wealth variable, including equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'..."
451,2021,1,ER81958,2021 CORE/IMMIGRANT FAM WEIGHT NUMBER 1,2021 Core/Immigrant Family Longitudinal Weight,The weight is constructed by summing the indiv...,"{'CODE': {'VALUE': '.001 - 200.000', 'TEXT': '..."
452,2021,2,ER34901,2021 INTERVIEW NUMBER,2021 Interview Number (2021 ID Number),The values for this variable represent the 202...,"{'CODE': [{'VALUE': '1 - 9,614', 'TEXT': '2021..."
453,2021,2,ER34902,SEQUENCE NUMBER 21,2021 Sequence Number,This variable provides a means of identifying ...,"{'CODE': [{'VALUE': '1 - 20', 'TEXT': 'Individ..."


In [75]:
# standardize the column/label names
# Ordered (required substrings, standardized name) rules. The first rule whose
# substrings ALL occur in the label wins, so the order is significant: e.g. the
# health-donation rules must precede the generic HEALTH + STATUS rule, and the
# broad "WEIGHT" rule must stay last.
_LABEL_RULES = (
    (("FAMILY INTERVIEW (ID) NUMBER",), "family_interview_id"),
    (("TOTAL FAMILY INCOME",), "total_family_income"),
    (("WTR DONATED TO ORGANIZATN FOR HEALTH",), "wtr_donated_to_health_org"),
    (("DOLLAR AMT OF HEALTH DONATIONS",), "amt_of_health_donations"),
    (("SEQUENCE NUMBER",), "sequence_number"),
    (("SEX OF",), "sex"),
    (("RELATION TO",), "relation_to_rp"),
    (("CHILDREN",), "number_of_children_in_fu"),
    (("HEALTH", "STATUS"), "health_status"),
    (("RELIGIOUS PREF",), "religion"),
    (("MARITAL STATUS",), "marital_status"),
    (("RACE OF",), "race"),
    (("AGE OF",), "age"),
    (("IMP WEALTH W/O EQUITY",), "wealth_wo_equity"),
    (("IMP WEALTH W/ EQUITY",), "wealth_with_equity"),
    # The label may carry the HTML-escaped entity or the decoded character,
    # depending on how the codebook text was escaped; accept both spellings.
    (("WTR DONATION&gt;25",), "wtr_donated"),
    (("WTR DONATION>25",), "wtr_donated"),
    (("WTR DONATED TO RELIGIOUS ORGANIZATION",), "wtr_donated_to_religious_org"),
    (("DOLLAR AMT OF RELIGIOUS DONATIONS",), "amt_of_religious_donations"),
    (("WTR DONATD TO COMBO PURPOSE ORGANIZTN",), "wtr_donated_to_combo_purpose_org"),
    (("DOLLAR AMT OF COMBO DONATIONS",), "amt_of_combo_donations"),
    (("WTR DONATED TO ORGANIZATION FOR NEEDY",), "wtr_donated_to_org_for_needy"),
    (("DOLLAR AMT OF NEEDY DONATIONS",), "amt_of_needy_donations"),
    (("WTR DONATED TO ORGANZTION FOR EDUCATN",), "wtr_donated_to_edu_org"),
    (("DOLLAR AMT OF EDUCATION DONATIONS",), "amt_of_edu_donations"),
    (("WTR DONATED TO YOUTH ORGANIZATIONS",), "wtr_donated_to_youth_org"),
    (("DOLLAR AMT OF YOUTH ORG DONATIONS",), "amt_of_youth_donations"),
    (("WTR DONATED TO CULTURAL ORGS",), "wtr_donated_to_cultural_org"),
    (("DOLLAR AMT OF CULTURAL DONATIONS",), "amt_of_cultural_donations"),
    (("WTR DONATED TO COMMUNITY ORGS",), "wtr_donated_to_community_org"),
    (("DOLLAR AMT OF COMMUNITY DONATIONS",), "amt_of_community_donations"),
    (("WTR DONATED TO ENVIRONMENT ORGS",), "wtr_donated_to_env_org"),
    (("DOLLAR AMT OF ENVIRONMENT DONATION",), "amt_of_env_donations"),
    (("WTR DONATED TO OTHER ORGANIZATIONS",), "wtr_donated_to_other"),
    # Two different source labels map onto the same "other donations" amount.
    (("AMT DONATED TO ORGS IN T7A-F",), "amt_of_other_donations"),
    (("DOLLAR AMT OF OTHER DONATIONS",), "amt_of_other_donations"),
    (("WTR DONATED TO INTERNATL/PEACE ORG",), "wtr_donated_intl/peace_org"),
    (("DOLLAR AMT OF INTERNATL/PEACE DNTN",), "amt_of_intl/peace_donations"),
    (("PERSON NUMBER",), "person_number"),
    (("WEIGHT",), "weight"),
)


def label_reorg_func(label: str) -> str:
    """Map a raw codebook LABEL to a year-independent standardized column name.

    The label is checked against ``_LABEL_RULES`` in order; the first rule whose
    substrings all occur in ``label`` determines the result. Matching is
    case-sensitive substring matching.

    Parameters
    ----------
    label : str
        Raw variable label from the codebook (e.g. "SEQUENCE NUMBER 21").

    Returns
    -------
    str
        The standardized name of the first matching rule, or, when no rule
        matches, the label itself lower-cased with spaces replaced by
        underscores.
    """
    for required_fragments, standardized_name in _LABEL_RULES:
        if all(fragment in label for fragment in required_fragments):
            return standardized_name
    # Fallback: keep the original wording in snake_case form.
    return label.replace(" ", "_").lower()

# Attach the standardized name of every variable next to its raw LABEL.
raw_labels = codebook_df["LABEL"]
codebook_df["STANDARDIZED_LABEL"] = raw_labels.apply(label_reorg_func)
codebook_df

Unnamed: 0,YEAR,TYPE_ID,NAME,LABEL,QTEXT,ETEXT,LIST_CODE,STANDARDIZED_LABEL
0,1968,0,ER30000,RELEASE NUMBER,Release Number,,"{'CODE': {'VALUE': '1', 'TEXT': 'Release numbe...",release_number
1,1968,0,ER30001,1968 INTERVIEW NUMBER,1968 Interview Number (1968 ID Number),This variable is the 1968 family ID number. Th...,"{'CODE': [{'VALUE': '1 - 2,930', 'TEXT': 'Memb...",1968_interview_number
2,1968,0,ER30002,PERSON NUMBER 68,Person Number,,"{'CODE': [{'VALUE': '1 - 19', 'TEXT': 'Individ...",person_number
3,2001,1,ER17001,RELEASE NUMBER,Release Number,"May 2008: Variables ER20395-ER20458, formerly...","{'CODE': [{'VALUE': '1', 'TEXT': 'Release numb...",release_number
4,2001,1,ER17002,2001 FAMILY INTERVIEW (ID) NUMBER,2001 Interview Number,The values for this variable represent the 200...,"{'CODE': {'VALUE': '1 - 7,457', 'TEXT': 'Inter...",family_interview_id
...,...,...,...,...,...,...,...,...
450,2021,1,ER81838,IMP WEALTH W/ EQUITY (WEALTH2) 2021,"Constructed Wealth Variable, Including Equity.","Constructed wealth variable, including equity....","{'CODE': [{'VALUE': '-99,999,997 - -1', 'TEXT'...",wealth_with_equity
451,2021,1,ER81958,2021 CORE/IMMIGRANT FAM WEIGHT NUMBER 1,2021 Core/Immigrant Family Longitudinal Weight,The weight is constructed by summing the indiv...,"{'CODE': {'VALUE': '.001 - 200.000', 'TEXT': '...",weight
452,2021,2,ER34901,2021 INTERVIEW NUMBER,2021 Interview Number (2021 ID Number),The values for this variable represent the 202...,"{'CODE': [{'VALUE': '1 - 9,614', 'TEXT': '2021...",2021_interview_number
453,2021,2,ER34902,SEQUENCE NUMBER 21,2021 Sequence Number,This variable provides a means of identifying ...,"{'CODE': [{'VALUE': '1 - 20', 'TEXT': 'Individ...",sequence_number


In [76]:
codebook_df.STANDARDIZED_LABEL.unique()

array(['release_number', '1968_interview_number', 'person_number',
       'family_interview_id', 'psid_state_of_residence_code',
       'family_composition_change', 'age', 'sex',
       'number_of_children_in_fu', '1968_family_identifier',
       'marital_status', 'health_status', 'race', 'religion',
       'wtr_donated', 'wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'wtr_donated_to_edu_org',
       'amt_of_edu_donations', 't7a_wtr_donated_to_youth/family_organztn',
       't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
       't7c_wtr_donatd_to_neighborhood/community',
       't7d_wtr_donated_to_environmental_org',
       't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
       't7f_wtr_donated_to_other_charity', 'amt_of_other_donations',
       'weight', 'total_family_social_se

In [77]:
# Lookup tables keyed by PSID variable name (e.g. "ER30001"):
# variable -> survey year, and variable -> standardized label.
codebook_by_name = codebook_df.set_index("NAME")
code_mapping_dict_year = codebook_by_name["YEAR"].to_dict()
code_mapping_dict_stdlab = codebook_by_name["STANDARDIZED_LABEL"].to_dict()

# Wrangle .csv for analysis

In [78]:
# This line does NOT work because the .xlsx is encoded with "synchVertical" rather than "syncVertical",
# and openpyxl does not support synchVertical.
# Current workaround: export the data manually to a .csv using a local copy of MS Excel.
# NOTE(review): the commented-out workbook (J321540) carries a different job id than the
# CSV read below (J322214) -- confirm both refer to the intended extract.

# psid_raw = pd.read_excel("../data/psid/J321540.xlsx", sheet_name="Data")

psid_raw = pd.read_csv("../data/psid/J322214.csv")
psid_raw

Unnamed: 0,ER30000,ER30001,ER30002,ER17001,ER17002,ER17004,ER17007,ER17013,ER17014,ER17016,...,ER81407,ER81414,ER81416,ER81775,ER81836,ER81838,ER81958,ER34901,ER34902,ER34903
0,1,4,3,7.0,96.0,41.0,1.0,49.0,1.0,1.0,...,,,,,,,,0,0,0
1,1,4,4,7.0,5987.0,41.0,0.0,47.0,2.0,0.0,...,,,,,,,,0,0,0
2,1,4,5,7.0,6872.0,41.0,1.0,57.0,1.0,1.0,...,,,,,,,,0,0,0
3,1,4,6,7.0,5599.0,15.0,0.0,43.0,1.0,0.0,...,0.0,5.0,0.0,51560.0,2000.0,2000.0,23.045,7137,81,10
4,1,4,7,7.0,7091.0,41.0,0.0,41.0,2.0,1.0,...,0.0,5.0,0.0,11720.0,50000.0,230000.0,40.128,4890,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,1,6872,181,7.0,5477.0,12.0,0.0,39.0,1.0,4.0,...,,,,,,,,0,0,0
43012,1,6872,182,7.0,3616.0,12.0,0.0,44.0,1.0,2.0,...,,,,,,,,0,0,0
43013,1,6872,183,7.0,3616.0,12.0,0.0,44.0,1.0,2.0,...,0.0,5.0,0.0,160413.0,63500.0,133500.0,2.247,5084,1,10
43014,1,6872,184,7.0,5477.0,12.0,0.0,39.0,1.0,4.0,...,,,,,,,,0,0,0


In [79]:
# stash each year's income, contribution, etc. into same columns.
# Strategy: map each var's year and label in codebook and group the columns by year

# Collect the column names that belong to each survey year. This replaces
# `DataFrame.groupby(..., axis=1)`, which is deprecated since pandas 2.1.
# Columns that are absent from the codebook mapping are skipped, exactly as
# the column-wise groupby dropped them.
columns_by_year = {}
for column_name in psid_raw.columns:
    survey_year = code_mapping_dict_year.get(column_name)
    if survey_year is not None:
        columns_by_year.setdefault(survey_year, []).append(column_name)

# (year, sub-frame) pairs in sorted year order -- the same pairs that iterating
# over the groupby object produced, so downstream `for year, psid_year in ...`
# loops keep working unchanged.
psid_year_grouped = [(year, psid_raw[year_columns]) for year, year_columns in sorted(columns_by_year.items())]
psid_processed_long = pd.concat([psid_year.rename(columns=code_mapping_dict_stdlab).assign(YEAR = year) for year, psid_year in psid_year_grouped], axis=0)

# this cannot be used as is because families missing certain
# years of interviews are still kept in the original .csv with
# a value of nan in the year's variables


In [80]:
psid_processed_long

Unnamed: 0,release_number,1968_interview_number,person_number,YEAR,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,...,2015_interview_number,2017_interview_number,2019_interview_number,mgroup1_wtr_donate_combo/needy/med/intl,mgroup2_wtr_donate_educ/youth/cultr/envr,mcovid1_wtr_donate_to_help_re_pandemic,mcovid2_wtr_pandemic_donations_crowdfund,mcovid3_largest_crowdfund_donation_to,mcovid4_dollar_amt_pandemic_donations,2021_interview_number
0,1.0,4.0,3.0,1968,,,,,,,...,,,,,,,,,,
1,1.0,4.0,4.0,1968,,,,,,,...,,,,,,,,,,
2,1.0,4.0,5.0,1968,,,,,,,...,,,,,,,,,,
3,1.0,4.0,6.0,1968,,,,,,,...,,,,,,,,,,
4,1.0,4.0,7.0,1968,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,,,,2021,,,,,,,...,,,,,,,,,,0.0
43012,,,,2021,,,,,,,...,,,,,,,,,,0.0
43013,1.0,,,2021,5084.0,42.0,0.0,41.0,1.0,2.0,...,,,,5.0,5.0,5.0,0.0,0.0,0.0,5084.0
43014,,,,2021,,,,,,,...,,,,,,,,,,0.0


In [81]:
# Wide layout: rename each year's columns to their standardized labels, tag the
# sub-frame with a `year` column, and place the sub-frames side by side (axis=1)
# so that every row remains one individual.
# NOTE(review): the standardized labels (and the added `year` column) repeat in
# every sub-frame, so the result contains duplicate column names; label-based
# selection such as psid_processed["year"] therefore returns several columns.
psid_processed = pd.concat([psid_year.rename(columns=code_mapping_dict_stdlab).assign(year = year) for year, psid_year in psid_year_grouped], axis=1)

In [82]:
psid_processed

Unnamed: 0,release_number,1968_interview_number,person_number,year,release_number.1,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,...,wtr_donated_to_other,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,2021_interview_number,sequence_number,relation_to_rp,year.1
0,1,4,3,1968,7.0,96.0,41.0,1.0,49.0,1.0,...,,,,,,,0,0,0,2021
1,1,4,4,1968,7.0,5987.0,41.0,0.0,47.0,2.0,...,,,,,,,0,0,0,2021
2,1,4,5,1968,7.0,6872.0,41.0,1.0,57.0,1.0,...,,,,,,,0,0,0,2021
3,1,4,6,1968,7.0,5599.0,15.0,0.0,43.0,1.0,...,5.0,0.0,51560.0,2000.0,2000.0,23.045,7137,81,10,2021
4,1,4,7,1968,7.0,7091.0,41.0,0.0,41.0,2.0,...,5.0,0.0,11720.0,50000.0,230000.0,40.128,4890,1,10,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,1,6872,181,1968,7.0,5477.0,12.0,0.0,39.0,1.0,...,,,,,,,0,0,0,2021
43012,1,6872,182,1968,7.0,3616.0,12.0,0.0,44.0,1.0,...,,,,,,,0,0,0,2021
43013,1,6872,183,1968,7.0,3616.0,12.0,0.0,44.0,1.0,...,5.0,0.0,160413.0,63500.0,133500.0,2.247,5084,1,10,2021
43014,1,6872,184,1968,7.0,5477.0,12.0,0.0,39.0,1.0,...,,,,,,,0,0,0,2021


In [83]:
# Build one person-level key from the 1968 interview number and the person
# number, e.g. interview 4 / person 3 -> 4003.
# NOTE(review): the key is only unique while person_number < 1000 -- presumably
# guaranteed by the PSID coding; verify against the codebook.
psid_processed["individual_id"] = psid_processed["1968_interview_number"] * 1000 + psid_processed["person_number"]

In [84]:
psid_processed.columns.unique()

Index(['release_number', '1968_interview_number', 'person_number', 'year',
       'family_interview_id', 'psid_state_of_residence_code',
       'family_composition_change', 'age', 'sex', 'number_of_children_in_fu',
       '1968_family_identifier', 'marital_status', 'health_status', 'race',
       'religion', 'wtr_donated', 'wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'wtr_donated_to_edu_org',
       'amt_of_edu_donations', 't7a_wtr_donated_to_youth/family_organztn',
       't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
       't7c_wtr_donatd_to_neighborhood/community',
       't7d_wtr_donated_to_environmental_org',
       't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
       't7f_wtr_donated_to_other_charity', 'amt_of_other_donations', 'weight',
       'total_family_social_s

In [173]:
# Drop bookkeeping columns: the release number, the two raw id parts (already
# combined into individual_id) and every per-wave "<year>_interview_number".
id_part_cols = ['release_number', '1968_interview_number', 'person_number']
interview_number_cols = [str(wave) + '_interview_number' for wave in range(2001, 2022, 2)]
psid_sub = psid_processed.drop(columns = id_part_cols + interview_number_cols)

In [174]:
psid_sub

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,...,wtr_donated_to_other,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,sequence_number,relation_to_rp,year.1,individual_id
0,1968,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,...,,,,,,,0,0,2021,4003
1,1968,5987.0,41.0,0.0,47.0,2.0,0.0,4.0,4.0,3.0,...,,,,,,,0,0,2021,4004
2,1968,6872.0,41.0,1.0,57.0,1.0,1.0,4.0,1.0,3.0,...,,,,,,,0,0,2021,4005
3,1968,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,...,5.0,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006
4,1968,7091.0,41.0,0.0,41.0,2.0,1.0,4.0,4.0,2.0,...,5.0,0.0,11720.0,50000.0,230000.0,40.128,1,10,2021,4007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,,0,0,2021,6872181
43012,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,,,,,,,0,0,2021,6872182
43013,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,5.0,0.0,160413.0,63500.0,133500.0,2.247,1,10,2021,6872183
43014,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,,0,0,2021,6872184


### Clean the 2021 data

In [175]:
# 2021 has no single "donated?" column, so derive one from three 2021 questions:
# the value is 1 when any of them equals 1 and 5 otherwise (rows where all three
# are missing therefore also get 5).
answered_yes_2021 = (
    psid_sub['mgroup1_wtr_donate_combo/needy/med/intl'].eq(1)
    | psid_sub['mgroup2_wtr_donate_educ/youth/cultr/envr'].eq(1)
    | psid_sub['mcovid1_wtr_donate_to_help_re_pandemic'].eq(1)
)
psid_sub['wtr_donated-2021'] = np.where(answered_yes_2021, 1, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  psid_sub['wtr_donated-2021'] = np.where( ((psid_sub['mgroup1_wtr_donate_combo/needy/med/intl'] == 1) | (psid_sub['mgroup2_wtr_donate_educ/youth/cultr/envr'] == 1) | (psid_sub['mcovid1_wtr_donate_to_help_re_pandemic'] == 1)), 1, 5)


In [176]:
psid_sub

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,...,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,sequence_number,relation_to_rp,year.1,individual_id,wtr_donated-2021
0,1968,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,...,,,,,,0,0,2021,4003,5
1,1968,5987.0,41.0,0.0,47.0,2.0,0.0,4.0,4.0,3.0,...,,,,,,0,0,2021,4004,5
2,1968,6872.0,41.0,1.0,57.0,1.0,1.0,4.0,1.0,3.0,...,,,,,,0,0,2021,4005,5
3,1968,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,...,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5
4,1968,7091.0,41.0,0.0,41.0,2.0,1.0,4.0,4.0,2.0,...,0.0,11720.0,50000.0,230000.0,40.128,1,10,2021,4007,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872181,5
43012,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872182,5
43013,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,160413.0,63500.0,133500.0,2.247,1,10,2021,6872183,5
43014,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872184,5


In [177]:
# Remove the two grouped 2021 questions (already folded into 'wtr_donated-2021')
# together with two further columns that are not used downstream.
unused_cols = [
    'mgroup1_wtr_donate_combo/needy/med/intl',
    'mgroup2_wtr_donate_educ/youth/cultr/envr',
    'total_family_social_security_income-2000',
    'm52k2_wtr_donated_$25_or_more',
]
psid_sub = psid_sub.drop(columns = unused_cols)

In [178]:
psid_sub

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,...,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,sequence_number,relation_to_rp,year.1,individual_id,wtr_donated-2021
0,1968,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,...,,,,,,0,0,2021,4003,5
1,1968,5987.0,41.0,0.0,47.0,2.0,0.0,4.0,4.0,3.0,...,,,,,,0,0,2021,4004,5
2,1968,6872.0,41.0,1.0,57.0,1.0,1.0,4.0,1.0,3.0,...,,,,,,0,0,2021,4005,5
3,1968,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,...,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5
4,1968,7091.0,41.0,0.0,41.0,2.0,1.0,4.0,4.0,2.0,...,0.0,11720.0,50000.0,230000.0,40.128,1,10,2021,4007,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43011,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872181,5
43012,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872182,5
43013,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,160413.0,63500.0,133500.0,2.247,1,10,2021,6872183,5
43014,1968,5477.0,12.0,0.0,39.0,1.0,4.0,6872.0,1.0,4.0,...,,,,,,0,0,2021,6872184,5


In [179]:
psid_sub.columns.unique()

Index(['year', 'family_interview_id', 'psid_state_of_residence_code',
       'family_composition_change', 'age', 'sex', 'number_of_children_in_fu',
       '1968_family_identifier', 'marital_status', 'health_status', 'race',
       'religion', 'wtr_donated', 'wtr_donated_to_religious_org',
       'amt_of_religious_donations', 'wtr_donated_to_combo_purpose_org',
       'amt_of_combo_donations', 'wtr_donated_to_org_for_needy',
       'amt_of_needy_donations', 'wtr_donated_to_health_org',
       'amt_of_health_donations', 'wtr_donated_to_edu_org',
       'amt_of_edu_donations', 't7a_wtr_donated_to_youth/family_organztn',
       't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
       't7c_wtr_donatd_to_neighborhood/community',
       't7d_wtr_donated_to_environmental_org',
       't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
       't7f_wtr_donated_to_other_charity', 'amt_of_other_donations', 'weight',
       'total_family_income', 'wealth_wo_equity', 'wealth_with_equity',
       'sequence_number'

In [205]:
# Drop the per-category "whether donated" indicators; only the overall
# indicator and the dollar amounts are kept.
t7_indicator_cols = [
    't7a_wtr_donated_to_youth/family_organztn',
    't7b_wtr_donatd_to_arts/cultr/ethnic_assn',
    't7c_wtr_donatd_to_neighborhood/community',
    't7d_wtr_donated_to_environmental_org',
    't7e_wtr_donatd_to_intrnat_aid/wrld_peace',
    't7f_wtr_donated_to_other_charity',
]
category_indicator_cols = [
    'wtr_donated_to_religious_org',
    'wtr_donated_to_combo_purpose_org',
    'wtr_donated_to_org_for_needy',
    'wtr_donated_to_edu_org',
    'wtr_donated_to_cultural_org',
    'wtr_donated_to_youth_org',
    'wtr_donated_to_community_org',
    'wtr_donated_to_env_org',
    'wtr_donated_intl/peace_org',
    'wtr_donated_to_other',
]
psid = psid_sub.drop(columns = t7_indicator_cols + category_indicator_cols)

In [210]:
# drop individuals that missed any interview over 20 years
# to be modified if we are considering more recent / younger / more families
# NOTE: dropna() with default arguments removes a row when ANY column is NaN,
# so a single missing value in any wave drops the individual entirely.
psid = psid.dropna()

In [297]:
# NOTE(review): this call fails with "ValueError: Grouper for 'year' not
# 1-dimensional" because the wide frame holds several columns named 'year'
# (one per wave), so psid["year"] is a DataFrame rather than a Series.
# A per-year summary presumably has to wait for the long format -- TODO confirm
# the intended analysis and move or remove this cell.
psid.groupby("year").wtr_donated.describe()

ValueError: Grouper for 'year' not 1-dimensional

## Wide to Long

In [211]:
test = psid.copy(deep = True)

In [212]:
test

Unnamed: 0,year,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,...,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,sequence_number,relation_to_rp,year.1,individual_id,wtr_donated-2021
3,1968,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,...,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5
6,1968,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,...,0.0,9528.0,0.0,0.0,30.877,1,10,2021,4031,5
14,1968,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,...,0.0,231000.0,7500.0,7500.0,14.996,1,10,2021,4039,5
16,1968,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,...,0.0,35344.0,0.0,30000.0,14.723,2,22,2021,4041,5
17,1968,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,...,0.0,64800.0,32000.0,32000.0,2.820,1,10,2021,4042,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,21726.0,-15082.0,-15082.0,3.069,1,10,2021,6872031,5
42997,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,21726.0,-15082.0,-15082.0,3.069,2,30,2021,6872034,5
42998,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,160413.0,63500.0,133500.0,2.247,3,30,2021,6872035,5
43001,1968,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,...,0.0,160413.0,63500.0,133500.0,2.247,4,30,2021,6872038,5


In [213]:
len(test.columns)

331

In [214]:
# rename the columns
# label each column with year
# NOTE(review): the positions below are hard-coded for the current column
# layout (331 columns); re-check them whenever the set of columns changes.
cols = list(test.columns)

# (first index, one-past-last index, wave) for every block of columns.
wave_spans = [(1, 27, 2001)]
# 2003, 2005, ... occupy consecutive 30-column blocks starting at index 27.
wave_spans.extend(
    (block_start, block_start + 30, 2003 + 2 * block_no)
    for block_no, block_start in enumerate(range(27, len(cols) - 35, 30))
)
wave_spans.append((297, 329, 2021))

for first, stop, wave in wave_spans:
    for position in range(first, stop):
        cols[position] += '-' + str(wave)

In [215]:
cols

['year',
 'family_interview_id-2001',
 'psid_state_of_residence_code-2001',
 'family_composition_change-2001',
 'age-2001',
 'sex-2001',
 'number_of_children_in_fu-2001',
 '1968_family_identifier-2001',
 'marital_status-2001',
 'health_status-2001',
 'race-2001',
 'religion-2001',
 'wtr_donated-2001',
 'amt_of_religious_donations-2001',
 'amt_of_combo_donations-2001',
 'amt_of_needy_donations-2001',
 'wtr_donated_to_health_org-2001',
 'amt_of_health_donations-2001',
 'amt_of_edu_donations-2001',
 'amt_of_other_donations-2001',
 'weight-2001',
 'total_family_income-2001',
 'wealth_wo_equity-2001',
 'wealth_with_equity-2001',
 'sequence_number-2001',
 'relation_to_rp-2001',
 'year-2001',
 'family_interview_id-2003',
 'psid_state_of_residence_code-2003',
 'family_composition_change-2003',
 '1968_family_identifier-2003',
 'age-2003',
 'sex-2003',
 'number_of_children_in_fu-2003',
 'marital_status-2003',
 'health_status-2003',
 'race-2003',
 'religion-2003',
 'wtr_donated-2003',
 'amt_of_re

In [216]:
test.columns = cols

In [217]:
# Remove the only column still named plain "year" (position 0, which the
# renaming step left without a wave suffix).
del test["year"]

In [302]:
test['wtr_donated-2007'].value_counts()

1.0    8862
5.0    6633
0.0     143
9.0      15
8.0      12
Name: wtr_donated-2007, dtype: int64

AttributeError: 'DataFrame' object has no attribute 'year'

In [232]:
# delete records with DK/NA
test_nodk = test.copy(deep = True)

In [233]:
test_nodk

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,amt_of_other_donations-2021,total_family_income-2021,wealth_wo_equity-2021,wealth_with_equity-2021,weight-2021,sequence_number-2021,relation_to_rp-2021,year-2021,individual_id,wtr_donated-2021
3,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5
6,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,9528.0,0.0,0.0,30.877,1,10,2021,4031,5
14,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,0.0,231000.0,7500.0,7500.0,14.996,1,10,2021,4039,5
16,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,35344.0,0.0,30000.0,14.723,2,22,2021,4041,5
17,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,64800.0,32000.0,32000.0,2.820,1,10,2021,4042,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,21726.0,-15082.0,-15082.0,3.069,1,10,2021,6872031,5
42997,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,21726.0,-15082.0,-15082.0,3.069,2,30,2021,6872034,5
42998,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,160413.0,63500.0,133500.0,2.247,3,30,2021,6872035,5
43001,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,160413.0,63500.0,133500.0,2.247,4,30,2021,6872038,5


In [234]:
# Donation-amount columns available in the 2001 wave (defined once and reused).
amt_cols_2001 = [
    'amt_of_religious_donations-2001',
    'amt_of_combo_donations-2001',
    'amt_of_needy_donations-2001',
    'amt_of_health_donations-2001',
    'amt_of_edu_donations-2001',
    'amt_of_other_donations-2001',
]

# Turn the 2001 special codes 99999998 / 99999999 into NaN so that the
# dropna() below removes the affected records.
test_nodk[amt_cols_2001] = test_nodk[amt_cols_2001].replace([99999998, 99999999], np.nan)

# .copy() makes the filtered frame an independent object, so adding a column
# below no longer triggers SettingWithCopyWarning.
test_nodk = test_nodk.dropna().copy()

# Total 2001 donation = sum over all 2001 amount columns.
test_nodk['total_donation_amt-2001'] = test_nodk[amt_cols_2001].sum(axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_nodk['total_donation_amt-2001'] = test_nodk[['amt_of_religious_donations-2001',


In [235]:
test_nodk

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,total_family_income-2021,wealth_wo_equity-2021,wealth_with_equity-2021,weight-2021,sequence_number-2021,relation_to_rp-2021,year-2021,individual_id,wtr_donated-2021,total_donation_amt-2001
3,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5,500.0
6,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,9528.0,0.0,0.0,30.877,1,10,2021,4031,5,0.0
14,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,231000.0,7500.0,7500.0,14.996,1,10,2021,4039,5,0.0
16,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,35344.0,0.0,30000.0,14.723,2,22,2021,4041,5,0.0
17,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,64800.0,32000.0,32000.0,2.820,1,10,2021,4042,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,21726.0,-15082.0,-15082.0,3.069,1,10,2021,6872031,5,1000.0
42997,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,21726.0,-15082.0,-15082.0,3.069,2,30,2021,6872034,5,1000.0
42998,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,160413.0,63500.0,133500.0,2.247,3,30,2021,6872035,5,1000.0
43001,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,160413.0,63500.0,133500.0,2.247,4,30,2021,6872038,5,1000.0


In [236]:
def clean_amt_six(df, year):
    """Drop rows whose six-digit-coded donation amounts hold a special code.

    For the given wave, the values 999998 and 999999 in the religious and
    combo donation-amount columns are treated as missing; every row that then
    contains a NaN in ANY column is removed.

    The input frame is left untouched (the original implementation overwrote
    the two columns on the caller's frame as a side effect, which also raised
    SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame containing 'amt_of_religious_donations-<year>' and
        'amt_of_combo_donations-<year>'.
    year : str
        Wave suffix, e.g. "2003".

    Returns
    -------
    pandas.DataFrame
        A new frame without the affected rows.
    """
    six_digit_cols = ['amt_of_religious_donations-' + year, 'amt_of_combo_donations-' + year]

    # Work on a private copy so the caller's frame is never modified.
    cleaned = df.copy()
    cleaned[six_digit_cols] = cleaned[six_digit_cols].replace([999998, 999999], np.nan)

    return cleaned.dropna()

In [237]:
def clean_amt_five(df, year):
    """Drop rows whose five-digit-coded donation amounts hold a special code.

    For the given wave, the values 99998 and 99999 in the needy, health, edu,
    youth, cultural, community, env and other donation-amount columns are
    treated as missing; every row that then contains a NaN in ANY column is
    removed.

    The input frame is left untouched (the original implementation overwrote
    the columns on the caller's frame as a side effect, which also raised
    SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame containing the eight 'amt_of_*_donations-<year>' columns.
    year : str
        Wave suffix, e.g. "2003".

    Returns
    -------
    pandas.DataFrame
        A new frame without the affected rows.
    """
    five_digit_cols = [prefix + year for prefix in (
        'amt_of_needy_donations-',
        'amt_of_health_donations-',
        'amt_of_edu_donations-',
        'amt_of_youth_donations-',
        'amt_of_cultural_donations-',
        'amt_of_community_donations-',
        'amt_of_env_donations-',
        'amt_of_other_donations-',
    )]

    # Work on a private copy so the caller's frame is never modified.
    cleaned = df.copy()
    cleaned[five_digit_cols] = cleaned[five_digit_cols].replace([99998, 99999], np.nan)

    return cleaned.dropna()

In [238]:
def cal_total_amt_nodk(df, year):
    """Add 'total_donation_amt-<year>' as the row-wise sum of the wave's ten
    donation-amount columns.

    The column is written onto ``df`` itself, and the same object is returned.
    ``year`` is the wave suffix as a string, e.g. "2003".
    """
    column_prefixes = (
        'amt_of_religious_donations-',
        'amt_of_combo_donations-',
        'amt_of_needy_donations-',
        'amt_of_health_donations-',
        'amt_of_edu_donations-',
        'amt_of_youth_donations-',
        'amt_of_cultural_donations-',
        'amt_of_community_donations-',
        'amt_of_env_donations-',
        'amt_of_other_donations-',
    )
    amount_cols = [prefix + year for prefix in column_prefixes]

    wave_total = df[amount_cols].sum(axis = 1)
    df['total_donation_amt-' + year] = wave_total

    return df

In [239]:
# Waves 2003, 2005, ..., 2019: remove records with special codes in the
# six-digit and five-digit amount columns, then add the wave's total donation.
for wave in (str(survey_year) for survey_year in range(2003, 2020, 2)):
    test_nodk = clean_amt_six(test_nodk, wave)
    test_nodk = clean_amt_five(test_nodk, wave)
    test_nodk = cal_total_amt_nodk(test_nodk, wave)

test_nodk

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,total_donation_amt-2001,total_donation_amt-2003,total_donation_amt-2005,total_donation_amt-2007,total_donation_amt-2009,total_donation_amt-2011,total_donation_amt-2013,total_donation_amt-2015,total_donation_amt-2017,total_donation_amt-2019
3,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0
6,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0
14,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,0.0,50.0,600.0,275.0,0.0,300.0,0.0,0.0,0.0,0.0
16,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
17,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,1000.0,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0
42997,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,1000.0,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0
42998,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,1000.0,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43001,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,1000.0,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:
# Donation-amount columns used for the 2021 wave (defined once and reused).
# NOTE(review): there is no 'amt_of_community_donations-2021' entry here --
# presumably that column does not exist in 2021; confirm against the codebook.
amt_cols_2021 = [
    'amt_of_religious_donations-2021',
    'amt_of_combo_donations-2021',
    'amt_of_needy_donations-2021',
    'amt_of_health_donations-2021',
    'amt_of_edu_donations-2021',
    'amt_of_youth_donations-2021',
    'amt_of_cultural_donations-2021',
    'amt_of_env_donations-2021',
    'amt_of_other_donations-2021',
    'mcovid4_dollar_amt_pandemic_donations-2021',
]

# Turn the special codes 999998 / 999999 into NaN so that the dropna() below
# removes the affected records.
test_nodk[amt_cols_2021] = test_nodk[amt_cols_2021].replace([999998, 999999], np.nan)

# .copy() makes the filtered frame an independent object, so adding a column
# below no longer triggers SettingWithCopyWarning.
test_nodk = test_nodk.dropna().copy()

# Total 2021 donation = sum over all 2021 amount columns.
test_nodk['total_donation_amt-2021'] = test_nodk[amt_cols_2021].sum(axis = 1)

test_nodk

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_nodk['total_donation_amt-2021'] = test_nodk[['amt_of_religious_donations-2021',


Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,total_donation_amt-2003,total_donation_amt-2005,total_donation_amt-2007,total_donation_amt-2009,total_donation_amt-2011,total_donation_amt-2013,total_donation_amt-2015,total_donation_amt-2017,total_donation_amt-2019,total_donation_amt-2021
3,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0
6,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0
14,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,50.0,600.0,275.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0
16,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
17,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0,0.0
42997,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0,0.0
42998,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43001,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# add values together without dropping DK or NA

def cal_total_amt(df, year,
                  categories = ('religious', 'combo', 'needy', 'health', 'edu',
                                'youth', 'cultural', 'community', 'env', 'other'),
                  missing_codes = (999998, 999999, 99998, 99999)):
    """Zero out missing-value codes and add a per-wave donation total, in place.

    For every category ``c`` the column ``'amt_of_<c>_donations-<year>'`` is
    read from ``df``; any cell equal to one of ``missing_codes`` is replaced
    by 0, and the row-wise sum of those columns is stored in
    ``'total_donation_amt-<year>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding the per-category amount columns. Modified in place.
    year : str or int
        Wave suffix used in the column names (e.g. ``'2003'``).
    categories : iterable of str, optional
        Donation categories to include. Defaults to the ten categories the
        original implementation hard-coded.
    missing_codes : iterable, optional
        Sentinel values to recode to 0 before summing. Defaults to the codes
        the original implementation hard-coded.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any expected amount column is missing from ``df``.
    """
    year = str(year)
    # Build the column list once instead of repeating it for every statement.
    amount_cols = ['amt_of_' + category + '_donations-' + year
                   for category in categories]

    df[amount_cols] = df[amount_cols].replace(list(missing_codes), 0)
    df['total_donation_amt-' + year] = df[amount_cols].sum(axis = 1)

# Waves 2003, 2005, ..., 2019 share the same column layout, so the helper
# handles them; 2001 and 2021 are processed separately.
for survey_year in range(2003, 2020, 2):
    cal_total_amt(test, str(survey_year))

# 2001 wave: only six donation columns are used, and the codes recoded to 0
# are 99999998 / 99999999 rather than the ones used for 2003-2019.
donation_cols_2001 = [
    'amt_of_religious_donations-2001',
    'amt_of_combo_donations-2001',
    'amt_of_needy_donations-2001',
    'amt_of_health_donations-2001',
    'amt_of_edu_donations-2001',
    'amt_of_other_donations-2001',
]
test[donation_cols_2001] = test[donation_cols_2001].replace([99999998, 99999999], 0)
test['total_donation_amt-2001'] = test[donation_cols_2001].sum(axis = 1)

# 2021 wave: recode 999998 / 999999 to 0, then total the donation columns,
# including the COVID pandemic donation amount.
# NOTE(review): unlike 2003-2019, this list has no
# 'amt_of_community_donations-2021' entry — confirm the omission is intended.
donation_cols_2021 = [
    'amt_of_religious_donations-2021',
    'amt_of_combo_donations-2021',
    'amt_of_needy_donations-2021',
    'amt_of_health_donations-2021',
    'amt_of_edu_donations-2021',
    'amt_of_youth_donations-2021',
    'amt_of_cultural_donations-2021',
    'amt_of_env_donations-2021',
    'amt_of_other_donations-2021',
    'mcovid4_dollar_amt_pandemic_donations-2021',
]
test[donation_cols_2021] = test[donation_cols_2021].replace([999998, 999999], 0)
test['total_donation_amt-2021'] = test[donation_cols_2021].sum(axis = 1)

In [241]:
# Snapshot the current frame before `test` is rebound in a later cell
# (DataFrame.copy is deep by default).
test_backup = test.copy()

In [243]:
# Rebind `test` to an independent copy of test_nodk; everything below works
# on that variant (DataFrame.copy is deep by default).
test = test_nodk.copy()

In [244]:
# Remove every column whose name contains one of these fragments, in a single
# pass. 'amt_of_health' is not listed, so the health amount columns survive
# alongside the totals.
dropped_name_fragments = [
    'amt_of_religious',
    'amt_of_combo',
    'amt_of_needy',
    'amt_of_edu',
    'amt_of_youth',
    'amt_of_cultural',
    'amt_of_env',
    'amt_of_community',
    'amt_of_other',
    'mcovid',
]
columns_to_drop = test.filter(regex = '|'.join(dropped_name_fragments)).columns
test = test.drop(columns = columns_to_drop)

In [245]:
# Inspect the frame after the per-category amount columns were removed.
test

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,total_donation_amt-2003,total_donation_amt-2005,total_donation_amt-2007,total_donation_amt-2009,total_donation_amt-2011,total_donation_amt-2013,total_donation_amt-2015,total_donation_amt-2017,total_donation_amt-2019,total_donation_amt-2021
3,5599.0,15.0,0.0,43.0,1.0,0.0,4.0,4.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0
6,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0
14,96.0,41.0,1.0,49.0,1.0,1.0,4.0,1.0,1.0,1.0,...,50.0,600.0,275.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0
16,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
17,285.0,41.0,4.0,28.0,1.0,2.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0,0.0
42997,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,30.0,300.0,0.0,0.0,0.0,0.0,0.0
42998,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43001,3616.0,12.0,0.0,44.0,1.0,2.0,6872.0,1.0,4.0,2.0,...,0.0,9100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [268]:
# Inspect the individual_id column (used later as the `i` key of the long reshape).
test.individual_id

3           4006
6           4031
14          4039
16          4041
17          4042
          ...   
42995    6872031
42997    6872034
42998    6872035
43001    6872038
43013    6872183
Name: individual_id, Length: 15665, dtype: int64

In [246]:
# Keep only the trailing block of columns of `psid`.
# NOTE(review): 45 is a positional magic number and the displayed result
# contains both plain and '.1'-suffixed column names — confirm this slice
# really isolates the 2021 wave.
N_TRAILING_COLUMNS = 45
df2021 = psid.iloc[:, -N_TRAILING_COLUMNS:]

In [247]:
# Inspect the trailing-column slice taken from `psid`.
df2021

Unnamed: 0,amt_of_cultural_donations,amt_of_community_donations,amt_of_env_donations,amt_of_other_donations,total_family_income,wealth_wo_equity,wealth_with_equity,weight,sequence_number,relation_to_rp,...,amt_of_other_donations.1,total_family_income.1,wealth_wo_equity.1,wealth_with_equity.1,weight.1,sequence_number.1,relation_to_rp.1,year,individual_id,wtr_donated-2021
3,0.0,0.0,0.0,0.0,11028.0,1200.0,1200.0,35.912,1,10,...,0.0,51560.0,2000.0,2000.0,23.045,81,10,2021,4006,5
6,0.0,0.0,0.0,0.0,9380.0,0.0,0.0,23.899,1,10,...,0.0,9528.0,0.0,0.0,30.877,1,10,2021,4031,5
14,0.0,0.0,0.0,0.0,31000.0,14000.0,14000.0,14.996,1,10,...,0.0,231000.0,7500.0,7500.0,14.996,1,10,2021,4039,5
16,0.0,0.0,0.0,0.0,65000.0,0.0,20000.0,14.723,2,22,...,0.0,35344.0,0.0,30000.0,14.723,2,22,2021,4041,5
17,0.0,0.0,0.0,0.0,9380.0,0.0,0.0,23.899,51,30,...,0.0,64800.0,32000.0,32000.0,2.820,1,10,2021,4042,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42995,0.0,0.0,0.0,0.0,38000.0,34000.0,34000.0,3.068,1,10,...,0.0,21726.0,-15082.0,-15082.0,3.069,1,10,2021,6872031,5
42997,0.0,0.0,0.0,0.0,38000.0,34000.0,34000.0,3.068,2,30,...,0.0,21726.0,-15082.0,-15082.0,3.069,2,30,2021,6872034,5
42998,0.0,0.0,0.0,0.0,147800.0,120700.0,200700.0,2.247,3,30,...,0.0,160413.0,63500.0,133500.0,2.247,3,30,2021,6872035,5
43001,0.0,0.0,0.0,0.0,147800.0,120700.0,200700.0,2.247,4,30,...,0.0,160413.0,63500.0,133500.0,2.247,4,30,2021,6872038,5


In [115]:
# Write the slice to the working directory without the row index.
df2021.to_csv("psid_2021.csv", index = False)

In [248]:
# Balanced-panel filter: keep only rows where, in every wave from 2001 to 2021,
# sequence_number == 1 and relation_to_rp == 10 (presumably the reference
# person — confirm code 10 against the codebook).
# The former `1 <= sequence_number <= 20` checks were implied by
# `sequence_number == 1`, so they are omitted.
in_every_wave = pd.Series(True, index = test.index)
for wave in range(2001, 2022, 2):
    suffix = '-' + str(wave)
    in_every_wave &= ((test['sequence_number' + suffix] == 1) &
                      (test['relation_to_rp' + suffix] == 10))

# .copy() makes `tutorial` independent of `test`, so later assignments do not
# trigger SettingWithCopyWarning.
tutorial = test[in_every_wave].copy()

In [249]:
# Inspect the rows that passed the every-wave filter.
tutorial

Unnamed: 0,family_interview_id-2001,psid_state_of_residence_code-2001,family_composition_change-2001,age-2001,sex-2001,number_of_children_in_fu-2001,1968_family_identifier-2001,marital_status-2001,health_status-2001,race-2001,...,total_donation_amt-2003,total_donation_amt-2005,total_donation_amt-2007,total_donation_amt-2009,total_donation_amt-2011,total_donation_amt-2013,total_donation_amt-2015,total_donation_amt-2017,total_donation_amt-2019,total_donation_amt-2021
67,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,350.0,150.0,100.0,125.0,0.0,0.0,0.0,0.0,300.0,0.0
99,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,5000.0,6700.0,8000.0,15450.0,13250.0,7100.0,15500.0,33000.0,32000.0,35000.0
114,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,0.0,0.0,180.0,150.0,125.0,0.0,0.0,100.0,0.0,0.0
126,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,3327.0,34.0,0.0,24.0,1.0,0.0,7.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42811,2219.0,4.0,0.0,67.0,1.0,0.0,6845.0,1.0,4.0,2.0,...,840.0,420.0,700.0,350.0,50.0,0.0,500.0,600.0,600.0,900.0
42866,3434.0,39.0,0.0,42.0,2.0,1.0,6853.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42878,4711.0,4.0,0.0,42.0,1.0,1.0,6854.0,2.0,3.0,2.0,...,0.0,0.0,120.0,210.0,120.0,0.0,200.0,0.0,0.0,200.0
42914,4961.0,19.0,1.0,42.0,2.0,1.0,6862.0,1.0,3.0,2.0,...,700.0,2600.0,2600.0,3500.0,4575.0,7000.0,4500.0,2300.0,4000.0,12000.0


In [264]:
# Reshape the wide panel to long format: every '<stub>-<year>' column is folded
# into a '<stub>' column, indexed by (individual_id, YEAR).
wave_stubs = [
    'family_interview_id',
    'psid_state_of_residence_code',
    'family_composition_change',
    'age',
    'sex',
    'number_of_children_in_fu',
    '1968_family_identifier',
    'marital_status',
    'health_status',
    'race',
    'religion',
    'wtr_donated',
    'wtr_donated_to_health_org',
    'amt_of_health_donations',
    'total_donation_amt',
    'total_family_income',
    'wealth_wo_equity',
    'wealth_with_equity',
    'sequence_number',
    'weight',
    'relation_to_rp',
    'year',
]
long_notna = pd.wide_to_long(tutorial, stubnames = wave_stubs,
                             i = ["individual_id"], j = "YEAR", sep = '-')

In [265]:
# Inspect the long-format frame built from `tutorial`.
long_notna

Unnamed: 0_level_0,Unnamed: 1_level_0,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,wtr_donated_to_health_org,amt_of_health_donations,total_donation_amt,total_family_income,wealth_wo_equity,wealth_with_equity,sequence_number,weight,relation_to_rp,year
individual_id,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5003,2001,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,1.0,50.0,500.0,79200.0,104000.0,154000.0,1,22.220,10,2001
6006,2001,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,1.0,250.0,5000.0,168000.0,632500.0,782500.0,1,21.894,10,2001
6170,2001,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,0.0,0.0,0.0,90200.0,19400.0,48400.0,1,26.424,10,2001
7004,2001,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,0.0,0.0,0.0,26338.0,500.0,7500.0,1,60.096,10,2001
7035,2001,3327.0,34.0,0.0,24.0,1.0,0.0,7.0,1.0,4.0,1.0,...,0.0,0.0,0.0,19716.0,0.0,0.0,1,14.282,10,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6845170,2021,5580.0,4.0,0.0,87.0,1.0,0.0,6845.0,1.0,5.0,2.0,...,0.0,0.0,900.0,20352.0,-1000.0,-1000.0,1,4.827,10,2021
6853003,2021,5931.0,39.0,0.0,62.0,2.0,0.0,6853.0,2.0,3.0,2.0,...,0.0,0.0,0.0,37600.0,250.0,950.0,1,4.889,10,2021
6854004,2021,137.0,4.0,0.0,62.0,1.0,1.0,6854.0,2.0,2.0,2.0,...,5.0,0.0,200.0,32780.0,20000.0,20000.0,1,2.112,10,2021
6862008,2021,7563.0,19.0,0.0,62.0,2.0,0.0,6862.0,4.0,3.0,2.0,...,0.0,0.0,12000.0,26000.0,-30100.0,54900.0,1,2.283,10,2021


In [269]:
# Inspect the (individual_id, YEAR) MultiIndex produced by the reshape.
long_notna.index

MultiIndex([(   5003, 2001),
            (   6006, 2001),
            (   6170, 2001),
            (   7004, 2001),
            (   7035, 2001),
            (  10003, 2001),
            (  10006, 2001),
            (  10007, 2001),
            (  10021, 2001),
            (  14004, 2001),
            ...
            (6840004, 2021),
            (6841030, 2021),
            (6845002, 2021),
            (6845005, 2021),
            (6845006, 2021),
            (6845170, 2021),
            (6853003, 2021),
            (6854004, 2021),
            (6862008, 2021),
            (6864177, 2021)],
           names=['individual_id', 'YEAR'], length=23859)

In [255]:
# Same wide-to-long reshape, applied to `tutorial_notna`: every
# '<stub>-<year>' column is folded into '<stub>', indexed by
# (individual_id, YEAR). This rebinds `long_notna`.
# NOTE(review): `tutorial_notna` is not defined in the visible cells — confirm
# where it comes from.
wave_stubs_notna = [
    'family_interview_id',
    'psid_state_of_residence_code',
    'family_composition_change',
    'age',
    'sex',
    'number_of_children_in_fu',
    '1968_family_identifier',
    'marital_status',
    'health_status',
    'race',
    'religion',
    'wtr_donated',
    'wtr_donated_to_health_org',
    'amt_of_health_donations',
    'total_donation_amt',
    'total_family_income',
    'wealth_wo_equity',
    'weight',
    'wealth_with_equity',
    'sequence_number',
    'relation_to_rp',
    'year',
]
long_notna = pd.wide_to_long(tutorial_notna, stubnames = wave_stubs_notna,
                             i = ["individual_id"], j = "YEAR", sep = '-')

In [256]:
# Inspect the long-format frame built from `tutorial_notna`.
long_notna

Unnamed: 0_level_0,Unnamed: 1_level_0,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,wtr_donated_to_health_org,amt_of_health_donations,total_donation_amt,total_family_income,wealth_wo_equity,weight,wealth_with_equity,sequence_number,relation_to_rp,year
individual_id,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5003,2001,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,1.0,50.0,500.0,79200.0,104000.0,22.220,154000.0,1,10,2001
6006,2001,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,1.0,250.0,5000.0,168000.0,632500.0,21.894,782500.0,1,10,2001
6170,2001,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,0.0,0.0,0.0,90200.0,19400.0,26.424,48400.0,1,10,2001
7004,2001,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,0.0,0.0,0.0,26338.0,500.0,60.096,7500.0,1,10,2001
7035,2001,3327.0,34.0,0.0,24.0,1.0,0.0,7.0,1.0,4.0,1.0,...,0.0,0.0,0.0,19716.0,0.0,14.282,0.0,1,10,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6845170,2021,5580.0,4.0,0.0,87.0,1.0,0.0,6845.0,1.0,5.0,2.0,...,0.0,0.0,900.0,20352.0,-1000.0,4.827,-1000.0,1,10,2021
6853003,2021,5931.0,39.0,0.0,62.0,2.0,0.0,6853.0,2.0,3.0,2.0,...,0.0,0.0,0.0,37600.0,250.0,4.889,950.0,1,10,2021
6854004,2021,137.0,4.0,0.0,62.0,1.0,1.0,6854.0,2.0,2.0,2.0,...,5.0,0.0,200.0,32780.0,20000.0,2.112,20000.0,1,10,2021
6862008,2021,7563.0,19.0,0.0,62.0,2.0,0.0,6862.0,4.0,3.0,2.0,...,0.0,0.0,12000.0,26000.0,-30100.0,2.283,54900.0,1,10,2021


In [126]:
# Write the long-format frame (index included) to the working directory.
long_notna.to_csv("long_notna.csv")

In [127]:
# Frequency of each race code across all (individual_id, YEAR) rows.
long_notna.race.value_counts()

1.0    18541
2.0     9389
7.0      660
4.0      483
5.0      293
3.0      174
9.0      143
6.0        4
0.0        2
Name: race, dtype: int64

In [129]:
import cpi

In [162]:
# Column dtypes and non-null counts of the long-format frame.
long_notna.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 29689 entries, (5003, 2001) to (6864177, 2021)
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   family_interview_id           29689 non-null  float64
 1   psid_state_of_residence_code  29689 non-null  float64
 2   family_composition_change     29689 non-null  float64
 3   age                           29689 non-null  float64
 4   sex                           29689 non-null  float64
 5   number_of_children_in_fu      29689 non-null  float64
 6   1968_family_identifier        29689 non-null  float64
 7   marital_status                29689 non-null  float64
 8   health_status                 29689 non-null  float64
 9   race                          29689 non-null  float64
 10  religion                      29689 non-null  float64
 11  wtr_donated                   29689 non-null  float64
 12  wtr_donated_to_health_org     29689 non

In [257]:
# Work on an independent copy: plain assignment would only alias long_notna,
# so the in-place inflation adjustment below would silently modify it too.
long_adj = long_notna.copy()

In [258]:
# Column dtypes and non-null counts before the inflation adjustment.
long_adj.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23859 entries, (5003, 2001) to (6864177, 2021)
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   family_interview_id           23859 non-null  float64
 1   psid_state_of_residence_code  23859 non-null  float64
 2   family_composition_change     23859 non-null  float64
 3   age                           23859 non-null  float64
 4   sex                           23859 non-null  float64
 5   number_of_children_in_fu      23859 non-null  float64
 6   1968_family_identifier        23859 non-null  float64
 7   marital_status                23859 non-null  float64
 8   health_status                 23859 non-null  float64
 9   race                          23859 non-null  float64
 10  religion                      23859 non-null  float64
 11  wtr_donated                   23859 non-null  float64
 12  wtr_donated_to_health_org     23859 non

In [259]:
# Adjust dollar amounts for inflation, expressing them in 2020 dollars.
#
# `year` is set to the wave year minus one (presumably because each wave's
# amounts refer to the previous calendar year — confirm). It is derived from
# the YEAR index level rather than decremented in place, so re-running this
# cell does not shift it a second time.
# NOTE(review): this relies on the `year` column initially equalling the YEAR
# index level, as the displayed frames show.
BASE_YEAR = 2020
ADJUSTED_FROM = {
    'health_donation_adj': 'amt_of_health_donations',
    'total_donation_adj': 'total_donation_amt',
    'family_income_adj': 'total_family_income',
    'wealth_adj': 'wealth_with_equity',
}

long_adj['year'] = long_adj.index.get_level_values('YEAR').to_numpy() - 1

# CPI adjustment is a single multiplicative factor per year, so compute it
# once per distinct year instead of calling cpi.inflate for every row.
factor_by_year = {int(amount_year): cpi.inflate(1.0, int(amount_year), to = BASE_YEAR)
                  for amount_year in long_adj['year'].unique()}
inflation_factor = long_adj['year'].map(factor_by_year)

for adjusted_col, raw_col in ADJUSTED_FROM.items():
    long_adj[adjusted_col] = long_adj[raw_col] * inflation_factor

In [263]:
# Inspect the frame with the shifted `year` and the four *_adj columns.
long_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,wealth_wo_equity,weight,wealth_with_equity,sequence_number,relation_to_rp,year,health_donation_adj,total_donation_adj,family_income_adj,wealth_adj
individual_id,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5003,2001,1448.0,41.0,0.0,38.0,1.0,2.0,5.0,1.0,2.0,1.0,...,104000.0,22.220,154000.0,1,10,2000,75.148374,751.483740,119035.024390,2.314570e+05
6006,2001,6438.0,26.0,1.0,46.0,1.0,1.0,6.0,1.0,1.0,1.0,...,632500.0,21.894,782500.0,1,10,2000,375.741870,7514.837398,252498.536585,1.176072e+06
6170,2001,263.0,12.0,1.0,56.0,1.0,0.0,6.0,1.0,1.0,1.0,...,19400.0,26.424,48400.0,1,10,2000,0.000000,0.000000,135567.666667,7.274363e+04
7004,2001,3198.0,34.0,0.0,45.0,2.0,0.0,7.0,4.0,3.0,1.0,...,500.0,60.096,7500.0,1,10,2000,0.000000,0.000000,39585.157480,1.127226e+04
7035,2001,3327.0,34.0,0.0,24.0,1.0,0.0,7.0,1.0,4.0,1.0,...,0.0,14.282,0.0,1,10,2000,0.000000,0.000000,29632.506829,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6845170,2021,5580.0,4.0,0.0,87.0,1.0,0.0,6845.0,1.0,5.0,2.0,...,-1000.0,4.827,-1000.0,1,10,2020,0.000000,900.000000,20352.000000,-1.000000e+03
6853003,2021,5931.0,39.0,0.0,62.0,2.0,0.0,6853.0,2.0,3.0,2.0,...,250.0,4.889,950.0,1,10,2020,0.000000,0.000000,37600.000000,9.500000e+02
6854004,2021,137.0,4.0,0.0,62.0,1.0,1.0,6854.0,2.0,2.0,2.0,...,20000.0,2.112,20000.0,1,10,2020,0.000000,200.000000,32780.000000,2.000000e+04
6862008,2021,7563.0,19.0,0.0,62.0,2.0,0.0,6862.0,4.0,3.0,2.0,...,-30100.0,2.283,54900.0,1,10,2020,0.000000,12000.000000,26000.000000,5.490000e+04


In [277]:
# These columns were only needed for the every-wave filter. errors='ignore'
# keeps the cell re-runnable: long_adj is rebound here, so a second run would
# otherwise raise KeyError because the columns are already gone.
long_adj = long_adj.drop(columns = ['sequence_number', 'relation_to_rp'], errors = 'ignore')

In [278]:
# Write the inflation-adjusted long-format frame (index included) to the working directory.
long_adj.to_csv("long_format_adjusted_dk_dropped.csv")

In [280]:
# Summary statistics of the unadjusted family income column.
long_adj.total_family_income.describe()

count    2.385900e+04
mean     8.140584e+04
std      9.264009e+04
min     -2.825000e+05
25%      3.120000e+04
50%      6.120000e+04
75%      1.040000e+05
max      3.316000e+06
Name: total_family_income, dtype: float64

In [279]:
# Mean of every column per (shifted) year.
# NOTE(review): coded categorical columns (sex, race, marital_status, ...) are
# averaged as numbers here; those means are not meaningful.
long_adj.groupby("year").mean()

Unnamed: 0_level_0,family_interview_id,psid_state_of_residence_code,family_composition_change,age,sex,number_of_children_in_fu,1968_family_identifier,marital_status,health_status,race,...,amt_of_health_donations,total_donation_amt,total_family_income,wealth_wo_equity,weight,wealth_with_equity,health_donation_adj,total_donation_adj,family_income_adj,wealth_adj
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,3655.303366,23.789765,0.907792,40.985708,1.2213,1.043338,2754.940526,1.793453,2.27432,1.734901,...,41.597049,1169.277086,64105.197326,113784.203781,21.625833,163492.134624,62.519012,1757.385435,96348.026859,245723.361523
2002,3549.158598,23.779622,0.363762,42.989396,1.2213,1.011987,2754.940526,1.798525,2.289995,1.737206,...,40.997234,1288.231443,66753.356846,144967.088981,22.418986,208428.900876,58.980184,1853.298877,96033.924618,299853.764673
2004,3610.375288,23.66805,0.360535,44.929461,1.2213,0.958506,2754.940526,1.801291,2.394652,1.556939,...,52.621946,1467.032273,71887.980175,154719.506685,22.350665,242081.392347,72.09708,2009.974005,98493.382939,331674.575091
2006,3684.579069,24.109267,0.328262,46.96035,1.2213,0.903181,2754.940526,1.795297,2.408483,1.551406,...,58.578147,1616.066851,77825.04887,200188.196404,23.560111,303820.268787,75.20173,2074.681934,99910.608746,390039.819371
2008,3807.449055,24.123559,0.308437,48.958967,1.2213,0.843246,2754.940526,1.819733,2.475334,1.555556,...,49.135546,1660.625634,83342.804979,245189.274781,23.557128,330526.295989,59.064759,1996.201544,100184.552466,397318.389392
2010,3896.884278,24.118488,0.310281,50.97326,1.2213,0.739972,2754.940526,1.847856,2.53527,1.555556,...,59.299677,1620.433379,79887.914707,208873.483172,25.617111,293555.865376,70.382878,1923.294857,94819.088185,348421.90572
2012,3842.662517,24.210696,0.2568,52.953896,1.2213,0.66805,2754.940526,1.844629,2.577225,1.555556,...,59.361918,1597.732596,85242.279391,214085.475795,25.740698,301118.380821,66.916023,1801.052165,96089.791421,339437.22074
2014,3754.833564,24.153527,0.282619,54.976487,1.2213,0.572153,2754.940526,1.844629,2.598432,1.555556,...,64.42047,1635.798064,87211.815583,283839.010604,29.420144,387617.466113,70.427507,1788.331866,95344.084562,423761.760029
2016,4011.644537,24.061319,0.265099,56.986169,1.2213,0.487321,2754.940526,1.860304,2.668972,1.55325,...,53.880129,1808.196865,91344.028585,294319.52651,29.577672,414365.337022,58.101514,1949.864957,98500.62449,446829.914294
2018,4341.037344,24.128631,0.228216,58.954357,1.2213,0.406639,2754.940526,1.869064,2.715998,1.55325,...,62.544952,1790.182573,92848.369295,318023.71692,33.08625,458723.372983,64.46384,1845.105639,95696.971034,472797.074096


In [288]:
# Count of each wtr_donated response code within each (shifted) year.
long_adj.groupby("year")['wtr_donated'].value_counts()# / long_adj.groupby("year")['wtr_donated'].count()

year  wtr_donated
2000  1.0            1371
      5.0             791
      0.0               5
      8.0               1
      9.0               1
2002  1.0            1409
      5.0             751
      0.0               8
      9.0               1
2004  1.0            1410
      5.0             746
      0.0              11
      8.0               1
      9.0               1
2006  1.0            1394
      5.0             762
      0.0              11
      9.0               2
2008  1.0            1402
      5.0             752
      0.0              12
      9.0               3
2010  1.0            1379
      5.0             774
      0.0              13
      9.0               2
      8.0               1
2012  1.0            1330
      5.0             822
      0.0              13
      9.0               3
      8.0               1
2014  1.0            1274
      5.0             877
      0.0              14
      8.0               3
      9.0               1
2016  1.0           

In [286]:
# Count of each wtr_donated_to_health_org response code within each (shifted) year.
long_adj.groupby("year")['wtr_donated_to_health_org'].value_counts()

year  wtr_donated_to_health_org
2000  5.0                           997
      0.0                           798
      1.0                           374
2002  5.0                          1023
      0.0                           760
      1.0                           385
      8.0                             1
2004  5.0                          1007
      0.0                           759
      1.0                           402
      8.0                             1
2006  5.0                           970
      0.0                           775
      1.0                           422
      8.0                             2
2008  5.0                           946
      0.0                           767
      1.0                           455
      9.0                             1
2010  5.0                           943
      0.0                           790
      1.0                           436
2012  5.0                           908
      0.0                           839
      1.

In [289]:
# Overall count of each wtr_donated_to_health_org response code.
long_adj['wtr_donated_to_health_org'].value_counts()

0.0    10009
5.0     9666
1.0     4174
8.0        6
9.0        4
Name: wtr_donated_to_health_org, dtype: int64

In [291]:
# Number of (individual_id, YEAR) rows in the long-format frame.
len(long_adj)

23859

In [293]:
# Sanity check: 2169 rows per year (the per-year counts above sum to 2169)
# times the 11 waves from 2001 to 2021 should equal len(long_adj).
2169*11

23859

In [294]:
# Overall count of each wtr_donated response code.
long_adj['wtr_donated'].value_counts()

1.0    14157
5.0     9553
0.0      117
9.0       23
8.0        9
Name: wtr_donated, dtype: int64