In [10]:
import pandas as pd
df = pd.read_csv("ConsumForCAPM.csv")

### Data Cleaning

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,HH_ID,STATE,HR,DISTRICT,REGION_TYPE,STRATUM,PSU_ID,MONTH_SLOT,MONTH,...,HH_NR_MS,HH_NR_FOR_COUNTRY_MS,HH_NR_FOR_STATE_MS,AGE_GROUP,OCCUPATION_GROUP,EDU_GROUP,GENDER_GROUP,SIZE_GROUP,TOT_EXP,ADJ_TOT_EXP
0,1,53877505,Jammu & Kashmir,HR 1,Anantnag,URBAN,HR 1_URBAN_S,PSU_ID_2,Apr 2014,Jan 2014,...,1.075342,1.075342,1.075342,Youngsters - dominant,Wage Labourers,Households of some literates,Female Majority,6 Members,6543,9463
1,2,43406519,Jammu & Kashmir,HR 1,Anantnag,URBAN,HR 1_URBAN_S,PSU_ID_2,Apr 2014,Jan 2014,...,1.075342,1.075342,1.075342,Others households of Grown-ups,Business & Salaried Employees,Graduates majority household,Male Majority,8-10 Members,15405,19705
2,3,65955398,Jammu & Kashmir,HR 1,Anantnag,URBAN,HR 1_URBAN_S,PSU_ID_2,Apr 2014,Jan 2014,...,1.075342,1.075342,1.075342,Grown-up - dominant,Self-employed Professionals,Households of all illiterates,Female Majority,3 Members,5510,8075
3,4,46232212,Jammu & Kashmir,HR 1,Anantnag,URBAN,HR 1_URBAN_S,PSU_ID_2,Apr 2014,Jan 2014,...,1.075342,1.075342,1.075342,Balanced households with no Seniors,Wage Labourers,Households of all illiterates,Male Dominated,4 Members,8400,11287
4,5,81759227,Jammu & Kashmir,HR 1,Anantnag,URBAN,HR 1_URBAN_S,PSU_ID_2,Apr 2014,Jan 2014,...,1.075342,1.075342,1.075342,Children - dominant,Wage Labourers,Households of all illiterates,Female Dominated,5 Members,7436,9288


In [12]:
# Print all column names
print(df.columns.tolist())

# Optionally, print one per line for readability
for col in df.columns:
    print(col)

['Unnamed: 0', 'HH_ID', 'STATE', 'HR', 'DISTRICT', 'REGION_TYPE', 'STRATUM', 'PSU_ID', 'MONTH_SLOT', 'MONTH', 'RESPONSE_STATUS', 'NR_REASON', 'FAMILY_SHIFTED', 'R_HH_WGT_MS', 'R_HH_WGT_FOR_COUNTRY_MS', 'R_HH_WGT_FOR_STATE_MS', 'HH_NR_MS', 'HH_NR_FOR_COUNTRY_MS', 'HH_NR_FOR_STATE_MS', 'AGE_GROUP', 'OCCUPATION_GROUP', 'EDU_GROUP', 'GENDER_GROUP', 'SIZE_GROUP', 'TOT_EXP', 'ADJ_TOT_EXP']
Unnamed: 0
HH_ID
STATE
HR
DISTRICT
REGION_TYPE
STRATUM
PSU_ID
MONTH_SLOT
MONTH
RESPONSE_STATUS
NR_REASON
FAMILY_SHIFTED
R_HH_WGT_MS
R_HH_WGT_FOR_COUNTRY_MS
R_HH_WGT_FOR_STATE_MS
HH_NR_MS
HH_NR_FOR_COUNTRY_MS
HH_NR_FOR_STATE_MS
AGE_GROUP
OCCUPATION_GROUP
EDU_GROUP
GENDER_GROUP
SIZE_GROUP
TOT_EXP
ADJ_TOT_EXP


In [13]:
# Specify columns to KEEP (edit this list)
keep_cols = ['MONTH_SLOT', 'R_HH_WGT_FOR_COUNTRY_MS','TOT_EXP', 'ADJ_TOT_EXP']  # using existing variable `col`; replace or extend with e.g. ['ADJ_TOT_EXP', 'TOT_EXP']

# Validate and keep only available columns
missing = [c for c in keep_cols if c not in df.columns]
if missing:
    print(f"Warning: these columns not found and will be ignored: {missing}")
keep_cols = [c for c in keep_cols if c in df.columns]

# Create a new dataframe with only the kept columns (or overwrite `df` if you prefer)
df_subset = df.loc[:, keep_cols].copy()

# If you want to replace the original df instead, uncomment the next line:
# df = df_subset

In [18]:
# sample 100 random rows (falls back to full dataframe if <100 rows)
n = 100
n = min(n, len(df_subset))
sampled = df_subset.sample(n=n, random_state=42).reset_index(drop=True)

# save to CSV
sampled.to_csv("df_subset_random100.csv", index=False)

# display a quick preview
sampled.head()

Unnamed: 0,MONTH_SLOT,R_HH_WGT_FOR_COUNTRY_MS,TOT_EXP,ADJ_TOT_EXP
0,Aug 2024,870.73799,14297,12862
1,Nov 2021,6874.446652,6599,7730
2,Jun 2023,533.976977,-99,-99
3,Jun 2014,5471.285733,5558,5056
4,Jan 2023,721.148709,10060,12010


### Data Aggregation

In [22]:
import duckdb
import pandas as pd

# Assuming your full dataset is loaded as df_subset
con = duckdb.connect()

query = """
SELECT 
    MONTH_SLOT,
    SUM(ADJ_TOT_EXP * R_HH_WGT_FOR_COUNTRY_MS) / SUM(R_HH_WGT_FOR_COUNTRY_MS) AS weighted_avg_consumption
FROM df_subset
GROUP BY MONTH_SLOT
ORDER BY MONTH_SLOT
"""

national_consumption_ts = con.execute(query).df()

print(national_consumption_ts.head())
national_consumption_ts.to_csv("national_consumption_timeseries.csv", index=False)


  MONTH_SLOT  weighted_avg_consumption
0   Apr 2014               7208.683747
1   Apr 2015               7149.305019
2   Apr 2016               8498.080296
3   Apr 2017               8213.528465
4   Apr 2018               9823.236093
