In [None]:
import os
os.chdir("/home/youhui/Git/Synth")
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd

from utils.synth import *
from utils.competitors import *

import matplotlib as mpl

# Global matplotlib look: all text is rendered through LaTeX (with amsmath
# available) in a serif face.  `plt.rcParams` is the same object as
# `mpl.rcParams`, so a single update covers every setting.
mpl.rcParams.update({
    'text.usetex': True,
    'text.latex.preamble': r'\usepackage{amsmath}',
    'font.family': 'serif',
})

# Display names of the six estimators.  Their position in this list is the
# index used for colours, line styles and the rows of the weight matrix in
# the cells below.
methods = ['SC', 'Lasso', 'Ridge', 'Elastic Net', r'$L_{\infty}$', r'$L_1 + L_{\infty}$']

# Alternative palette, kept for reference (not active):
# colors = [
#     '#E69F00',  # Orange
#     '#56B4E9',  # Sky Blue
#     '#CC79A7',  # Reddish Purple
#     '#F0E442',  # Yellow
#     '#0072B2',  # Blue
#     '#D55E00',  # Vermilion
#     '#009E73',  # Bluish Green
#     '#999999'
# ]

# Active palette: consecutive pairs of methods share one colour
# (orange, blue, green); the two members of a pair are told apart by the
# line style, which the plotting cells pick with `line_styles[i % 2]`.
colors = [shade for shade in ('#E69F00', '#0000FF', '#009E73') for _ in range(2)]

# Solid for even method indices, dashed for odd ones ('-.' was tried and dropped).
line_styles = ['-', '--']

In [None]:
# Study window in calendar years.
#   START_TIME        - first observed year.
#   INTERVENTION_TIME - first year treated as post-intervention (the data cell
#                       splits outcomes at `<= 1988` / `> 1988`).
#   STOP_TIME         - exclusive upper bound: plots use range(START_TIME, STOP_TIME).
START_TIME, INTERVENTION_TIME, STOP_TIME = 1970, 1989, 2001

In [None]:
# Load the state-by-year panel (columns used below: 'state', 'year',
# 'cigsale', 'beer', plus whatever other predictors the file carries).
df = pd.read_stata('Tobacco/synth_smoking.dta')

# ---------------------------------------------------------------------------
# Predictors
# ---------------------------------------------------------------------------
# Per-state mean of every column over 1980-1988 (both ends inclusive).  The
# averaged 'year' and 'cigsale' columns are not predictors and are dropped.
predictors_avg_df = (
    df.loc[df['year'].between(1980, 1988)]
    .groupby('state', observed=False)
    .mean()
    .drop(columns=['year', 'cigsale'])
)

# 'beer' uses a shorter 1984-1988 window; this overwrites the 1980-1988 mean.
beer_avg = (
    df.loc[df['year'].between(1984, 1988)]
    .groupby('state', observed=False)['beer']
    .mean()
)
predictors_avg_df = predictors_avg_df.assign(beer=beer_avg)

# Cigarette sales in three individual years, each as a state-indexed Series
# named after the predictor column it becomes.
cigsale1975 = df.loc[df['year'] == 1975].set_index('state')['cigsale'].rename('cigsale1975')
cigsale1980 = df.loc[df['year'] == 1980].set_index('state')['cigsale'].rename('cigsale1980')
cigsale1985 = df.loc[df['year'] == 1985].set_index('state')['cigsale'].rename('cigsale1985')
for lagged_sales in (cigsale1975, cigsale1980, cigsale1985):
    predictors_avg_df = predictors_avg_df.join(lagged_sales)

# Flip to predictors-by-state so that a state is a column, then separate
# California from every other state.
predictors_avg_df = predictors_avg_df.transpose()
california_predictors_df = predictors_avg_df.loc[:, ['California']]
non_california_predictors_df = predictors_avg_df.drop(columns='California')

# ---------------------------------------------------------------------------
# Responses
# ---------------------------------------------------------------------------
# Year-by-state matrix of cigarette sales.
response_df = df.pivot(index='year', columns='state', values='cigsale')
california_response_df = response_df.loc[:, ['California']]
non_california_response_df = response_df.drop(columns='California')

# Split at 1988: the "pre_1988" frames include 1988 itself, the "post_1988"
# frames start in 1989.
california_pre_1988_df = california_response_df.loc[california_response_df.index <= 1988]
california_post_1988_df = california_response_df.loc[california_response_df.index > 1988]
non_california_pre_1988_df = non_california_response_df.loc[non_california_response_df.index <= 1988]
non_california_post_1988_df = non_california_response_df.loc[non_california_response_df.index > 1988]

In [None]:
# Short aliases for the design matrices.
#   X*: averaged predictors, Z*: outcomes up to and including 1988,
#   Y*: outcomes after 1988.
#   Suffix 0 = all states except California, suffix 1 = California.
X0, X1 = non_california_predictors_df, california_predictors_df
Z0, Z1 = non_california_pre_1988_df, california_pre_1988_df
Y0, Y1 = non_california_post_1988_df, california_post_1988_df

In [None]:
# Figure: California's per-capita cigarette sales over the whole study window.
# `mean_outcomes` (cross-state average of the other states) is computed here
# but deliberately not drawn; the next cell adds it to the same plot.
mean_outcomes = np.vstack((Z0, Y0)).mean(axis=1)
CA_outcomes = np.vstack((Z1, Y1)).ravel()

fig, ax = plt.subplots(figsize=(5.5, 4.5))
# ax.plot(range(START_TIME, STOP_TIME), mean_outcomes, 'r--', label="Rest of the U.S.")
ax.plot(range(START_TIME, STOP_TIME), CA_outcomes, 'b-', label="California")

ax.set_xlabel('Year', fontsize=16)
ax.set_ylabel('Per-capita Cigarette Sales', fontsize=16)
# ax.legend(loc='lower left', fontsize=14)
# ax.set_title("Figure 1: Trends in per-capita cigarette sales: California vs. the rest of the United States")

# Vertical marker at the first post-intervention year, with its caption
# placed to the left of the line.
ax.axvline(INTERVENTION_TIME)
ax.text(x=INTERVENTION_TIME - 7.5, y=40, s='Passage of Proposition 99', fontsize=14)

# STOP_TIME is exclusive, so the last plotted year is STOP_TIME - 1.
ax.set_xlim([START_TIME, STOP_TIME - 1])
ax.set_ylim([0, 150])
ax.grid()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()
# fig.savefig("name", dpi=300)

In [None]:
# Figure: California against the unweighted average of all other states.
mean_outcomes = np.vstack((Z0, Y0)).mean(axis=1)  # one value per year
CA_outcomes = np.vstack((Z1, Y1)).ravel()

fig, ax = plt.subplots(figsize=(5.5, 4.5))
ax.plot(range(START_TIME, STOP_TIME), mean_outcomes, 'r--', label="Rest of the U.S.")
ax.plot(range(START_TIME, STOP_TIME), CA_outcomes, 'b-', label="California")

ax.set_xlabel('Year', fontsize=16)
ax.set_ylabel('Per-capita Cigarette Sales', fontsize=16)
ax.legend(loc='lower left', fontsize=14)
# ax.set_title("Figure 1: Trends in per-capita cigarette sales: California vs. the rest of the United States")

# Vertical marker at the first post-intervention year, captioned on its left.
ax.axvline(INTERVENTION_TIME)
ax.text(x=INTERVENTION_TIME - 7.5, y=38, s='Passage of Proposition 99', fontsize=14)

# STOP_TIME is exclusive, so the last plotted year is STOP_TIME - 1.
ax.set_xlim([START_TIME, STOP_TIME - 1])
ax.set_ylim([0, 150])
ax.grid()
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()
# fig.savefig("name", dpi=300)

In [None]:
# Problem dimensions derived from the study window and the donor pool.
T0 = INTERVENTION_TIME - START_TIME   # number of pre-intervention years
T1 = STOP_TIME - INTERVENTION_TIME    # number of post-intervention years
J = Y0.shape[1]                       # number of donor (non-California) states
n_folds = 10
test_size = 0.4                       # only needed by the hold-out variant noted below
print("T0:", T0, "T1:", T1, "J:", J)

# Outcome blocks as float64 arrays: treated (Y1_*) and donors (Y0_*),
# before (_pre) and after (_post) the intervention.
Y1_pre, Y0_pre, Y1_post, Y0_post = (
    frame.to_numpy().astype('float64') for frame in (Z1, Z0, Y1, Y0)
)

## Hyper-parameter search: best (alpha, lambda) for each penalty.
# Each entry is (penalty name, extra keyword arguments).  The calls run in
# the listed order; the two mixed penalties are tuned with alpha fixed at 0.1.
# Hold-out alternative: pass `test_size=test_size` instead of `n_folds=n_folds`.
tuning_jobs = (
    ('inf', {}),
    ('l1', {}),
    ('l2', {}),
    ('l1-inf', {'fixed_alpha': 0.1}),
    ('l1-l2', {'fixed_alpha': 0.1}),
)
tuned = {
    penalty: param_selector(Y1_pre[:, 0], Y0_pre, method=penalty, n_folds=n_folds, **extra)
    for penalty, extra in tuning_jobs
}
alpha_inf, lam_inf = tuned['inf']
alpha_l1, lam_l1 = tuned['l1']
alpha_l2, lam_l2 = tuned['l2']
alpha_l1_inf, lam_l1_inf = tuned['l1-inf']
alpha_l1_l2, lam_l1_l2 = tuned['l1-l2']

# Report the selected values (alpha is only shown for the mixed penalties).
print("L-inf method - lambda:", lam_inf)
print("L1 method - lambda:", lam_l1)
print("L2 method - lambda:", lam_l2)
print("L1-Inf method - alpha:", alpha_l1_inf, "lambda:", lam_l1_inf)
print("L1-L2 method - alpha:", alpha_l1_l2, "lambda:", lam_l1_l2)

In [None]:
# Fit the five penalised estimators on the pre-intervention data, each with
# its tuned (alpha, lambda).  The generator is consumed left to right, so the
# fits run in the order inf, l1, l2, l1-inf, l1-l2.
w_inf, w_l1, w_l2, w_l1_inf, w_l1_l2 = (
    our(Y1_pre, Y0_pre, alpha, lam, penalty)
    for alpha, lam, penalty in (
        (alpha_inf, lam_inf, 'inf'),
        (alpha_l1, lam_l1, 'l1'),
        (alpha_l2, lam_l2, 'l2'),
        (alpha_l1_inf, lam_l1_inf, 'l1-inf'),
        (alpha_l1_l2, lam_l1_l2, 'l1-l2'),
    )
)

## Classical synthetic control baseline (no tuning parameters).
w_sc = sc(Y1_pre, Y0_pre)

In [None]:
# Post-period donor matrix with a leading column of ones, so that the first
# entry of each penalised weight vector multiplies a constant (the notebook
# later reads w[0] as the intercept and w[1:] as the donor weights).
Y0_post_plus = np.hstack([np.ones((Y1_post.shape[0], 1)), Y0_post])

# Per-year effect estimate: observed California outcome minus the
# counterfactual predicted from the donors.
#
# Y1_post is a (T1, 1) column, while every product below is a length-T1
# vector (the weight vectors are handled as 1-D arrays throughout the
# notebook).  Subtracting the two directly would broadcast to a (T1, T1)
# matrix of all pairwise differences instead of one gap per year, so the
# California series is taken as a 1-D vector first.
treated_post = Y1_post[:, 0]

tau_sc = treated_post - Y0_post @ w_sc            # SC has no intercept term
tau_inf = treated_post - Y0_post_plus @ w_inf
tau_l1 = treated_post - Y0_post_plus @ w_l1
tau_l2 = treated_post - Y0_post_plus @ w_l2
tau_l1_inf = treated_post - Y0_post_plus @ w_l1_inf
tau_l1_l2 = treated_post - Y0_post_plus @ w_l1_l2

In [None]:
# Collect the per-method results row by row, in the same order as `methods`:
# SC, Lasso (l1), Ridge (l2), Elastic Net (l1-l2), L_inf, L1 + L_inf.
# For the penalised fits, entry 0 is the intercept and entries 1: are the
# donor weights; SC contributes its weights as-is and an intercept of 0.
W = np.array([w_sc] + [w[1:] for w in (w_l1, w_l2, w_l1_l2, w_inf, w_l1_inf)])
mu = np.array([0] + [w[0] for w in (w_l1, w_l2, w_l1_l2, w_inf, w_l1_inf)])
Tau = np.array([
    tau_sc,
    tau_l1,
    tau_l2,
    tau_l1_l2,
    tau_inf,
    tau_l1_inf,
])

In [None]:
# Counterfactual path for every method over the full window:
# (years x donors) @ (donors x methods) + per-method intercept.
SC_outcomes = np.vstack((Z0, Y0)) @ W.T + mu.reshape(1, len(methods))
CA_outcomes = np.vstack((Z1, Y1)).ravel()

# Post-period gap between observed and synthetic California (rows = post
# years, columns = methods), and its running mean over the post years, so
# the last row is the average effect over the whole post period.
treatment_effect = CA_outcomes[T0:].reshape(-1, 1) - SC_outcomes[T0:]
average_treatment_effect = (
    treatment_effect.cumsum(axis=0)
    / np.arange(1, treatment_effect.shape[0] + 1).reshape(-1, 1)
)

fig, ax = plt.subplots(figsize=(6.5, 4.5))
ax.plot(range(START_TIME, STOP_TIME), CA_outcomes, 'k-', label="True Sales")
# Methods sharing a colour are distinguished by alternating solid/dashed lines.
for i, method in enumerate(methods):
    ax.plot(
        range(START_TIME, STOP_TIME),
        SC_outcomes[:, i],
        color=colors[i],
        linestyle=line_styles[i % 2],
        label=method,
    )
ax.set_xlabel('Year', fontsize=16)
ax.set_ylabel('Per-capita Cigarette Sales (in Packs)', fontsize=16)
ax.legend(loc='lower left', fontsize=14)

# Vertical marker at the first post-intervention year, captioned on its left.
ax.axvline(INTERVENTION_TIME)
ax.text(x=INTERVENTION_TIME - 5.3, y=30, s='Passage of Proposition 99', fontsize=15)

# STOP_TIME is exclusive, so the last plotted year is STOP_TIME - 1.
ax.set_xlim([START_TIME, STOP_TIME - 1])
ax.set_ylim([0, 140])
ax.grid()
ax.tick_params(axis='both', labelsize=14)
fig.tight_layout()
plt.show()
fig.savefig("Tobacco/tobacco.png")
plt.close(fig)

In [None]:
# Two-panel summary figure: yearly effect paths on the left (wider panel) and
# the average effect per method on the right.  The sales-trajectory plot is
# not part of this figure; it is written separately to Tobacco/tobacco.png.
import matplotlib.gridspec as gridspec

fig = plt.figure(figsize=(14, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[1.5, 1])

# --- Left panel: estimated effect for each post-intervention year -----------
ax1 = fig.add_subplot(gs[0])
for i, method in enumerate(methods):
    # Methods sharing a colour alternate between solid and dashed lines.
    ax1.plot(
        range(INTERVENTION_TIME, STOP_TIME),
        treatment_effect[:, i],
        color=colors[i],
        linestyle=line_styles[i % 2],
        label=method,
        linewidth=1.5,
    )
ax1.axhline(0, color='black', linestyle='-', linewidth=1.0)  # zero-effect reference line
ax1.set_xlabel('Year', fontsize=16)
ax1.set_ylabel('Policy Effect', fontsize=16)
ax1.set_xlim([INTERVENTION_TIME, STOP_TIME - 1])
ax1.legend(loc='upper right', fontsize=14)
ax1.grid()
ax1.tick_params(axis='both', labelsize=14)
plt.tight_layout()

# --- Right panel: average effect over the whole post period -----------------
# The last row of the running mean is the full post-period average.  Labels,
# values and colours are all reversed together so the first method ends up
# as the top bar.
ax2 = fig.add_subplot(gs[1])
ax2.barh(
    list(reversed(methods)),
    average_treatment_effect[-1][::-1],
    color=list(reversed(colors[:len(methods)])),
)
ax2.set_xlabel('Average Treatment Effect', fontsize=16)
ax2.tick_params(axis='both', labelsize=14)
# plt.xticks(rotation=45)
plt.tight_layout()

# Show, then write the combined figure to disk.
plt.show()
fig.savefig('Tobacco/tobacco_combined.png', dpi=500)
plt.close()

In [None]:
# Donor weights whose magnitude is below this cutoff are displayed as exactly 0.
threshold = 1e-3

# Build the (methods x donors) weight matrix for display.  The thresholding
# is applied to *copies* of the donor weights (entries 1: of each penalised
# fit): the fitted vectors w_inf, w_l1, ... are left untouched, so cells that
# read them (including their intercept w[0]) give the same result no matter
# whether they are run before or after this one.  The SC weights are used as
# returned, without thresholding.
W = np.array(
    [w_sc]
    + [
        np.where(np.abs(w[1:]) < threshold, 0.0, w[1:])
        for w in (w_l1, w_l2, w_l1_l2, w_inf, w_l1_inf)
    ]
)

# One row per donor state, one column per method; states in descending
# order of their label.
data = W.T
df2 = pd.DataFrame(data, index=Z0.columns, columns=methods)
df2 = df2.sort_index(ascending=False)

# One horizontal bar chart per method, side by side, sharing the state axis.
ncols = len(methods)
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(10, 6), sharey=True)

# Largest absolute weight over all methods: every panel gets the same
# symmetric x-range so bar lengths are comparable across panels.
max_abs_value = df2.abs().max().max()

for ax, method in zip(axes, methods):
    ax.set_xlabel(method, fontsize=16)
    ax.set_xlim([-max_abs_value, max_abs_value])
    ax.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.barh(df2.index, df2[method])
    ax.set_yticks(np.arange(len(df2.index)))
    ax.set_yticklabels(df2.index, fontsize=10)

# The y-axis is shared, so only the leftmost panel carries the axis title.
axes[0].set_ylabel('States', fontsize=17)

# rect leaves the top 5% of the figure free (no suptitle is currently drawn).
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.show()
fig.savefig("Tobacco/tobacco_weights.png", dpi=500)