## Preprocessing

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from keras_tuner import RandomSearch
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Input


In [3]:
pres_gdp = pd.read_csv("presidentfinal_df.csv")
pres_bday = pd.read_csv("birthdates-of-us-presidents.csv")
pres_bday=pres_bday.drop("csvbase_row_id",axis=1)
pres_bday = pres_bday.rename(columns={"Date of birth":"birthday","Name":"name"})
pres_gdp=pres_gdp.rename(columns={  'Year':'year',
                                    'President':'president',
                                    'Party':'party',
                                    'Nominal GDP (million of Dollars)':"nominal_gdp(million_of_dollars)",
                                    'Real GDP (millions of 2017 dollars)':'real_gdp(millions_of_2017_dollars)',
                                    'GDP Deflator (index 2017=100)':'gdp_deflator_(index_2017=100)',
                                    'Population':'population',
                                    'Nominal GDP per capita (current dollars)':'nominal_gdp_per_capita_(current_dollars)',
                                    'Real GDP per capita (year 2017 dollars)':'real_gdp_per_capita_(year_2017_dollars)'
})


In [4]:
pres_gdp.head(10)

Unnamed: 0,year,president,party,nominal_gdp(million_of_dollars),real_gdp(millions_of_2017_dollars),gdp_deflator_(index_2017=100),population,nominal_gdp_per_capita_(current_dollars),real_gdp_per_capita_(year_2017_dollars)
0,1789,George Washington,,138.66,6894.01,2.01,6698461.39,20.7,1029.19
1,1790,George Washington,,193.0,4975.0,3.88,3929000.0,49.1,1266.23
2,1791,George Washington,,210.0,5274.0,3.98,4048000.0,51.89,1302.85
3,1792,George Washington,,230.0,5663.0,4.05,4171000.0,55.03,1357.63
4,1793,George Washington,,256.0,6113.0,4.19,4297000.0,59.67,1422.52
5,1794,George Washington,,321.0,6905.0,4.65,4428000.0,72.56,1559.46
6,1795,George Washington,,390.0,7331.0,5.32,4562000.0,85.53,1607.03
7,1796,George Washington,,423.0,7553.0,5.6,4700000.0,90.06,1606.92
8,1797,John Adams,Federalist,415.0,7692.0,5.39,4843000.0,85.65,1588.26
9,1798,John Adams,Federalist,418.0,8009.0,5.22,4990000.0,83.73,1605.09


In [5]:
pres_bday = pres_bday.rename(columns={"name" : "president"})
pres_bday.head()

Unnamed: 0,president,birthday
0,George Washington,1732-02-22
1,John Adams,1735-10-30
2,Thomas Jefferson,1743-04-13
3,James Madison,1751-03-16
4,James Monroe,1758-04-28


In [6]:
joined_df = pres_bday.merge(pres_gdp, on="president", how="left")

joined_df['party'].fillna("none", inplace=True)

joined_df.fillna(0, inplace=True)

joined_df.head()

Unnamed: 0,president,birthday,year,party,nominal_gdp(million_of_dollars),real_gdp(millions_of_2017_dollars),gdp_deflator_(index_2017=100),population,nominal_gdp_per_capita_(current_dollars),real_gdp_per_capita_(year_2017_dollars)
0,George Washington,1732-02-22,1789.0,none,138.66,6894.01,2.01,6698461.39,20.7,1029.19
1,George Washington,1732-02-22,1790.0,none,193.0,4975.0,3.88,3929000.0,49.1,1266.23
2,George Washington,1732-02-22,1791.0,none,210.0,5274.0,3.98,4048000.0,51.89,1302.85
3,George Washington,1732-02-22,1792.0,none,230.0,5663.0,4.05,4171000.0,55.03,1357.63
4,George Washington,1732-02-22,1793.0,none,256.0,6113.0,4.19,4297000.0,59.67,1422.52


In [7]:
base_df = joined_df.copy()

base_df= base_df[['president', 'birthday','year','party','nominal_gdp(million_of_dollars)','nominal_gdp_per_capita_(current_dollars)']]

base_df['nominal_gdp(million_of_dollars)']= base_df['nominal_gdp(million_of_dollars)'].str.replace(',', '').astype(float)
base_df['nominal_gdp_per_capita_(current_dollars)']= base_df['nominal_gdp_per_capita_(current_dollars)'].str.replace(',', '').astype(float)
base_df.head()



Unnamed: 0,president,birthday,year,party,nominal_gdp(million_of_dollars),nominal_gdp_per_capita_(current_dollars)
0,George Washington,1732-02-22,1789.0,none,138.66,20.7
1,George Washington,1732-02-22,1790.0,none,193.0,49.1
2,George Washington,1732-02-22,1791.0,none,210.0,51.89
3,George Washington,1732-02-22,1792.0,none,230.0,55.03
4,George Washington,1732-02-22,1793.0,none,256.0,59.67


In [8]:
drop_df = base_df.drop(columns=["president",'party'])
drop_df.head()

Unnamed: 0,birthday,year,nominal_gdp(million_of_dollars),nominal_gdp_per_capita_(current_dollars)
0,1732-02-22,1789.0,138.66,20.7
1,1732-02-22,1790.0,193.0,49.1
2,1732-02-22,1791.0,210.0,51.89
3,1732-02-22,1792.0,230.0,55.03
4,1732-02-22,1793.0,256.0,59.67


In [9]:
# Convert categorical data to numeric with `pd.get_dummies` --- creates booleans of the categories
dummy_df1 = base_df[['president','party']]

dummy_df1.index=base_df['year']

dummy_df2 = pd.get_dummies(dummy_df1)

dummy_df2.head()

Unnamed: 0_level_0,president_Abraham Lincoln,president_Andrew Jackson,president_Andrew Johnson,president_Barack Obama,president_Benjamin Harrison,president_Bill Clinton,president_Calvin Coolidge,president_Chester A. Arthur,president_Donald Trump,president_Dwight D. Eisenhower,...,president_William McKinley,president_Woodrow Wilson,president_Zachary Taylor,party_Democrat,party_Democratic-Republican,party_Federalist,party_National Union,party_Republican,party_Whig,party_none
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1789.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1790.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1791.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1792.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1793.0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [10]:
dummy_df3 = pd.merge(drop_df, dummy_df2, on="year")

dummy_df3['birthday'] = pd.to_datetime(dummy_df3['birthday'])
# dummy_df3['birthday'] = dummy_df3['birthday'].astype(str)
# dummy_df3['birthday']= dummy_df3['birthday'].apply(lambda x: x.encode('utf-8'))
# dummy_df3['birthday']= dummy_df3['birthday'].apply(lambda x: x.decode('utf-8'))
dummy_df3['birthday'] = dummy_df3['birthday'].view('int64') // 10**9
# dummy_df3['birthday'] = dummy_df3['birthday'].astype(int)
dummy_df3.head()
dummy_df3.dtypes

birthday                                      int64
year                                        float64
nominal_gdp(million_of_dollars)             float64
nominal_gdp_per_capita_(current_dollars)    float64
president_Abraham Lincoln                      bool
president_Andrew Jackson                       bool
president_Andrew Johnson                       bool
president_Barack Obama                         bool
president_Benjamin Harrison                    bool
president_Bill Clinton                         bool
president_Calvin Coolidge                      bool
president_Chester A. Arthur                    bool
president_Donald Trump                         bool
president_Dwight D. Eisenhower                 bool
president_Franklin D. Roosevelt                bool
president_Franklin Pierce                      bool
president_George H. W. Bush                    bool
president_George W. Bush                       bool
president_George Washington                    bool
president_Ge

In [11]:
dummy_df3.shape

(253, 56)

In [12]:
# # lets split x and y for the modeling
# X = dummy_df3['birthday']
# X= pd.DataFrame(X)


In [13]:
# y= dummy_df3.drop(columns=['birthday','year'])
# y

In [14]:
y=dummy_df3["birthday"].values.reshape(-1,1)
X=dummy_df3.loc[:, (dummy_df3.columns != "birthday") & (dummy_df3.columns != "year")]

X

Unnamed: 0,nominal_gdp(million_of_dollars),nominal_gdp_per_capita_(current_dollars),president_Abraham Lincoln,president_Andrew Jackson,president_Andrew Johnson,president_Barack Obama,president_Benjamin Harrison,president_Bill Clinton,president_Calvin Coolidge,president_Chester A. Arthur,...,president_William McKinley,president_Woodrow Wilson,president_Zachary Taylor,party_Democrat,party_Democratic-Republican,party_Federalist,party_National Union,party_Republican,party_Whig,party_none
0,138.66,20.70,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,193.00,49.10,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,210.00,51.89,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,230.00,55.03,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,256.00,59.67,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,21521395.00,65115.12,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
249,21322950.00,64266.79,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
250,23594031.00,70991.30,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
251,25744108.00,77171.74,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [15]:
X.shape

(253, 54)

In [16]:
#train_size = int(len(df) * 0.8) X_train, X_test = X[:train_size], X[train_size:] y_train, y_test = y[:train_size], y[train_size:]

# Split the preprocessed data into a training and testing dataset -- choose to stratify so that the classes are more balanced out bc it's a small dataset and imbalanced
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2, random_state=42)

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [18]:

# Define a function that builds the model
def build_model(hp):
    nn = tf.keras.models.Sequential()

    # Input layer
    nn.add(Input(shape=(input_shape,)))

    # First hidden layer
    hp_units_1 = hp.Int('units_1', min_value=32, max_value=512, step=32)
    nn.add(Dense(units=hp_units_1, activation='relu'))

    # Optional second hidden layer (conditional on adding a second layer)
    if hp.Boolean('use_second_layer'):
        hp_units_2 = hp.Int('units_2', min_value=32, max_value=512, step=32)
        nn.add(Dense(units=hp_units_2, activation='relu'))

    # Dropout layer
    hp_dropout = hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)
    nn.add(Dropout(rate=hp_dropout))

    # Output layer
    nn.add(Dense(units=1, activation='sigmoid'))

    # Compile the model with an optimizer hyperparameter
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    nn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
               loss='binary_crossentropy',
               metrics=['accuracy'])

    return nn

In [19]:
# Initialize the Keras Tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='my_dir',
    project_name='keras_tuning_example'
)

NameError: name 'input_shape' is not defined

In [None]:
# Run the hyperparameter search
tuner.search(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test))

In [None]:
# Retrieve the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Summary of the best model
best_model.summary()

In [None]:
# Evaluate the best model using the test data
best_model_loss, best_model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Best Model Loss: {best_model_loss}, Best Model Accuracy: {best_model_accuracy}")

In [None]:
# Export our model to HDF5 file
best_model.save('model2caite.h5')