## Preprocessing

In [4]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
# import tensorflow as tf
from scipy.stats import linregress
import numpy as np

In [6]:
# Import and read the US GDP dataset (1790-2023). header=1 tells pandas to
# take the column names from the second line of the file.
application_df = pd.read_csv(
    "https://raw.githubusercontent.com/Bgrlgymnast/the-a-team/main/USGDP_1790-2023.csv",
    header=1
)

# Fit only on 1790 onward: the 1787-1789 rows are the ones being back-filled
# (noted in the original as holding zero values), and zeros cannot be
# log-transformed.
filtered_df = application_df[application_df['Year'] >= 1790]

# Columns to be filled using logarithmic regression
columns_to_fill = ['Nominal GDP (million of Dollars)',
                   'Real GDP (millions of 2017 dollars)',
                   'GDP Deflator (index 2017=100)',
                   'Population',
                   'Nominal GDP per capita (current dollars)',
                   'Real GDP per capita (year 2017 dollars)']

# Loop-invariant pieces: the regressor and the row mask for each target year.
x = filtered_df['Year']
rows_for_year = {
    year: application_df['Year'] == year for year in range(1787, 1790)
}

# Fit log(y) = intercept + slope * year for each column and extrapolate
# backwards to fill in 1787-1789.
for column in columns_to_fill:
    # Strip thousands separators (if the column was read as text) and convert.
    y = filtered_df[column].replace(',', '', regex=True).astype(float)

    # log() is only defined for positive values; a single NaN or non-positive
    # entry would otherwise turn the whole fit (and every prediction) into NaN.
    usable = np.isfinite(y) & (y > 0)
    if usable.sum() < 2:
        # Not enough valid points to fit a line; leave this column untouched.
        continue

    # Apply log transformation
    log_y = np.log(y[usable])

    # Perform linear regression on log-transformed data
    slope, intercept, _, _, _ = linregress(x[usable], log_y)

    # Predict the values for 1787-1789 using the log model
    for year, row_mask in rows_for_year.items():
        log_predicted_value = intercept + slope * year
        predicted_value = np.exp(log_predicted_value)  # Convert back from log
        application_df.loc[row_mask, column] = round(predicted_value, 2)

# Save the processed DataFrame to a new CSV file
output_filename = "GDP_logregressed.csv"
application_df.to_csv(output_filename, index=False)

output_filename

'GDP_logregressed.csv'