In [2]:
# time seris correlation ranking

import os
import pandas as pd
import statsmodels.api as sm

# Adjust these paths to where your datasets are located
truth_data_path = 'proccesed_data/unique_truth_main_dataset.csv'
datasets_folder_path = 'raw_datasets/supplementary_dataset_TS'

# Load the truth data
truth_data = pd.read_csv(truth_data_path)
truth_data.rename(columns={'abbreviation': 'location_key', 'target_end_date': 'date'}, inplace=True)
truth_data['location_key'] = 'US_' + truth_data['location_key']

# Store results
results = []

# Iterate over each dataset in the folder
for filename in os.listdir(datasets_folder_path):
    if filename.endswith('.csv'):
        dataset_path = os.path.join(datasets_folder_path, filename)
        data = pd.read_csv(dataset_path)
        
        print(f"Processing {dataset_path}...")
        
        # Merge with truth_data on date and location
        merged_data = pd.merge(truth_data, data, on=['date', 'location_key'])
        
        for column in data.columns:
            if column not in ['date', 'location_key']:
                # Drop rows where the current column or truth_value has NaN
                clean_merged_data = merged_data.dropna(subset=[column, 'truth_value'])

                # Check if there's still data left after dropping NaNs
                if not clean_merged_data.empty:
                    # Check if the data is numeric
                    if pd.api.types.is_numeric_dtype(clean_merged_data[column]):
                        # Prepare data for regression
                        X = sm.add_constant(clean_merged_data[column])  # Add constant
                        y = clean_merged_data['truth_value']

                        # Run linear regression
                        model = sm.OLS(y, X).fit()

                        # Record the result
                        results.append({
                            'filename': filename,
                            'variable': column,
                            'correlation': model.rsquared
                        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort results by correlation in descending order
results_df = results_df.sort_values(by='correlation', ascending=False)

results_df.to_csv('proccesed_data/OLS_correlation_ranking.csv', index=False)

# Output the sorted results
results_df.head()

  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS\by-age.csv...
Processing raw_datasets/supplementary_dataset_TS\by-sex.csv...
Processing raw_datasets/supplementary_dataset_TS\epidemiology.csv...


  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS\Global_vaccination_search_insights.csv...
Processing raw_datasets/supplementary_dataset_TS\google-search-trends.csv...
Processing raw_datasets/supplementary_dataset_TS\hospitalizations.csv...
Processing raw_datasets/supplementary_dataset_TS\lawatlas-emergency-declarations.csv...
Processing raw_datasets/supplementary_dataset_TS\mobility.csv...
Processing raw_datasets/supplementary_dataset_TS\oxford-government-response.csv...
Processing raw_datasets/supplementary_dataset_TS\vaccinations.csv...
Processing raw_datasets/supplementary_dataset_TS\weather.csv...


Unnamed: 0,filename,variable,correlation
87,epidemiology.csv,new_tested,0.10742
513,hospitalizations.csv,new_hospitalized_patients,0.091322
515,hospitalizations.csv,current_hospitalized_patients,0.086488
90,epidemiology.csv,cumulative_recovered,0.080359
91,epidemiology.csv,cumulative_tested,0.077731


In [4]:
# ranked data merging

import os
import pandas as pd

# Load the truth data
truth_data_path = 'proccesed_data/unique_truth_main_dataset.csv'
datasets_folder_path = 'raw_datasets/supplementary_dataset_TS'
truth_data = pd.read_csv(truth_data_path)
truth_data.rename(columns={'abbreviation': 'location_key', 'target_end_date': 'date'}, inplace=True)
truth_data['location_key'] = 'US_' + truth_data['location_key']

# Load the correlation data
correlation_data_path = 'proccesed_data\OLS_correlation_ranking.csv'
correlation_data = pd.read_csv(correlation_data_path)

# Filter variables with R^2 greater than 0.04
selected_variables = correlation_data[correlation_data['correlation'] > 0.04]

# Iterate over each dataset in the folder
for filename in os.listdir(datasets_folder_path):
    if filename.endswith('.csv'):
        dataset_path = os.path.join(datasets_folder_path, filename)
        data = pd.read_csv(dataset_path)
        
        print(f"Processing {dataset_path}...")
        
        # Process only selected variables for the current dataset
        for _, row in selected_variables.iterrows():
            if row['filename'] == filename:
                column = row['variable']
                
                # Verify if the column exists in the current dataset
                if column in data.columns:
                    # Create a temporary DataFrame to hold the current column's data
                    temp_data = data[['date', 'location_key', column]].dropna()

                    # Merge the temporary DataFrame with the truth_data DataFrame
                    truth_data = pd.merge(truth_data, temp_data, on=['date', 'location_key'], how='left', suffixes=('', '_temp'))

                    # Check if the merged column exists (avoid overwriting existing data)
                    if f'{column}_temp' in truth_data.columns:
                        # If the column already exists in truth_data, we combine the original and new columns
                        truth_data[column] = truth_data[column].combine_first(truth_data[f'{column}_temp'])
                        # Drop the temporary column after combining
                        truth_data.drop(columns=[f'{column}_temp'], inplace=True)

# After processing all files, save the updated truth_data DataFrame
truth_data.to_csv('proccesed_data/ranked_merged_dataset.csv', index=False)

truth_data.head()

  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS\by-age.csv...
Processing raw_datasets/supplementary_dataset_TS\by-sex.csv...
Processing raw_datasets/supplementary_dataset_TS\epidemiology.csv...


  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS\Global_vaccination_search_insights.csv...
Processing raw_datasets/supplementary_dataset_TS\google-search-trends.csv...
Processing raw_datasets/supplementary_dataset_TS\hospitalizations.csv...
Processing raw_datasets/supplementary_dataset_TS\lawatlas-emergency-declarations.csv...
Processing raw_datasets/supplementary_dataset_TS\mobility.csv...
Processing raw_datasets/supplementary_dataset_TS\oxford-government-response.csv...
Processing raw_datasets/supplementary_dataset_TS\vaccinations.csv...
Processing raw_datasets/supplementary_dataset_TS\weather.csv...


Unnamed: 0,date,location_name,truth_value,location_key,cumulative_confirmed_age_3,new_tested_age_0,cumulative_confirmed_age_2,new_deceased_male,new_deceased_female,new_tested,...,cumulative_hospitalized_patients,cumulative_vaccine_doses_administered_pfizer,cumulative_vaccine_doses_administered_moderna,cumulative_vaccine_doses_administered,cumulative_vaccine_doses_administered_janssen,cumulative_persons_fully_vaccinated_pfizer,cumulative_persons_fully_vaccinated_janssen,cumulative_persons_vaccinated,cumulative_persons_fully_vaccinated_moderna,cumulative_persons_fully_vaccinated
0,2020-03-28,Alabama,0,US_AL,,,,,,109.0,...,,,,,,,,,,
1,2020-03-28,Alaska,0,US_AK,,,,,,413.0,...,13.0,,,,,,,,,
2,2020-03-28,Arizona,0,US_AZ,,,,,,2407.0,...,386.0,,,,,,,,,
3,2020-03-28,Arkansas,2,US_AR,,,,,,1416.0,...,,,,,,,,,,
4,2020-03-28,California,18,US_CA,,,,,,3933.0,...,,,,,,,,,,


In [7]:
import pandas as pd
import numpy as np

results_df = pd.read_csv('proccesed_data/ranked_merged_dataset.csv')

missing_value_percentages = results_df.select_dtypes(include=[np.number]).isnull().mean().sort_values() * 100

missing_value_percentages.to_csv('proccesed_data/missing_value_percentages.csv', header=False)

print(missing_value_percentages)

truth_value                                       0.000000
cumulative_deceased                               0.000000
new_confirmed                                     0.000000
cumulative_confirmed                              0.000000
new_hospitalized_patients                         0.349740
current_hospitalized_patients                     3.588388
cumulative_hospitalized_patients                  6.519947
current_intensive_care_patients                   9.522591
cumulative_vaccine_doses_administered            42.261651
cumulative_persons_vaccinated                    42.520401
cumulative_persons_fully_vaccinated              42.679633
new_tested                                       48.906708
cumulative_tested                                48.909551
cumulative_persons_fully_vaccinated_janssen      49.509511
cumulative_persons_fully_vaccinated_pfizer       49.509511
cumulative_vaccine_doses_administered_moderna    49.509511
cumulative_vaccine_doses_administered_janssen    49.5095

In [39]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
final_dataset = pd.read_csv('proccesed_data/ranked_merged_dataset.csv')
final_dataset.drop(columns=['location_name'], inplace=True)

# Load correlation data to filter variables
selected_variables = [
    'cumulative_deceased',
    'new_confirmed',
    'cumulative_confirmed',
    'new_hospitalized_patients',
    'current_hospitalized_patients',
    'cumulative_hospitalized_patients',
    'current_intensive_care_patients',
    'cumulative_vaccine_doses_administered',
    'cumulative_persons_vaccinated',
    'cumulative_persons_fully_vaccinated',
    'new_tested',
    'cumulative_tested',
    'cumulative_recovered',
    'new_recovered',
    'location_key_US_AK', 'location_key_US_AL', 'location_key_US_AR', 'location_key_US_AZ', 'location_key_US_CA', 'location_key_US_CO', 'location_key_US_CT', 'location_key_US_DE', 'location_key_US_FL', 'location_key_US_GA', 'location_key_US_HI', 'location_key_US_IA', 'location_key_US_ID', 'location_key_US_IL', 'location_key_US_IN', 'location_key_US_KS', 'location_key_US_KY', 'location_key_US_LA', 'location_key_US_MA', 'location_key_US_MD', 'location_key_US_ME', 'location_key_US_MI', 'location_key_US_MN', 'location_key_US_MO', 'location_key_US_MS', 'location_key_US_MT', 'location_key_US_NC', 'location_key_US_ND', 'location_key_US_NE', 'location_key_US_NH', 'location_key_US_NJ', 'location_key_US_NM', 'location_key_US_NV', 'location_key_US_NY', 'location_key_US_OH', 'location_key_US_OK', 'location_key_US_OR', 'location_key_US_PA', 'location_key_US_PR', 'location_key_US_RI', 'location_key_US_SC', 'location_key_US_SD', 'location_key_US_TN', 'location_key_US_TX', 'location_key_US_UT', 'location_key_US_VA', 'location_key_US_VT', 'location_key_US_WA', 'location_key_US_WI', 'location_key_US_WV', 'location_key_US_WY'
]

pre_selected_variables = [
    'cumulative_deceased',
    'new_confirmed',
    'cumulative_confirmed',
    'new_hospitalized_patients',
    'current_hospitalized_patients',
    'cumulative_hospitalized_patients',
    'current_intensive_care_patients',
    'cumulative_vaccine_doses_administered',
    'cumulative_persons_vaccinated',
    'cumulative_persons_fully_vaccinated',
    'new_tested',
    'cumulative_tested',
    'cumulative_recovered',
    'new_recovered'
]

# Impute missing values for numerical columns
for column in pre_selected_variables:
    if final_dataset[column].dtype != 'object':  # If column is numerical
        final_dataset[column].fillna(final_dataset[column].mean(), inplace=True)
        
X = X.apply(pd.to_numeric, errors='coerce')

X.fillna(X.mean(), inplace=True)

y = pd.to_numeric(y, errors='coerce').fillna(y.mean())

# Create dummy variables for 'location_key'
final_dataset = pd.get_dummies(final_dataset, columns=['location_key'], dtype=int)

# Define independent variables (X) and dependent variable (y)
# Ensure 'location_key' dummies are included in X
X = final_dataset[[col for col in final_dataset.columns if col in selected_variables or 'location_key_' in col]]
y = final_dataset['truth_value']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Use sm.OLS to perform the regression and fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:            truth_value   R-squared:                       0.127
Model:                            OLS   Adj. R-squared:                  0.125
Method:                 Least Squares   F-statistic:                     73.80
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        18:01:12   Log-Likelihood:            -3.8569e+05
No. Observations:               35169   AIC:                         7.715e+05
Df Residuals:                   35099   BIC:                         7.721e+05
Df Model:                          69                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------