In [23]:
# Time-series correlation ranking: rank each supplementary variable by the R^2 of an OLS fit against truth_value

import os
import pandas as pd
import statsmodels.api as sm

# Input locations -- edit these to point at your local copies of the data
truth_data_path = 'proccesed_data/unique_truth_main_dataset.csv'
datasets_folder_path = 'raw_datasets/supplementary_dataset_TS'

# Read the truth table, rename its key columns to 'location_key' / 'date'
# (the names used as merge keys later in this cell), and prefix every
# location key with 'US_'.
truth_data = (
    pd.read_csv(truth_data_path)
    .rename(columns={'abbreviation': 'location_key', 'target_end_date': 'date'})
    .assign(location_key=lambda frame: 'US_' + frame['location_key'])
)

# Accumulates one record per (file, variable) regression
results = []

# Iterate over each dataset in the folder
for filename in os.listdir(datasets_folder_path):
    if filename.endswith('.csv'):
        dataset_path = os.path.join(datasets_folder_path, filename)
        data = pd.read_csv(dataset_path)
        
        print(f"Processing {dataset_path}...")
        
        if dataset_path == 'raw_datasets/supplementary_dataset_TS/Global_vaccination_search_insights.csv':
            data.rename(columns={'sub_region_1_code': 'location_key'}, inplace=True)
        
        # Merge with truth_data on date and location
        merged_data = pd.merge(truth_data, data, on=['date', 'location_key'])
        
        for column in data.columns:
            if column not in ['date', 'location_key']:
                # Drop rows where the current column or truth_value has NaN
                clean_merged_data = merged_data.dropna(subset=[column, 'truth_value'])

                # Check if there's still data left after dropping NaNs
                if not clean_merged_data.empty:
                    # Check if the data is numeric
                    if pd.api.types.is_numeric_dtype(clean_merged_data[column]):
                        # Prepare data for regression
                        X = sm.add_constant(clean_merged_data[column])  # Add constant
                        y = clean_merged_data['truth_value']

                        # Run linear regression
                        model = sm.OLS(y, X).fit()

                        # Record the result
                        results.append({
                            'filename': filename,
                            'variable': column,
                            'correlation': model.rsquared
                        })

# Build the ranking table from the collected records. The columns are given
# explicitly so that an empty `results` list still produces a frame with a
# 'correlation' column; without them sort_values raises KeyError.
results_df = pd.DataFrame(results, columns=['filename', 'variable', 'correlation'])

# Highest value first (sort_values places NaN entries last by default)
results_df = results_df.sort_values(by='correlation', ascending=False)

# Persist the full ranking
results_df.to_csv('proccesed_data/OLS_correlation_ranking.csv', index=False)

# Show the top of the ranking as the cell output
results_df.head()

Processing raw_datasets/supplementary_dataset_TS/epidemiology.csv...
Processing raw_datasets/supplementary_dataset_TS/by-sex.csv...
Processing raw_datasets/supplementary_dataset_TS/mobility.csv...
Processing raw_datasets/supplementary_dataset_TS/lawatlas-emergency-declarations.csv...
Processing raw_datasets/supplementary_dataset_TS/google-search-trends.csv...


  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS/by-age.csv...
Processing raw_datasets/supplementary_dataset_TS/weather.csv...
Processing raw_datasets/supplementary_dataset_TS/vaccinations.csv...


  data = pd.read_csv(dataset_path)


Processing raw_datasets/supplementary_dataset_TS/Global_vaccination_search_insights.csv...
Processing raw_datasets/supplementary_dataset_TS/oxford-government-response.csv...
Processing raw_datasets/supplementary_dataset_TS/hospitalizations.csv...


Unnamed: 0,filename,variable,correlation
3,epidemiology.csv,new_tested,0.10742
656,hospitalizations.csv,new_hospitalized_patients,0.091322
658,hospitalizations.csv,current_hospitalized_patients,0.086488
6,epidemiology.csv,cumulative_recovered,0.080359
7,epidemiology.csv,cumulative_tested,0.077731
