In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.decomposition import PCA
from Create_Feature import *

In [2]:
# Load the two prepared CSV inputs in order: the normalized training data first,
# then the normalized test data
full_train, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Prepared_Data/Normalized_Missing_Value_Filled.csv',
        'Prepared_Data/Test_data_normalized.csv',
    )
)

# Combine Columns with PCA

In [3]:
# Candidate columns: every training column whose name begins with 'WS'
corr_columns = list(filter(lambda name: name.startswith('WS'), full_train.columns))

# Correlation dictionary of the candidates against the target column 'CF'
# (presumably keyed by column name; see target_correlation_rank in Create_Feature)
corr_dict = target_correlation_rank(full_train, 'CF', corr_columns)

# Split into the variables used on their own and the variables to be combined
(use_variable, combine_variable) = variable_to_use(corr_dict)

# Fit the PCA over the to-be-combined columns; returns the fitted object and the
# number of components. NOTE(review): the meaning of the 1 and 0.7 arguments is
# defined by create_PCA in Create_Feature - confirm there.
pca, pca_component = create_PCA(full_train, 1, [*combine_variable.keys()], 0.7)

In [4]:
# Columns fed to the low-correlation PCA, built the same way as the list that was
# passed to create_PCA so fit and transform see the same columns in the same order
low_corr_columns = list(combine_variable.keys())

# New training data: start from the timestamp and the target
new_training = full_train[['DATETIME', 'CF']].copy()
# Create calendar features (return value unused; presumably adds columns in place - confirm)
create_calendar_feature(new_training, 'DATETIME')
# Project the training rows once, then store one column per PCA component
# (previously the full transform was recomputed on every loop iteration)
train_low_corr = pca.transform(full_train[low_corr_columns])
for i in range(pca_component):
    new_training['low_corr_PCA_{}'.format(i+1)] = train_low_corr.T[i]

# New test data: same construction, reusing the PCA fitted on the training data
new_test = test_dataset[['DATETIME', 'CF']].copy()
# Create calendar features
create_calendar_feature(new_test, 'DATETIME')
# Project the test rows once, then store one column per PCA component
test_low_corr = pca.transform(test_dataset[low_corr_columns])
for i in range(pca_component):
    new_test['low_corr_PCA_{}'.format(i+1)] = test_low_corr.T[i]

# Check the correlation of the variables to be used

In [5]:
# Pairwise correlation matrix of the individually used variables (shown as the cell output)
full_train.loc[:, list(use_variable.keys())].corr()

Unnamed: 0,WS_2503597,WS_2508550,WS_2508545,WS_75936,WS_75935,WS_1953863,WS_77363,WS_75934,WS_2385763,WS_1358699,...,WS_2455069,WS_2118117,WS_2479342,WS_839753,WS_75931,WS_72720,WS_1911282,WS_78026,WS_75213,WS_77080
WS_2503597,1.0,0.905427,0.881287,0.881262,0.873305,-0.413239,0.864748,0.854457,-0.410288,-0.383627,...,0.299705,-0.308829,-0.307087,0.662701,0.699589,0.699468,-0.315601,0.613371,0.700333,0.711796
WS_2508550,0.905427,1.0,0.991695,0.991686,0.974953,-0.353438,0.964688,0.968915,-0.368953,-0.395853,...,0.219258,-0.366334,-0.364585,0.639208,0.836632,0.836111,-0.305672,0.695709,0.830845,0.83322
WS_2508545,0.881287,0.991695,1.0,0.999992,0.993148,-0.346683,0.987227,0.990884,-0.352662,-0.402064,...,0.215507,-0.369551,-0.368107,0.623911,0.825678,0.824979,-0.294037,0.659426,0.809598,0.802193
WS_75936,0.881262,0.991686,0.999992,1.0,0.99316,-0.346671,0.987238,0.990891,-0.352688,-0.402063,...,0.215435,-0.369628,-0.368185,0.623887,0.825684,0.824982,-0.294062,0.659442,0.80961,0.80221
WS_75935,0.873305,0.974953,0.993148,0.99316,1.0,-0.34416,0.998262,0.995524,-0.336376,-0.394413,...,0.225216,-0.362174,-0.360989,0.617687,0.783293,0.78242,-0.284384,0.598827,0.75925,0.748356
WS_1953863,-0.413239,-0.353438,-0.346683,-0.346671,-0.34416,1.0,-0.335214,-0.325918,0.654375,0.334106,...,-0.346937,0.096824,0.098182,-0.293072,-0.231407,-0.230669,0.295989,-0.230909,-0.235901,-0.236247
WS_77363,0.864748,0.964688,0.987227,0.987238,0.998262,-0.335214,1.0,0.99621,-0.32547,-0.378946,...,0.218038,-0.359358,-0.358203,0.619744,0.775041,0.774077,-0.275193,0.579864,0.748041,0.735643
WS_75934,0.854457,0.968915,0.990884,0.990891,0.995524,-0.325918,0.99621,1.0,-0.324617,-0.380508,...,0.196374,-0.366415,-0.365098,0.611201,0.808262,0.807354,-0.269586,0.611842,0.781153,0.766297
WS_2385763,-0.410288,-0.368953,-0.352662,-0.352688,-0.336376,0.654375,-0.32547,-0.324617,1.0,0.246054,...,-0.328082,0.048852,0.048436,-0.346358,-0.308051,-0.308719,0.312682,-0.309046,-0.307277,-0.302817
WS_1358699,-0.383627,-0.395853,-0.402064,-0.402063,-0.394413,0.334106,-0.378946,-0.380508,0.246054,1.0,...,-0.22594,0.151364,0.154566,-0.038855,-0.239216,-0.239683,0.182435,-0.274158,-0.211442,-0.182355


In [6]:
# Group the individually used variables into clusters
# (the grouping rule is implemented by create_cluster_list in Create_Feature)
variable_cluster = create_cluster_list(full_train, [*use_variable.keys()])

In [7]:
# Fit one PCA per variable cluster (on the training data only) and append its
# components to both the training and the test feature frames
for i in range(len(variable_cluster)):
    cluster_columns = variable_cluster[i]
    # NOTE(review): the meaning of the 1 and 0.9 arguments is defined by create_PCA - confirm there
    use_pca, use_pca_component = create_PCA(full_train, 1, cluster_columns, 0.9)
    # Transform each dataset once per cluster; the projections do not depend on
    # the component index, so they are hoisted out of the inner loop
    test_projection = use_pca.transform(test_dataset[cluster_columns])
    train_projection = use_pca.transform(full_train[cluster_columns])
    # Loop through the PCA components
    for cnt in range(use_pca_component):
        column_name = 'high_corr_cluster{}_PCA_{}'.format(i+1, cnt+1)
        new_test[column_name] = test_projection.T[cnt]
        new_training[column_name] = train_projection.T[cnt]

In [9]:
# Save the engineered training and test frames as CSV files without the index column
for output_frame, output_path in (
    (new_training, 'Prepared_Data/Training_DF_PCA.csv'),
    (new_test, 'Prepared_Data/Test_DF_PCA.csv'),
):
    output_frame.to_csv(output_path, index=False)