In [2]:
# Math and Pandas
import numpy as np
import pandas as pd 

# Visualization Stuff
import matplotlib.pyplot as plt
import seaborn as sns

# DB stuff
from sqlalchemy import create_engine
import mariadb
import sys

In [3]:
# Create Database Connection
SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://airq:airq@127.0.0.2:3306/airq_data'
engine = create_engine(SQLALCHEMY_DATABASE_URI)

In [7]:
# Get data from DB and put it into a dataframe
long = pd.read_sql_table('measurements', engine)

# Set timestamp as index-column
long.set_index('timestamp',inplace=True)
long.index = pd.to_datetime(long.index)  # Convert the index to datetime type
long.head()

Unnamed: 0_level_0,TypPS,oxygen,pm10,cnt0_5,co,temperature,performance,co2,measuretime,so2,...,health,temperature_o2,cnt2_5,o3,humidity,dHdt,humidity_abs,sound,pm2_5,cnt0_3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-22 17:42:49,14.966,20.688,0.007,4.867,1.522,20.867,910.0,548.505,1876,38.461,...,968.0,24.841,0.014,11.55,44.833,-0.03,8.182,21.999,0.0,16.234
2023-03-22 17:44:48,14.996,20.687,0.001,3.496,1.521,20.865,915.0,543.5,1887,38.462,...,942.0,24.845,0.002,11.52,44.784,-0.06,8.172,24.962,0.0,14.324
2023-03-22 17:46:50,15.0,20.687,0.0,5.134,1.521,20.866,916.0,542.134,1858,38.463,...,938.0,24.834,0.0,11.506,44.768,-0.07,8.17,25.293,0.0,15.826
2023-03-22 17:48:48,14.081,20.688,0.213,4.223,1.521,20.864,918.0,539.314,1856,38.462,...,936.0,24.834,0.618,11.438,44.766,-0.04,8.169,26.403,0.0,12.719
2023-03-22 17:50:49,14.258,20.687,0.152,8.701,1.522,20.867,915.0,543.551,1862,38.462,...,935.0,24.838,0.543,11.393,44.768,0.0,8.17,25.934,0.0,26.107


In [8]:
# Delete metrics that are not usefull right now
long.drop('measuretime', axis = 1, inplace=True)
long.drop('health', axis = 1, inplace=True)
long.drop('performance', axis = 1, inplace=True)

long.columns

Index(['TypPS', 'oxygen', 'pm10', 'cnt0_5', 'co', 'temperature', 'co2', 'so2',
       'no2', 'cnt5', 'pm1', 'cnt1', 'dewpt', 'tvoc', 'pressure', 'cnt10',
       'dCO2dt', 'sound_max', 'temperature_o2', 'cnt2_5', 'o3', 'humidity',
       'dHdt', 'humidity_abs', 'sound', 'pm2_5', 'cnt0_3'],
      dtype='object')

In [9]:

short = long[long.index.month == 5]  # Filter the data for the month of May (month number = 5)

short.head()

Unnamed: 0_level_0,TypPS,oxygen,pm10,cnt0_5,co,temperature,co2,so2,no2,cnt5,...,sound_max,temperature_o2,cnt2_5,o3,humidity,dHdt,humidity_abs,sound,pm2_5,cnt0_3
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-05-01 00:01:57,8.234,20.908,0.75,69.669,1.666,19.885,452.664,108.993,32.377,0.063,...,53.8,23.873,0.507,11.233,46.336,0.0,7.986,50.079,0.509,223.084
2023-05-01 00:03:56,12.723,20.913,0.195,61.033,1.666,19.88,455.396,109.241,32.443,0.007,...,59.1,23.857,0.06,11.228,46.382,0.02,7.991,49.856,0.166,199.155
2023-05-01 00:05:57,6.421,20.915,0.636,81.678,1.665,19.871,453.948,109.349,32.096,0.001,...,53.8,23.85,0.007,11.27,46.398,0.02,7.99,49.894,0.633,260.464
2023-05-01 00:07:57,4.261,20.916,0.77,82.28,1.666,19.87,456.784,109.395,31.996,0.0,...,52.9,23.849,0.001,11.289,46.401,0.01,7.99,49.984,0.769,252.504
2023-05-01 00:09:57,3.769,20.915,0.988,78.312,1.666,19.871,455.627,109.493,32.291,0.246,...,54.9,23.85,0.246,11.288,46.402,0.0,7.991,49.997,0.804,243.669


In [12]:
corr_short = short.corr()
corr_long = long.corr()

# Method 2: Compare overall correlation similarity using correlation coefficient
correlation_similarity = np.corrcoef(corr_short.values.flatten(), corr_long.values.flatten())[0, 1]
print("Correlation Similarity:", correlation_similarity)

Correlation Similarity: 0.9301602149778943
