# Imports

In [11]:
# ALL
import os
import re

# AS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# FROM
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from PIL import Image
from scipy.stats import skew
from tqdm import tqdm
from collections import Counter

# FROM FILE
from get_data_stats import *
from load_metadata import *
from load_and_read_data import *
from plot_charts_and_graphs import *

# Loading and Reading Data

## Data folder and files

In [12]:
data_folder_path = '/Users/izzymohamed/Downloads/Telegram Desktop/Dataset/'

In [13]:
forecast_open_meteo_path = data_folder_path + 'forecasts/open_meteo.csv'

In [14]:
print('Forecast dataset path:', forecast_open_meteo_path)

Forecast dataset path: /Users/izzymohamed/Downloads/Telegram Desktop/Dataset/forecasts/open_meteo.csv


## Storig into variables

In [15]:
# forecast_open_meteo_csv = create_ref_df(forecast_path)

In [16]:
# print(forecast_open_meteo_csv['file_format'])

In [17]:
# For tabular data (e.g., CSV files), use pandas
# forecast_info_csv_df = read_csv(forecast_path + 'forecast_info.csv')

In [18]:
# Load the Forecast dataset
forecast_open_meteo_csv = load_data(forecast_open_meteo_path, 'single')

# Data Cleaning and Preprocessing

# Data Integration

For projects involving longitudinal data (OASIS-2, OASIS-3, OASIS-4), you may need to integrate data across time points or modalities.

# Exploratory Data Analysis (EDA)

##  Statistical Summary

In [19]:
forecast_open_meteo_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5292 entries, 0 to 5291
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   timestamp           5292 non-null   object
 1   temperature         5292 non-null   object
 2   humidity            5292 non-null   object
 3   pressure_msl        5292 non-null   object
 4   pressure_surface    5292 non-null   object
 5   global_irradiance   5292 non-null   object
 6   direct_irradiance   5292 non-null   object
 7   diffuse_irradiance  5292 non-null   object
 8   cloud_cover         5292 non-null   object
 9   wind_speed          5292 non-null   object
 10  wind_direction      5292 non-null   object
 11  precipitation       5292 non-null   object
 12  rain                5292 non-null   object
 13  showers             5292 non-null   object
 14  snowfall            5292 non-null   object
 15  weather_code        5292 non-null   object
dtypes: object(16)
memory usa

In [20]:
forecast_image_stats = get_csv_stats(forecast_open_meteo_csv)

Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object
Column type: object


In [21]:
forecast_image_stats

Unnamed: 0,mean,std,min,max,skew
timestamp,,,,,
temperature,,,,,
humidity,,,,,
pressure_msl,,,,,
pressure_surface,,,,,
global_irradiance,,,,,
direct_irradiance,,,,,
diffuse_irradiance,,,,,
cloud_cover,,,,,
wind_speed,,,,,


### Save Statistical Features

In [22]:
forecast_image_stats.to_csv('Forecast_Open_Meteo_statistical_features.csv', index=False)

## Check Data

In [23]:
# Count the number of unique values for each column
print(forecast_image_stats.nunique())

mean    0
std     0
min     0
max     0
skew    0
dtype: int64


In [24]:
# Count the number of unique values for each column
print(forecast_open_meteo_csv.nunique())

timestamp             5292
temperature           5292
humidity              5292
pressure_msl          5292
pressure_surface      5292
global_irradiance     5292
direct_irradiance     5292
diffuse_irradiance    5292
cloud_cover           5292
wind_speed            5292
wind_direction        5292
precipitation         4752
rain                  4156
showers               3876
snowfall               749
weather_code          5292
dtype: int64


## Data Summary

In [25]:
# print(f'Length of entire dataset: {len(forecast_images)}')
# print('\n')

# # Print length of each unique label
# for label, count in Counter(list(forecast_labels)).items():
#     print(f'Length of {label} class: {count} ---------------- {count/len(forecast_labels)*100:.2f}%')

# print('\n')
# # Print if the classes are balanced or not
# print('The classes are balanced.' if len(set(forecast_labels)) == len(forecast_labels) else 'The classes are imbalanced.')

# print('\n')
# # List the miss values in forecast_open_meteo_csv['patient_id'] compared to forecast_info_csv_df['patient_id']
# if forecast_open_meteo_csv['patient_id'].isin(forecast_info_csv_df['patient_id']).all():
#     print('All values are the same in both datasets.')
# else:
#     print('The original dataset and the one from kaggle are not the same:')
#     print('\n')
#     print(f"{len(forecast_info_csv_df[~forecast_info_csv_df['patient_id'].isin(forecast_open_meteo_csv['patient_id'])]['patient_id'].sort_values().unique())} Missing values in the Kaggle dataset:")
#     print(forecast_info_csv_df[~forecast_info_csv_df['patient_id'].isin(forecast_open_meteo_csv['patient_id'])]['patient_id'].sort_values().unique())
#     print('\n')
#     print(f"{len(forecast_open_meteo_csv[~forecast_open_meteo_csv['patient_id'].isin(forecast_info_csv_df['patient_id'])]['patient_id'].sort_values().unique())} Extra value than original data set:")
#     print(forecast_open_meteo_csv[~forecast_open_meteo_csv['patient_id'].isin(forecast_info_csv_df['patient_id'])]['patient_id'].sort_values().unique())

## Data Visualization

### Histogram of Labels

In [26]:
plot_histogram(forecast_open_meteo_csv, 'label', 'Alzheimers Class Count', 'Label', 'Count','px')

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['timestamp', 'temperature', 'humidity', 'pressure_msl', 'pressure_surface', 'global_irradiance', 'direct_irradiance', 'diffuse_irradiance', 'cloud_cover', 'wind_speed', 'wind_direction', 'precipitation', 'rain', 'showers', 'snowfall', 'weather_code'] but received: label

### Mean vs Std of Images

In [None]:
plot_scatter(forecast_image_stats, 'mean', 'std', 'Mean vs STD of Images Per Class', 'Mean', 'STD', 'label', 'px')

### Mean of Images vs Skew of Image Histograms

In [None]:
plot_scatter(forecast_image_stats, 'mean', 'skew', 'Mean vs Skew of Images Per Class', 'Mean', 'Skew', 'label', 'px')

### Std of Images vs Skew of Image Histograms

In [None]:
plot_scatter(forecast_image_stats, 'std', 'skew', 'STD vs Skew of Images Per Class', 'STD', 'Skew', 'label', 'px')

### 3D Scatter Plot Using Mean, Std, Skew

In [None]:
dist = forecast_open_meteo_csv['label'].value_counts(normalize=True)
dist = dist.reset_index()
dist.columns = ['label', 'percentage']
dist['percentage'] = dist['percentage'] * 100
dist = dist.sort_values(by='label')

sampled_image_stats = pd.DataFrame()
for label in dist['label'].unique():
    sampled_image_stats = pd.concat([sampled_image_stats, forecast_image_stats[forecast_open_meteo_csv['label'] == label].sample(n=100, random_state=42)])

sampled_image_stats = sampled_image_stats.reset_index(drop=True)

In [None]:
plot_3d_scatter(sampled_image_stats, 'mean', 'std', 'skew', '3D Scatter Plot of Image Statistics', 'Mean', 'STD', 'Skew', 'label', 'px')

### Distributions

In [None]:
plot_box_plot(forecast_image_stats, 'label', 'mean', 'Mean of Images Per Class', 'Label', 'Mean', 'px')

In [None]:
plot_box_plot(forecast_image_stats, 'label', 'std', 'STD of Images Per Class', 'Label', 'STD', 'px')

In [None]:
plot_box_plot(forecast_image_stats, 'label', 'skew', 'Skew of Images Per Class', 'Label', 'Skew', 'px')

## Neuroimaging Data Exploration

For neuroimaging data (especially in OASIS-3), you might want to visualize MRI or PET scans.

In [None]:
# Extracting data from an MRI scan
# mri_data = mri_example.get_fdata()
# mri_data = load_nii_data(forecast_path)

In [None]:
# Visualizing a slice
# plt.imshow(mri_data[:, :, mri_data.shape[2] // 2], cmap='gray')
# plt.title('Middle Slice of an MRI Scan')
# plt.show()

# Advanced Analysis (Optional)

Depending on your project's goals, you might conduct more advanced analyses, such as PCA for dimensionality reduction, especially useful for high-dimensional neuroimaging data.

In [None]:
# Example: PCA on a feature set
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features)  # 'features' would be your dataset
# pca = PCA(n_components=2)
# principal_components = pca.fit_transform(features_scaled)