# Import Libraries

In [4]:
!python --version

Python 3.10.0


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from datetime import datetime 

In [None]:
# Better visualization: switch matplotlib to its built-in 'ggplot' style sheet
# so the figures created in the cells below share that look.
plt.style.use('ggplot')

## Declare Constants

In [None]:
# Relative path of the raw Spain training CSV; it is passed to pd.read_csv
# in the "Load Data" step.
SPAIN_RAW_DATA = '../raw-data/Spain_training.csv'

### Logging progress

In [None]:
def log_progress(message):
    """Append a timestamped progress message to ``./log/spain_log.txt``.

    Each call adds one line of the form ``<timestamp> : <message>``.

    Args:
        message: Text to record after the timestamp. Non-string values are
            formatted with ``str()``.
    """
    import os  # function-scope import keeps this cell self-contained

    # Year-MonthAbbrev-Day-Hour:Minute:Second. ``%b`` is the documented
    # directive for the abbreviated month name; ``%h`` is only a
    # platform-dependent alias for it.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    log_path = "./log/spain_log.txt"

    # Create the log directory on first use; without this the open() call
    # raises FileNotFoundError when ./log does not exist yet.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    timestamp = datetime.now().strftime(timestamp_format)
    # An explicit encoding keeps the log readable regardless of the locale.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} : {message}\n")

# 1. Load Data

In [None]:
# Spain Training
# Record the start of the run, read the raw CSV at SPAIN_RAW_DATA into the
# spain_train DataFrame, then record that extraction finished.

log_progress('Preliminaries complete. Initiating main process')

spain_train = pd.read_csv(SPAIN_RAW_DATA)

log_progress('Data extraction completed. Initiating EDA process')

# 2. Exploratory Data Analysis (EDA)

### 2.1.0 Shape of DataFrames

In [None]:
# Report the (rows, columns) dimensions of the training DataFrame
print(f"Spain Training Shape: {spain_train.shape}")

### 2.1.1 Missing Values

In [None]:
# Count the missing entries in every column and print the per-column totals
spain_missing_values = spain_train.isna().sum()
print(f"Spain Training Data Missing Values: {spain_missing_values}")

### 2.2 Descriptive Statistics

In [None]:
# Print the label as plain text and render only the summary table with
# display(): passing a str to display() shows its repr (surrounding quotes
# and a literal "\n") instead of a clean heading.
print("Spain Training Data Summary:")
display(spain_train.describe())

### 2.3 Data Visualization

In [None]:
# Distribution of every feature: one histogram with a KDE overlay per column
for feature in spain_train.columns:
    if feature == "TARGET":
        continue  # skip the target column
    plt.figure(figsize=(8, 4))
    hist_axes = sns.histplot(spain_train[feature], kde=True)
    hist_axes.set_title(f"Distribution of {feature}")
    plt.show()

In [None]:
# One box plot per feature column to make outliers visible
for feature in spain_train.columns:
    if feature == "TARGET":
        continue  # skip the target column
    plt.figure(figsize=(8, 4))
    box_axes = sns.boxplot(y=spain_train[feature])
    box_axes.set_title(f"Boxplot of {feature}")
    plt.show()

### 2.4 Correlation Analysis

In [None]:
# Correlation Matrix
# Keep only numeric/boolean columns before calling corr(): since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently dropping
# them, and select_dtypes() gives the same result on older versions too.
correlation_matrix = spain_train.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix for Spain Training Data")
plt.show()

In [None]:
# Scatter plot of a single feature against the label: 'nir_p50' vs. 'TARGET'
plt.figure(figsize=(8, 4))
scatter_axes = sns.scatterplot(x=spain_train["nir_p50"], y=spain_train["TARGET"])
scatter_axes.set_title("Scatter Plot of nir_p50 vs. TARGET")
scatter_axes.set_xlabel("nir_p50")
scatter_axes.set_ylabel("TARGET")
plt.show()

In [None]:
# Save a checkpoint after loading and initial EDA.
# NOTE: the DataFrame has not been modified yet at this point, and the same
# file is written again by the checkpoint in the data-cleaning step.
# The with-block closes (and flushes) the file; passing a bare open() to
# pickle.dump leaves the handle open.
with open("cleaned-data/spain_training_cleaned.pkl", 'wb') as checkpoint_file:
    pickle.dump(spain_train, checkpoint_file)

log_progress('Explorative Data Analysis completed')

# 3. Feature Selection

Most influential columns for classification, based on the EDA above (list still to be filled in):

In [None]:
# NOTE(review): no feature-selection code runs in this cell before the
# message below is logged — confirm whether the step is still to be written.
log_progress('Feature selection completed')

# 4. Data Cleaning (Optional)

Remove outliers and handle missing values, as needed, based on the findings of the EDA.

In [None]:
# Save checkpoint after cleaning.
# The with-block closes (and flushes) the file; passing a bare open() to
# pickle.dump leaves the handle open.
with open("cleaned-data/spain_training_cleaned.pkl", 'wb') as checkpoint_file:
    pickle.dump(spain_train, checkpoint_file)

log_progress('Data cleaning completed')

# 5. Save Cleaned Data

Save the cleaned Spain training data as a CSV file in the `cleaned-data` folder.

In [None]:

# Write the DataFrame to CSV; index=False leaves the row index out of the
# file. Then record the save in the progress log.
spain_train.to_csv("cleaned-data/spain_training_cleaned.csv", index=False)

log_progress('Data saved to CSV file')