In [7]:
# Print the notebook's banner line.
print("10 Academy week 0 challenge starter code")

10 Academy week 0 challenge starter code


In [8]:
import pandas as pd
# Load the benin-malanville CSV; the path is relative, so it resolves against
# the directory the notebook kernel was started in.
df = pd.read_csv('../data/benin-malanville.csv')
# describe() yields count / mean / std / min / quartiles / max per column.
summary = df.describe()
# print() renders the table as plain text, which wraps wide frames into
# several column groups.
print(summary)

                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      240.559452     167.187516     115.358961     236.589496   
std       331.131327     261.710501     158.691074     326.894859   
min       -12.900000      -7.800000     -12.600000       0.000000   
25%        -2.000000      -0.500000      -2.100000       0.000000   
50%         1.800000      -0.100000       1.600000       4.500000   
75%       483.400000     314.200000     216.300000     463.700000   
max      1413.000000     952.300000     759.200000    1342.300000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      228.883576      28.179683      54.487969       2.121113   
std       316.536515       5.924297      28.073069       1.603466   
min         0.000000      11.000000       2.100000       0.000000   
25%         0.000000      24.2000

In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

Matplotlib is building the font cache; this may take a moment.


## Data Cleaning

In [10]:

def data_clean(data):
    """
    Return a cleaned copy of a measurement dataset.

    Steps, in order:
        1. Replace the values -1, +inf and -inf with NaN in every column.
        2. Parse the 'Timestamp' column (when present) to datetime; entries
           that cannot be parsed become NaT.
        3. Fill the remaining NaN values of each numeric column with that
           column's mean.
        4. Drop rows that are exact duplicates of an earlier row.

    The frame passed in is not modified.

    Args:
        data (pandas.DataFrame): The raw dataset to clean.

    Returns:
        pandas.DataFrame: The cleaned copy. Row labels are not renumbered
        after duplicates are dropped.
    """
    # replace() returns a new frame, so every later step works on a copy.
    # NOTE(review): -1 is treated as an invalid marker in *every* column. If a
    # column can legitimately contain exactly -1 (e.g. a small negative sensor
    # reading), that value is imputed with the mean too -- confirm intent.
    cleaned = data.replace([-1, np.inf, -np.inf], np.nan)

    # Only parse timestamps when the column exists, so frames without one can
    # still be cleaned instead of failing with a KeyError.
    if 'Timestamp' in cleaned.columns:
        cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'], errors='coerce')

    # Mean-impute numeric columns in one vectorised call: fillna() with a
    # Series of column means fills each column with its own mean. A column
    # that is entirely NaN has a NaN mean and is therefore left as-is.
    numeric_columns = cleaned.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        column_means = cleaned[numeric_columns].mean()
        cleaned[numeric_columns] = cleaned[numeric_columns].fillna(column_means)

    # Duplicates are detected after imputation, on the full row.
    return cleaned.drop_duplicates()


## Data Analysis

In [11]:
def process_and_analyze(data, location_name, variables=None):
    """
    Cleans a solar radiation dataset and prints/plots an exploratory analysis.

    The data is cleaned with ``data_clean`` and indexed by 'Timestamp'. The
    function then, in order: prints summary statistics of the numeric columns,
    draws a time-series line plot, a correlation heatmap of the numeric
    columns, a Tamb-vs-GHI scatter plot coloured by 'Precipitation', a box
    plot, and a GHI histogram, and finally prints the per-column count of
    values that are still missing.

    Args:
        data (pandas.DataFrame): The dataset containing solar radiation
            measurements. It must provide the columns this function reads by
            name: 'Timestamp', 'GHI', 'Tamb' and 'Precipitation', plus every
            column listed in ``variables``.
        location_name (str): The name of the location; used in the printed
            headings and in the plot titles.
        variables (list, optional): Column names drawn in the line plot and
            the box plot. When None or empty, ['GHI', 'DNI', 'DHI', 'Tamb']
            is used. Defaults to None.

    Returns:
        None
    """

    # Clean the dataset and handle missing values; work on the returned frame
    # from here on.
    data = data_clean(data)

    if data.empty:
        print(f"No data available after cleaning for {location_name}.")
        return

    # Index by 'Timestamp' so the wide-form line plot uses time on its x-axis.
    # Rebinding (rather than inplace=True) keeps the step free of hidden
    # mutation.
    data = data.set_index('Timestamp')

    # Generate summary statistics for numerical columns
    summary_stats = data.describe(include=[np.number])
    print(f"\nSummary statistics for {location_name}:")
    print(summary_stats)

    # Analyze time-series patterns and trends
    if not variables:
        variables = ['GHI', 'DNI', 'DHI', 'Tamb']

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=data[variables])
    plt.title(f"Time-series trends for {location_name}")
    plt.xlabel("Time")
    plt.ylabel("Value")
    plt.show()

    # Perform correlation analysis between variables. Restrict to numeric
    # columns explicitly: DataFrame.corr() on a frame that still holds
    # text/object columns raises on pandas >= 2.0, while older versions
    # silently dropped them -- selecting first behaves the same on both.
    numeric_data = data.select_dtypes(include=[np.number])
    correlation_matrix = numeric_data.corr()
    # Give the heatmap its own figure like every other plot here; otherwise
    # it is drawn on the small default canvas and the annotations overlap.
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f"Correlation Matrix for {location_name}")
    plt.show()

    # Create scatter plots to visualize relationships and detect anomalies
    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=data, x='Tamb', y='GHI', hue='Precipitation', palette='viridis', s=20)
    plt.title(f"Scatter Plot of Tamb vs. GHI with Precipitation for {location_name}")
    plt.xlabel("Tamb (°C)")
    plt.ylabel("GHI (W/m²)")
    plt.show()

    # Use box plots to identify outliers and distribution spread
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data[variables])
    plt.title(f"Box Plot for {location_name}")
    plt.show()

    # Plot a histogram to examine the distribution of GHI
    plt.figure(figsize=(12, 6))
    sns.histplot(data['GHI'], kde=True)
    plt.title(f"Histogram of GHI for {location_name}")
    plt.xlabel("GHI (W/m²)")
    plt.ylabel("Count")
    plt.show()

    # Report on any missing values after the cleaning and imputation process
    missing_values = data.isnull().sum()
    print(f"\nMissing values after cleaning and imputing for {location_name}:")
    print(missing_values)
