In [11]:
%run importFunctions.ipynb

# Read the Sierra Leone (Bumbuna) CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced with a configurable/relative location.
sierraleone_csv_path = 'C:/Users/KIIT/Desktop/Wind-Solar-Analysis/data/sierraleone-bumbuna.csv'
sierraleone_df = load_data(sierraleone_csv_path)

# Print a heading, then the table produced by summary_statistics().
print('Summary Statistics for Sierraleone:')
sierraleone_summary = summary_statistics(sierraleone_df)
print(sierraleone_summary)

Summary Statistics for Sierraleone:
                           Timestamp            GHI            DNI  \
count                         525600  525600.000000  525600.000000   
mean   2022-04-30 12:00:30.000000768     201.957515     116.376337   
min              2021-10-30 00:01:00     -19.500000      -7.800000   
25%              2022-01-29 06:00:45      -2.800000      -0.300000   
50%              2022-04-30 12:00:30       0.300000      -0.100000   
75%              2022-07-30 18:00:15     362.400000     107.000000   
max              2022-10-30 00:00:00    1499.000000     946.000000   
std                              NaN     298.495150     218.652659   

                 DHI           ModA           ModB           Tamb  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      113.720571     206.643095     198.114691      26.319394   
min       -17.900000       0.000000       0.000000      12.300000   
25%        -3.800000       0.000000       0.000000      2

In [None]:
# Data quality check over the columns listed below.
# The helper is presumably defined by importFunctions.ipynb (not visible here).
columns_to_check = [
    'GHI', 'DNI', 'DHI',   # irradiance columns
    'ModA', 'ModB',
    'WS', 'WSgust',        # wind-related columns
]
data_quality_check(sierraleone_df, columns_to_check)

In [None]:
# Time series of GHI, DNI, DHI and Tamb for the sierraleone dataset,
# using 'Timestamp' as the date column.
plot_time_series(
    sierraleone_df,
    ['GHI', 'DNI', 'DHI', 'Tamb'],
    date_column='Timestamp',
)

# Impact of the 'Cleaning' column on ModA and ModB for the same dataset.
evaluate_cleaning_impact(
    sierraleone_df,
    ['ModA', 'ModB'],
    cleaning_column='Cleaning',
    date_column='Timestamp',
)

In [None]:
# Columns shared by the correlation heatmap and the pair plot.
columns_to_include = [
    'GHI',
    'DNI',
    'DHI',
    'TModA',
    'TModB',
]

# Correlation heatmap first, then a pair plot over the identical column set.
plot_correlation_heatmap(sierraleone_df, columns_to_include)
plot_pairplot(sierraleone_df, columns_to_include)

# Wind-related columns, plotted against the irradiance columns in a
# scatter matrix.
wind_columns = ['WS', 'WSgust', 'WD']
plot_wind_scatter_matrix(
    sierraleone_df,
    wind_columns,
    ['GHI', 'DNI', 'DHI'],
)

In [None]:
# Polar wind plot for the sierraleone dataset from the 'WS' and 'WD' columns.
plot_wind_polar(
    sierraleone_df,
    'WS',
    'WD',
)

# Variability plot for the 'WD' column of the same dataset.
plot_wind_direction_variability(
    sierraleone_df,
    'WD',
)

In [None]:
"""Temperature Analysis"""

# Visualize the relationship between Tamb, RH and the irradiance columns.
plot_temperature_vs_rh(
    sierraleone_df,
    'Tamb',
    'RH',
    'GHI',
    'DNI',
    'DHI',
)

# Correlation analysis over the same five columns.
correlation_analysis(
    sierraleone_df,
    ['Tamb', 'RH', 'GHI', 'DNI', 'DHI'],
)


In [None]:
# Histograms for the sierraleone dataset.
# One histogram is requested per variable in this list.
columns = [
    'GHI',
    'DNI',
    'DHI',
    'WS',
    'Tamb',
]

plot_histograms(sierraleone_df, columns)

In [None]:
# Z-Score Analysis
#
# Author's description of calculate_z_scores (the helper is defined outside
# this notebook section — confirm against its implementation):
#   * Z-scores: each variable (e.g. GHI, DNI) gets a corresponding Z-score
#     column (e.g. GHI_z_score). The Z-score is the number of standard
#     deviations a data point is from the mean.
#   * Outliers: each variable gets a boolean column (e.g. GHI_outlier) that is
#     True when the absolute Z-score is greater than the threshold.
#
# This explanation used to be a trailing triple-quoted string. As the last
# expression of the cell it was echoed as the cell's output, so it is kept as
# comments instead.

# Variables to run the Z-score analysis on.
columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']

# Calculate Z-scores and outlier flags for the sierraleone dataset.
sierraleone_z_scores = calculate_z_scores(sierraleone_df, columns)

# A row is kept when any column whose name contains '_outlier' is truthy.
outlier_mask = sierraleone_z_scores.filter(like='_outlier').any(axis=1)
outliers = sierraleone_df[outlier_mask]

# Display the flagged outliers.
print("Outliers in the sierraleone dataset:")
print(outliers)


In [None]:
# Bubble chart of GHI (x) vs. Tamb (y) for the sierraleone dataset.
# Bubble size is taken from WS and bubble color from RH, matching the title.
plot_bubble_chart(
    title='GHI vs. Tamb vs. WS (Bubble Size) and RH (Color)',
    df=sierraleone_df,
    x_column='GHI',
    y_column='Tamb',
    size_column='WS',
    color_column='RH',
)


In [None]:
import matplotlib.pyplot as plt

# Cleaning pipeline for the sierraleone dataset.
#
# Each step consumes the result of the previous one. Previously the anomaly and
# outlier steps were both applied to the raw `sierraleone_df`, so the
# missing-value and anomaly handling were silently discarded and only the
# outlier handling reached `sierraleone_df_cleaned`.

# Columns passed to both the anomaly and the outlier handlers.
cleaning_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'ModA', 'ModB', 'WS', 'WSgust']

# Step 1: handle missing values (starts from the raw frame).
sierraleone_df_cleaned = handle_missing_values(sierraleone_df)

# Step 2: handle anomalies (e.g., negative values) on the step-1 result.
sierraleone_df_cleaned = handle_anomalies(sierraleone_df_cleaned, columns=cleaning_columns)

# Step 3: handle outliers on the step-2 result.
sierraleone_df_cleaned = handle_outliers(sierraleone_df_cleaned, columns=cleaning_columns)

# Example: plot GHI against Timestamp for the cleaned frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sierraleone_df_cleaned['Timestamp'], sierraleone_df_cleaned['GHI'], label='GHI')
ax.set_xlabel('Date')
ax.set_ylabel('GHI')
ax.set_title('GHI Over Time')
ax.legend()
plt.show()

# Display the first few rows of the cleaned DataFrame (last expression, so the
# notebook renders it as the cell output).
sierraleone_df_cleaned.head()