# In [None]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import statsmodels.api as sm

# Read the raw data file from disk into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='data/raw_data.csv')

# Preview the top of the table: a heading line followed by the default head() rows
print("First few rows of the raw data:", raw_data.head(), sep="\n")

# Data cleaning and preprocessing
# Keep only the rows that have no missing value in any column
processed_data = raw_data.dropna(axis=0, how='any')

# Persist the cleaned table (without the index column) so later steps can reuse it
processed_data.to_csv(path_or_buf='data/processed_data.csv', index=False)

# Perform basic analysis
# Generate summary statistics (count, mean, standard deviation, quartiles, etc.) to get an overview of the data
summary_statistics = processed_data.describe()

# DataFrame.to_csv does not create missing parent directories, so make sure the
# results folder exists before writing; exist_ok=True keeps re-runs harmless.
os.makedirs('data/results', exist_ok=True)
summary_statistics.to_csv('data/results/summary_statistics.csv')

# Exploratory Data Analysis (EDA) - Investigating sensor data characteristics

# 1. Data Distribution
# Build one histogram per sensor column (both figures are created before either is shown),
# then render them in order to inspect the frequency distribution of the readings
fig1, fig2 = (
    px.histogram(processed_data, x=column, title=heading)
    for column, heading in (
        ('sensor_1', 'Distribution of Sensor 1 Data'),
        ('sensor_2', 'Distribution of Sensor 2 Data'),
    )
)
fig1.show()
fig2.show()

# 2. Correlation Analysis
# Calculate the correlation matrix to understand the relationships between different sensor readings.
# Correlation is only defined for numeric data, but the table also carries label columns
# (e.g. 'condition', which holds values such as 'A'). On pandas >= 2.0, DataFrame.corr() no longer
# drops such columns silently and raises instead, so restrict the frame to numeric/boolean
# columns explicitly. This works the same way on older pandas versions.
numeric_data = processed_data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Visualize the correlation matrix to identify linear relationships between sensor readings
fig3 = px.imshow(correlation_matrix, text_auto=True, title='Correlation Matrix')
fig3.show()

# 3. Scatter Plot
# Plot 'sensor_1' against 'sensor_2' as an interactive Plotly scatter and render it
fig4 = px.scatter(
    data_frame=processed_data,
    x='sensor_1',
    y='sensor_2',
    title='Scatter Plot of Sensor Data',
)
fig4.show()

# 4. Box Plot
# One box plot per sensor column; points outside the whiskers are candidate outliers,
# which can indicate potential anomalies or errors
fig5 = px.box(
    data_frame=processed_data,
    y='sensor_1',
    title='Box Plot of Sensor 1 Data',
)
fig6 = px.box(
    data_frame=processed_data,
    y='sensor_2',
    title='Box Plot of Sensor 2 Data',
)
fig5.show()
fig6.show()

# Statistical Analysis for Sensor Data Insights

# 1. Descriptive Statistics
# Print a heading followed by the describe() table (central tendency and variability measures)
print("Descriptive Statistics:", processed_data.describe(), sep="\n")

# 2. Hypothesis Testing
# One-sample t-test of the null hypothesis that the mean of 'sensor_1' equals zero
t_stat, p_value = stats.ttest_1samp(a=processed_data['sensor_1'], popmean=0)
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")

# 3. Regression Analysis
# Ordinary least squares fit of 'sensor_2' (response) on 'sensor_1' (predictor) with an intercept
y = processed_data['sensor_2']
X = sm.add_constant(processed_data['sensor_1'])  # add the constant (intercept) column to the predictor
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

# Sampling Distributions - Understanding variability and distribution of sample means

# 1. Visualize the Distribution of Sample Means
num_samples = 1000  # number of resamples to draw
sample_size = 30    # observations per resample

# Draw each resample from 'sensor_1' with replacement and keep only its mean
sample_means = [
    processed_data['sensor_1'].sample(sample_size, replace=True).mean()
    for _ in range(num_samples)
]

# Histogram of the resample means: the empirical sampling distribution used to
# demonstrate the Central Limit Theorem
fig7 = px.histogram(
    sample_means,
    nbins=30,
    title='Sampling Distribution of the Sample Means (Sensor 1)',
)
fig7.show()

# 2. Apply the Central Limit Theorem
# Overlay a normal curve, parameterised by the empirical mean and standard deviation
# (NumPy default, ddof=0) of the sample means, on their density-normalised histogram
sample_means_array = np.array(sample_means)
mean_of_sample_means = sample_means_array.mean()
std_of_sample_means = sample_means_array.std()

# Evaluate the normal pdf on 100 evenly spaced points spanning +/- 3 standard deviations.
# NOTE: `y` is rebound here; it no longer refers to the 'sensor_2' series assigned earlier in the script.
half_width = 3 * std_of_sample_means
x = np.linspace(mean_of_sample_means - half_width, mean_of_sample_means + half_width, 100)
y = stats.norm(loc=mean_of_sample_means, scale=std_of_sample_means).pdf(x)

# Histogram trace first, then the normal curve drawn on top of it
fig8 = go.Figure(
    data=[
        go.Histogram(x=sample_means, nbinsx=30, histnorm='probability density'),
        go.Scatter(x=x, y=y, mode='lines', name='Normal Distribution'),
    ]
)
fig8.update_layout(
    title='Central Limit Theorem: Sample Means Distribution vs Normal Distribution',
    xaxis_title='Sample Means',
    yaxis_title='Density',
)
fig8.show()

# Conducting Statistical Experiments and Testing Hypotheses

# 1. A/B Testing
# Pull the 'sensor_1' readings recorded under condition A and under condition B
condition_A = processed_data.loc[processed_data['condition'] == 'A', 'sensor_1']
condition_B = processed_data.loc[processed_data['condition'] == 'B', 'sensor_1']

# Two-sample t-test on the group means; equal_var=False selects Welch's variant,
# which does not assume the two groups share a variance
t_stat_ab, p_value_ab = stats.ttest_ind(a=condition_A, b=condition_B, equal_var=False)
print(f"Two-Sample T-Test: T-Statistic: {t_stat_ab}, P-Value: {p_value_ab}")

# 2. ANOVA Test
# Add the condition C readings to the A and B groups selected for the A/B test
condition_C = processed_data.loc[processed_data['condition'] == 'C', 'sensor_1']

# One-way ANOVA across the three groups: tests whether at least one group mean differs
anova_groups = (condition_A, condition_B, condition_C)
f_stat, p_value_anova = stats.f_oneway(*anova_groups)
print(f"ANOVA Test: F-Statistic: {f_stat}, P-Value: {p_value_anova}")

# 3. Chi-Square Test
# Cross-tabulate condition (rows) against sensor status (columns) to get observed counts
contingency_table = pd.crosstab(
    index=processed_data['condition'],
    columns=processed_data['sensor_status'],
)

# Chi-square test of independence on the observed counts; `expected` holds the
# expected frequencies returned by the test
chi2_stat, p_value_chi2, dof, expected = stats.chi2_contingency(observed=contingency_table)
print(f"Chi-Square Test: Chi2-Statistic: {chi2_stat}, P-Value: {p_value_chi2}, Degrees of Freedom: {dof}")

# Additional data visualization with Plotly
# Histogram of the 'sensor_1' readings, rendered interactively
fig9 = px.histogram(
    data_frame=processed_data,
    x='sensor_1',
    title='Histogram of Sensor 1 Data',
)
fig9.show()
