Some typical NumPy 'random' functions:

In [1]:
import numpy as np

# To generate a random float values between 0 and 1 (exclusive)
print(np.random.rand(3, 2)) # Creates a 3 x 2 matrix of random float values between 0 and 1 (exclusive)

# To generate random float values from standard normal distribution (mean = 0 and std = 1)
print(np.random.randn(5)) # Generates 5 random float values from standard normal distribution

# To generate random integer values within a range of values
print(np.random.randint(1, 100, 10)) # Generates 10 random integer values between 1 and 100 (exclusive)

# To randomly select an element from a given list of elements
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3)) # Three random numbers from the given array will be chosen

# To generate a random sample of values from a normally distributed with a specified mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10)) # 10 random values will be generated from a normally distributed data where 0 (loc) is the mean and 1 (scale) is the standard deviation

# To set a seed for a random number generation to ensure you get the same results later as well
print(np.random.seed(42)) # This ensures that any random numbers generated in the program are same every time you run the program. The seed value '42' can be any other value

[[0.05465629 0.36348264]
 [0.40592087 0.95105827]
 [0.25960668 0.77878719]]
[-1.88754397  1.99723337 -1.12694515 -0.06739986 -0.73945874]
[ 9 21 68 50 44 45 12 45 81 17]
[4 2 9]
[ 1.66274897  0.17160826 -2.12082309 -0.51022735  0.44487253 -0.5292614
  1.9623204  -0.53139903  0.2492942  -0.88148381]
None


AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 items using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [2]:
import pandas as pd
import numpy as np
import time

# 1. Generate 1 million random data items using Pandas
start_time_pd = time.time()
data_pd = pd.DataFrame(np.random.randint(1, 101, size=(1000000, 1)), columns=['RandomData'])
time_pd = time.time() - start_time_pd

# 2. Generate 1 million random data items using NumPy
start_time_np = time.time()
data_np = np.random.randint(1, 101, size=1000000)
time_np = time.time() - start_time_np

# 3. Print time and results
print(f"Pandas time: {time_pd:.4f} seconds")
print(f"NumPy time: {time_np:.4f} seconds")

# 3.2. Compare speed
if time_pd < time_np:
    print("Pandas is faster.")
else:
    print("NumPy is faster.")


Pandas time: 0.0070 seconds
NumPy time: 0.0070 seconds
Pandas is faster.


AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Usiong NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    5.1. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# 1. Using only pandas, load the dataset and calculate means
start_time_pd = time.time()
df = pd.read_csv('sleep_health.csv')

# 去掉列名的首尾空格
df.columns = df.columns.str.strip()

mean_values_pd = {
    'Sleep Duration': df['Sleep Duration'].mean(),
    'Systolic Blood Pressure': df['Systolic blood pressure'].mean(),
    'Diastolic Blood Pressure': df['Diastolic blood pressure'].mean(),
    'Heart Rate': df['Heart Rate'].mean(),
    'Daily Steps': df['Daily Steps'].mean()
}
time_pd_step1 = time.time() - start_time_pd

# 2. Using only NumPy to calculate means
start_time_np = time.time()
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1, dtype=None, encoding='utf-8', names=True)

mean_values_np = {
    'Sleep Duration': np.mean(data['Sleep Duration']),
    'Systolic Blood Pressure': np.mean(data['Systolic blood pressure']),
    'Diastolic Blood Pressure': np.mean(data['Diastolic blood pressure']),
    'Heart Rate': np.mean(data['Heart Rate']),
    'Daily Steps': np.mean(data['Daily Steps'])
}
time_np_step1 = time.time() - start_time_np

# 3. Using only pandas to calculate correlation
correlation_pd = df.corr()
correlation_sleep_age = df['Sleep Duration'].corr(df['Age'])
correlation_sleep_heart_rate = df['Sleep Duration'].corr(df['Heart Rate'])
correlation_sleep_daily_steps = df['Sleep Duration'].corr(df['Daily Steps'])
time_pd_step3 = time.time() - (start_time_pd + time_pd_step1)

# 4. Using only NumPy to calculate correlation
correlation_np_sleep_age = np.corrcoef(data['Sleep Duration'], data['Age'])[0, 1]
correlation_np_sleep_heart_rate = np.corrcoef(data['Sleep Duration'], data['Heart Rate'])[0, 1]
correlation_np_sleep_daily_steps = np.corrcoef(data['Sleep Duration'], data['Daily Steps'])[0, 1]
time_np_step3 = time.time() - (start_time_np + time_np_step1)

# 5. Calculate standard deviation for 'Sleep Duration' using Pandas
std_sleep_duration_pd = df['Sleep Duration'].std()
time_pd_step5 = time.time() - (start_time_pd + time_pd_step1 + time_pd_step3)

# 6. Calculate standard deviation for 'Sleep Duration' using NumPy
std_sleep_duration_np = np.std(data['Sleep Duration'])
time_np_step5 = time.time() - (start_time_np + time_np_step1 + time_np_step3)

# 7. Calculate total time
total_time_pd = time_pd_step1 + time_pd_step3 + time_pd_step5
total_time_np = time_np_step1 + time_np_step3 + time_np_step5

# 5.1. Compare speed
print(f"Pandas Mean Calculation Time: {time_pd_step1:.4f} seconds")
print(f"NumPy Mean Calculation Time: {time_np_step1:.4f} seconds")
print(f"Pandas Correlation Calculation Time: {time_pd_step3:.4f} seconds")
print(f"NumPy Correlation Calculation Time: {time_np_step3:.4f} seconds")
print(f"Pandas Standard Deviation Time: {time_pd_step5:.4f} seconds")
print(f"NumPy Standard Deviation Time: {time_np_step5:.4f} seconds")
print(f"Total Time for Pandas: {total_time_pd:.4f} seconds")
print(f"Total Time for NumPy: {total_time_np:.4f} seconds")

if total_time_pd < total_time_np:
    print("Pandas is faster.")
else:
    print("NumPy is faster.")


AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset using Pandas
df = pd.read_csv('sleep_health.csv')

# Print the columns of the DataFrame
print("Pandas列名：", df.columns.tolist())

# 1. Plot distributions using Pandas
plt.figure(figsize=(15, 12))

# 1.1 Age
plt.subplot(3, 2, 1)
sns.histplot(df['Age'], kde=True)
plt.title('Age Distribution')

# 1.2 Sleep Duration
plt.subplot(3, 2, 2)
sns.histplot(df['Sleep Duration'], kde=True)
plt.title('Sleep Duration Distribution')

# 1.3 Quality of Sleep
plt.subplot(3, 2, 3)
sns.histplot(df['Quality of Sleep'], kde=True)
plt.title('Quality of Sleep Distribution')

# 1.4 Physical Activity Level
plt.subplot(3, 2, 4)
sns.histplot(df['Physical Activity Level'], kde=True)
plt.title('Physical Activity Level Distribution')

# 1.5 Stress Level
plt.subplot(3, 2, 5)
sns.histplot(df['Stress Level'], kde=True)
plt.title('Stress Level Distribution')

# 1.6 Heart Rate
plt.subplot(3, 2, 6)
sns.histplot(df['Heart Rate'], kde=True)
plt.title('Heart Rate Distribution')

plt.tight_layout()
plt.show()

# Load the dataset using NumPy
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1, dtype=None, encoding='utf-8', names=True)

# Print the field names from NumPy
print("NumPy字段名：", data.dtype.names)

# Strip whitespace from column names if necessary
if data.dtype.names:
    data.dtype.names = [name.strip() for name in data.dtype.names]

plt.figure(figsize=(15, 12))

# 2.1 Age
plt.subplot(3, 2, 1)
plt.hist(data['Age'], bins=30, alpha=0.7)
plt.title('Age Distribution')

# 2.2 Sleep Duration
plt.subplot(3, 2, 2)
plt.hist(data['Sleep Duration'], bins=30, alpha=0.7)
plt.title('Sleep Duration Distribution')

# 2.3 Quality of Sleep
plt.subplot(3, 2, 3)
plt.hist(data['Quality of Sleep'], bins=30, alpha=0.7)
plt.title('Quality of Sleep Distribution')

# 2.4 Physical Activity Level
plt.subplot(3, 2, 4)
plt.hist(data['Physical Activity Level'], bins=30, alpha=0.7)
plt.title('Physical Activity Level Distribution')

# 2.5 Stress Level
plt.subplot(3, 2, 5)
plt.hist(data['Stress Level'], bins=30, alpha=0.7)
plt.title('Stress Level Distribution')

# 2.6 Heart Rate
plt.subplot(3, 2, 6)
plt.hist(data['Heart Rate'], bins=30, alpha=0.7)
plt.title('Heart Rate Distribution')

plt.tight_layout()
plt.show()


AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset using Pandas
df = pd.read_csv('sleep_health.csv')

# 确保只使用数值列进行绘图和计算
numeric_df = df.select_dtypes(include=[np.number])

# 1. Pairplot using Pandas
sns.pairplot(numeric_df, diag_kind='kde', markers='o')
plt.suptitle('Pairplot of Sleep Health Variables', y=1.02)
plt.show()

# 2. Heatmap for correlations using Pandas
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

# 3. Violin plot for Sleep Duration by Quality of Sleep using Pandas
plt.figure(figsize=(8, 6))
sns.violinplot(x='Quality of Sleep', y='Sleep Duration', data=df)
plt.title('Violin Plot of Sleep Duration by Quality of Sleep')
plt.show()

# Load the dataset using NumPy
data = np.genfromtxt('sleep_health.csv', delimiter=',', skip_header=1, dtype=None, encoding='utf-8', names=True)

# 4. Pairplot using NumPy (scatter matrix)
from pandas.plotting import scatter_matrix
scatter_matrix(pd.DataFrame(data), alpha=0.2, figsize=(15, 15), diagonal='kde')
plt.suptitle('Scatter Matrix of Sleep Health Variables', y=1.02)
plt.show()

# 5. Custom histogram with multiple categories using NumPy
plt.figure(figsize=(10, 6))
unique_quality = np.unique(data['Quality of Sleep'])
for quality in unique_quality:
    plt.hist(data['Sleep Duration'][data['Quality of Sleep'] == quality], bins=30, alpha=0.5, label=str(quality))
plt.title('Histogram of Sleep Duration by Quality of Sleep')
plt.xlabel('Sleep Duration')
plt.ylabel('Frequency')
plt.legend(title='Quality of Sleep')
plt.show()

# 6. Stacked bar plot for Physical Activity Level and Stress Level using Pandas
plt.figure(figsize=(10, 6))
activity_levels = df['Physical Activity Level'].value_counts()
stress_levels = df['Stress Level'].value_counts()

# 通过重构数据，确保能够叠加显示
activity_levels = activity_levels.reindex(stress_levels.index).fillna(0)

# 绘制堆叠柱状图
plt.bar(activity_levels.index, activity_levels, color='skyblue', label='Physical Activity Level', alpha=0.7)
plt.bar(stress_levels.index, stress_levels, color='salmon', label='Stress Level', alpha=0.5, bottom=activity_levels)
plt.title('Stacked Bar Plot of Activity Level and Stress Level')
plt.ylabel('Count')
plt.legend()
plt.show()
