Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# To generate random float values drawn uniformly from the half-open interval [0, 1)
print(np.random.rand(3, 2)) # Creates a 3 x 2 matrix of random floats; 0 can occur, 1 never does

# To generate random float values from standard normal distribution (mean = 0 and std = 1)
print(np.random.randn(5)) # Generates 5 random float values from standard normal distribution

# To generate random integer values within a range of values
print(np.random.randint(1, 100, 10)) # Generates 10 random integers from 1 (inclusive) to 100 (exclusive), i.e. 1..99

# To randomly select elements from a given list of elements
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3)) # Three values are drawn from the given list (with replacement by default, so repeats are possible)

# To generate a random sample of values from a normal distribution with a specified mean and standard deviation
print(np.random.normal(loc=0, scale=1, size=10)) # 10 random values are drawn from a normal distribution where 0 (loc) is the mean and 1 (scale) is the standard deviation

# To set a seed for random number generation to ensure you get the same results later as well
# seed() returns None, so it is called directly rather than wrapped in print() (which would just print "None").
# Only random numbers generated AFTER this call are reproducible; the draws above were made before seeding.
np.random.seed(42) # The seed value '42' can be any other integer

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# Benchmark: build 1,000,000 random integers in [1, 100] as a pandas DataFrame and as a NumPy array.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring short durations; time.time() can be
# too coarse for an operation that takes only a few milliseconds.

# Note: the "pandas" measurement wraps NumPy-generated values in a DataFrame,
# so it times the same generation step plus the DataFrame construction.
start_time_pandas = time.perf_counter()
pandas_df = pd.DataFrame({'Random Data': np.random.randint(1, 101, size=1000000)})
end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas

# randint's upper bound is exclusive, hence 101 to include 100.
start_time_numpy = time.perf_counter()
numpy_array = np.random.randint(1, 101, size=1000000)
end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy

print(f"Pandas time: {pandas_time} seconds")
print(f"NumPy time: {numpy_time} seconds")

if pandas_time < numpy_time:
    print("Pandas was faster.")
else:
    print("NumPy was faster.")

AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
import pandas as pd
import numpy as np
import time

# --- pandas pipeline: load the CSV, then compute means, correlations and standard deviation ---
# perf_counter() is a monotonic, high-resolution clock, which makes it a better
# fit for measuring elapsed time than the wall-clock time.time().
start_time_pandas = time.perf_counter()

pandas_data = pd.read_csv('sleep_health.csv')

print("Columns in the dataset:", pandas_data.columns)

# Column means
mean_sleep_duration_pandas = pandas_data['Sleep Duration'].mean()
mean_systolic_bp_pandas = pandas_data['Systolic blood pressure'].mean()
mean_diastolic_bp_pandas = pandas_data['Diastolic blood pressure'].mean()
mean_heart_rate_pandas = pandas_data['Heart Rate'].mean()
mean_daily_steps_pandas = pandas_data['Daily Steps'].mean()

print(f"Pandas Mean Sleep Duration: {mean_sleep_duration_pandas}")
print(f"Pandas Mean Systolic BP: {mean_systolic_bp_pandas}")
print(f"Pandas Mean Diastolic BP: {mean_diastolic_bp_pandas}")
print(f"Pandas Mean Heart Rate: {mean_heart_rate_pandas}")
print(f"Pandas Mean Daily Steps: {mean_daily_steps_pandas}")

# Correlation matrix over the numeric columns only (non-numeric columns are dropped first)
numeric_data = pandas_data.select_dtypes(include=[np.number])
correlation_matrix_pandas = numeric_data.corr()

print("Pandas Correlation Matrix:")
print(correlation_matrix_pandas)

# Pairwise correlations of Sleep Duration with selected columns
sleep_duration_pandas = pandas_data['Sleep Duration']
corr_sleep_age_pandas = sleep_duration_pandas.corr(pandas_data['Age'])
corr_sleep_heart_rate_pandas = sleep_duration_pandas.corr(pandas_data['Heart Rate'])
corr_sleep_daily_steps_pandas = sleep_duration_pandas.corr(pandas_data['Daily Steps'])

print(f"Pandas Correlation (Sleep Duration and Age): {corr_sleep_age_pandas}")
print(f"Pandas Correlation (Sleep Duration and Heart Rate): {corr_sleep_heart_rate_pandas}")
print(f"Pandas Correlation (Sleep Duration and Daily Steps): {corr_sleep_daily_steps_pandas}")

# Series.std() computes the sample standard deviation (ddof=1) by default
std_sleep_duration_pandas = sleep_duration_pandas.std()
print(f"Pandas Standard Deviation for Sleep Duration: {std_sleep_duration_pandas}")

end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas
# --- NumPy pipeline: load the CSV, then compute means, correlations and standard deviation ---
# perf_counter() is a monotonic, high-resolution clock suited to measuring elapsed time.
start_time_numpy = time.perf_counter()

# names=True makes genfromtxt read the field names from the first line it sees.
# skip_header must therefore NOT be set to 1 here: that would discard the header
# row first and turn the first data record into the field names, so lookups
# such as data['Age'] would fail.
# genfromtxt also sanitizes header names (spaces become underscores), which is
# why the fields below are addressed as 'Sleep_Duration', 'Heart_Rate', etc.
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

print("Columns in the NumPy dataset:", data.dtype.names)

age = data['Age'].astype(int)
sleep_duration = data['Sleep_Duration'].astype(float)
systolic_bp = data['Systolic_blood_pressure'].astype(float)
diastolic_bp = data['Diastolic_blood_pressure'].astype(float)
heart_rate = data['Heart_Rate'].astype(float)
daily_steps = data['Daily_Steps'].astype(float)

# Column means
mean_sleep_duration_numpy = np.mean(sleep_duration)
mean_systolic_bp_numpy = np.mean(systolic_bp)
mean_diastolic_bp_numpy = np.mean(diastolic_bp)
mean_heart_rate_numpy = np.mean(heart_rate)
mean_daily_steps_numpy = np.mean(daily_steps)

print(f"NumPy Mean Sleep Duration: {mean_sleep_duration_numpy}")
print(f"NumPy Mean Systolic BP: {mean_systolic_bp_numpy}")
print(f"NumPy Mean Diastolic BP: {mean_diastolic_bp_numpy}")
print(f"NumPy Mean Heart Rate: {mean_heart_rate_numpy}")
print(f"NumPy Mean Daily Steps: {mean_daily_steps_numpy}")

# np.corrcoef returns a 2 x 2 matrix for two inputs; [0, 1] is the cross-correlation entry
corr_sleep_age_numpy = np.corrcoef(sleep_duration, age)[0, 1]
corr_sleep_heart_rate_numpy = np.corrcoef(sleep_duration, heart_rate)[0, 1]
corr_sleep_daily_steps_numpy = np.corrcoef(sleep_duration, daily_steps)[0, 1]

print(f"NumPy Correlation (Sleep Duration and Age): {corr_sleep_age_numpy}")
print(f"NumPy Correlation (Sleep Duration and Heart Rate): {corr_sleep_heart_rate_numpy}")
print(f"NumPy Correlation (Sleep Duration and Daily Steps): {corr_sleep_daily_steps_numpy}")

# np.std defaults to the population formula (ddof=0) whereas pandas' Series.std()
# defaults to the sample formula (ddof=1). ddof=1 is passed so this value is
# directly comparable with the pandas result for the same column.
std_sleep_duration_numpy = np.std(sleep_duration, ddof=1)
print(f"NumPy Standard Deviation for Sleep Duration: {std_sleep_duration_numpy}")

end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy

# Report both elapsed times, then name the faster library.
for library_name, elapsed_seconds in (("Pandas", pandas_time), ("NumPy", numpy_time)):
    print(f"{library_name} operations time: {elapsed_seconds} seconds")

# A tie is reported as NumPy being faster (strict less-than comparison).
faster_library = "Pandas" if pandas_time < numpy_time else "NumPy"
print(f"{faster_library} was faster.")

AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Load the same CSV twice: once as a pandas DataFrame and once as a NumPy structured array.
pandas_data = pd.read_csv('sleep_health.csv')
# names=True already consumes the header row for the field names, so skip_header
# must not be set to 1 as well: that would skip the header and use the first
# data record as field names, breaking lookups such as data['Age'].
# genfromtxt replaces spaces in header names with underscores ('Sleep Duration' -> 'Sleep_Duration').
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# --- pandas/seaborn plots, timed as a group ---
# perf_counter() is a monotonic, high-resolution clock suited to measuring elapsed time.
# Note that the measured time includes rendering done by plt.show().
start_time_pandas = time.perf_counter()

# Distribution plots: (column name, number of histogram bins)
for column, bins in (('Age', 20), ('Sleep Duration', 20), ('Quality of Sleep', 10)):
    plt.figure(figsize=(10, 6))
    sns.histplot(pandas_data[column], bins=bins, kde=True)
    plt.title(f'Pandas: Distribution of {column}')
    plt.show()

# Sleep Duration grouped by Quality of Sleep
plt.figure(figsize=(10, 6))
sns.boxplot(x='Quality of Sleep', y='Sleep Duration', data=pandas_data)
plt.title('Pandas: Sleep Duration based on Quality of Sleep')
plt.show()

end_time_pandas = time.perf_counter()
pandas_plot_time = end_time_pandas - start_time_pandas

# --- NumPy/matplotlib plots, timed as a group ---
# perf_counter() is a monotonic, high-resolution clock suited to measuring elapsed time.
# Note that the measured time includes rendering done by plt.show().
start_time_numpy = time.perf_counter()

# Distribution plots: (structured-array field, label used in the title, number of bins)
for field, label, bins in (
    ('Age', 'Age', 20),
    ('Sleep_Duration', 'Sleep Duration', 20),
    ('Quality_of_Sleep', 'Quality of Sleep', 10),
):
    plt.figure(figsize=(10, 6))
    plt.hist(data[field], bins=bins, alpha=0.7)
    plt.title(f'NumPy: Distribution of {label}')
    plt.show()

# Sleep Duration grouped by Quality of Sleep: one array of durations per distinct quality value
quality_levels = np.unique(data['Quality_of_Sleep'])
quality_of_sleep_list = [data['Sleep_Duration'][data['Quality_of_Sleep'] == level] for level in quality_levels]
plt.figure(figsize=(10, 6))  # same size as the other figures in this cell
plt.boxplot(quality_of_sleep_list)
# Boxes are drawn at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` keyword, which newer Matplotlib releases deprecate in favour of `tick_labels`.
plt.xticks(range(1, len(quality_levels) + 1), [str(level) for level in quality_levels])
plt.title('NumPy: Sleep Duration based on Quality of Sleep')
plt.show()

end_time_numpy = time.perf_counter()
numpy_plot_time = end_time_numpy - start_time_numpy

# Report both plotting durations, then name the faster approach.
for library_name, elapsed_seconds in (("Pandas", pandas_plot_time), ("NumPy", numpy_plot_time)):
    print(f"{library_name} plotting time: {elapsed_seconds} seconds")

# A tie is reported as NumPy being faster (strict less-than comparison).
faster_library = "Pandas" if pandas_plot_time < numpy_plot_time else "NumPy"
print(f"{faster_library} plotting was faster")

AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pandas_data = pd.read_csv('sleep_health.csv')

# Correlation is only defined for numeric columns. Calling DataFrame.corr() on a
# frame that still contains text columns raises in pandas >= 2.0, so the numeric
# columns are selected explicitly first.
# NOTE(review): assumes the CSV has non-numeric columns (e.g. 'BMI Category') - confirm.
numeric_columns = pandas_data.select_dtypes(include='number')

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')
plt.title('Pandas: Correlation Matrix Heatmap')
plt.show()

# jointplot and pairplot each build a figure made of several axes. plt.title()
# would only label whichever sub-axes happens to be current, so a figure-level
# title (suptitle) is used instead; y > 1 lifts it clear of the top row of axes.
sns.jointplot(x='Age', y='Sleep Duration', data=pandas_data, kind='scatter')
plt.suptitle('Pandas: Joint Plot of Age and Sleep Duration', y=1.02)
plt.show()

sns.pairplot(pandas_data[['Age', 'Sleep Duration', 'Heart Rate', 'Daily Steps']])
plt.suptitle('Pandas: Pairplot of Variables', y=1.02)
plt.show()

# Number of records per BMI category
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='BMI Category', data=pandas_data, ax=count_ax)
count_ax.set_title('Pandas: Count Plot of BMI Category')
plt.show()

# Sleep Duration distribution for each Stress Level
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x='Stress Level', y='Sleep Duration', data=pandas_data, ax=violin_ax)
violin_ax.set_title('Pandas: Violin Plot of Sleep Duration by Stress Level')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# names=True already consumes the header row for the field names, so skip_header
# must not be set to 1 as well: that would skip the header and use the first
# data record as field names, breaking lookups such as data['Age'].
# genfromtxt replaces spaces in header names with underscores ('Sleep Duration' -> 'Sleep_Duration').
data = np.genfromtxt('sleep_health.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# Correlation heatmap for four numeric fields; each input row of corrcoef is one variable.
heatmap_fields = ['Age', 'Sleep_Duration', 'Heart_Rate', 'Daily_Steps']
heatmap_labels = ['Age', 'Sleep Duration', 'Heart Rate', 'Daily Steps']
correlation_matrix = np.corrcoef([data[field] for field in heatmap_fields])

tick_positions = range(len(correlation_matrix))
plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
plt.colorbar()
plt.title('NumPy: Correlation Matrix Heatmap')
# Same labels on both axes, in the order the fields were passed to corrcoef
plt.xticks(tick_positions, heatmap_labels)
plt.yticks(tick_positions, heatmap_labels)
plt.show()

# Single scatter plot: Age against Sleep Duration
plt.figure(figsize=(8, 6))
plt.scatter(data['Age'], data['Sleep_Duration'], alpha=0.6)
plt.xlabel('Age')
plt.ylabel('Sleep Duration')
plt.title('NumPy: Scatter Plot of Age and Sleep Duration')
plt.show()

# One scatter plot per unordered pair of fields (each pair is drawn once)
variables = ['Age', 'Sleep_Duration', 'Heart_Rate', 'Daily_Steps']
for i, x_field in enumerate(variables):
    for y_field in variables[i + 1:]:
        plt.figure(figsize=(8, 6))
        plt.scatter(data[x_field], data[y_field], alpha=0.6)
        plt.xlabel(x_field)
        plt.ylabel(y_field)
        plt.title(f'NumPy: {x_field} vs {y_field}')
        plt.show()

# Count how often each distinct BMI category occurs, then draw one bar per category.
unique_bmi, counts_bmi = np.unique(data['BMI_Category'], return_counts=True)
bmi_fig, bmi_ax = plt.subplots(figsize=(10, 6))
bmi_ax.bar(unique_bmi, counts_bmi)
bmi_ax.set_title('NumPy: Bar Plot of BMI Category')
bmi_ax.set_xlabel('BMI Category')
bmi_ax.set_ylabel('Count')
plt.show()

# Sleep Duration grouped by Stress Level: one array of durations per distinct stress value
stress_levels = np.unique(data['Stress_Level'])
box_data = [data['Sleep_Duration'][data['Stress_Level'] == sl] for sl in stress_levels]
plt.figure(figsize=(10, 6))  # explicit figure so the plot is sized like the others in this cell
plt.boxplot(box_data)
# Boxes are drawn at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` keyword, which newer Matplotlib releases deprecate in favour of `tick_labels`.
plt.xticks(range(1, len(stress_levels) + 1), [str(sl) for sl in stress_levels])
plt.title('NumPy: Box Plot of Sleep Duration by Stress Level')
plt.xlabel('Stress Level')
plt.ylabel('Sleep Duration')
plt.show()