# Andria Beridze, 01905049812, date: 19.10.2025

# Task 1: Python Data Structures & Control Flow

In [153]:
import random
import numpy as np

In [154]:
students = {}

random.seed(10)  # Setting seed for reproducibility

names = ["Andria", "Luka", "Nino",
         "Giorgi", "Sandro","Mariami", 
         "Ana", "Dato", "Elene", "Irakli"]

for i, name in enumerate(names, start=1):
    student_id = "S" + str(i).zfill(3) # filling student id with leading zeros
    students[student_id] = {
        "name": name,
        "scores": [random.randint(51, 100) for _ in range(4)], # generating 4 random scores between 51 and 100
        "attendance": random.randint(10, 30) # generating random attendance between 10 and 30
    }


print(students)
    


{'S001': {'name': 'Andria', 'scores': [87, 53, 78, 81], 'attendance': 28}, 'S002': {'name': 'Luka', 'scores': [51, 64, 80, 82], 'attendance': 18}, 'S003': {'name': 'Nino', 'scores': [92, 61, 53, 84], 'attendance': 25}, 'S004': {'name': 'Giorgi', 'scores': [71, 55, 66, 98], 'attendance': 21}, 'S005': {'name': 'Sandro', 'scores': [53, 77, 59, 89], 'attendance': 21}, 'S006': {'name': 'Mariami', 'scores': [75, 77, 69, 94], 'attendance': 18}, 'S007': {'name': 'Ana', 'scores': [80, 62, 94, 70], 'attendance': 21}, 'S008': {'name': 'Dato', 'scores': [59, 80, 100, 66], 'attendance': 24}, 'S009': {'name': 'Elene', 'scores': [90, 75, 53, 88], 'attendance': 10}, 'S010': {'name': 'Irakli', 'scores': [66, 59, 63, 70], 'attendance': 27}}


creating functions

In [155]:
def calculate_average(scores: list) -> float:
    return round((sum(scores) / len(scores)), 2)   # all scores divided by number of scores


def assign_grade(average: float) -> str: # A-F scale
    if average >= 90:
        return 'A'
    elif average >= 80:
        return 'B'
    elif average >= 70:
        return 'C'
    elif average >= 60:
        return 'D'
    else:
        return 'F'


def check_eligibility(student_dict: dict, total_classes: int) -> tuple:
    average = round(calculate_average(student_dict["scores"]), 2) # average score for the student
    attendance = round(student_dict["attendance"]/total_classes, 2)*100 # attendance percentage
    string = f"attendance: {attendance}%, average point: {average}"# string to explain the result
    if attendance >= 75 and average >= 60:
        return (f"Passed:{True}", f"reason: {string}")# eligible
    return (f"Passed: {False}", f"reason: {string}")# not eligible

def find_top_performers(students: dict, n: int) -> list:
    list_of_top_students = []
    for student, info in students.items():
        tuple = (student, calculate_average(info["scores"]))
        list_of_top_students.append(tuple)
    list_of_top_students.sort(key=lambda x: x[1], reverse=True) # sorting by average score in descending order
    return list_of_top_students[:n] # returning top n students

def generate_report(students: dict) -> dict:
    # defining variables for the report data
    total_students = len(students)
    passed_count = 0
    failed_count = 0
    highest_score = -1
    lowest_score = 101
    class_average = 0
    average_attendance_rate = 0
    
    #going through each student to collect data
    for student, info in students.items():
        average_score = calculate_average(info["scores"]) # average score for the student
        class_average += average_score # summing up for class average
        attendance_rate = round(info["attendance"]/30, 2)*100 # attendance rate for the student
        average_attendance_rate += attendance_rate # summing up for average attendance rate
        highest_score = max(average_score, highest_score)   # finding highest score
        lowest_score = min(average_score, lowest_score)   # counting passed and failed students
        if average_score >= 60 and attendance_rate >= 75:   # checking eligibility
            passed_count += 1
        else:
            failed_count += 1
    
    class_average = round(class_average / total_students, 2) # calculating class average
    average_attendance_rate = round(average_attendance_rate / total_students, 2) # calculating average attendance rate
    
    return {
        "total_students": total_students,
        "passed": passed_count,
        "failed": failed_count,
        "highest_score": highest_score,
        "lowest_score": lowest_score,
        "class_average": class_average,
        "average_attendance_rate": average_attendance_rate
    }


using defined functions for analysis

In [156]:
report = generate_report(students)

print("=== Course statistics: ===")
for key, value in report.items():
    print(f"{key}: {value}")
    
print("\n=== top 5 performers: ===")
top_performers = find_top_performers(students, 5)
i = 1
for student, avg in top_performers:
    print(f"{i}. {student}: {avg}")
    i += 1

print("\n=== students who failed: ===")
for student, info in students.items():
    if check_eligibility(info, 30)[0] == "Passed: False":
        reason = check_eligibility(info, 30)[1]
        print(f"{student} - {reason}")
        
print("\n=== grade distribution: ===")
my_grades = {'A': 0, 'B': 0, 'C': 0, 'D': 0, 'F': 0}
for student, info in students.items():
    average_score = calculate_average(info["scores"])
    grade = assign_grade(average_score)
    my_grades[grade] += 1
    
for grade, count in my_grades.items():
    print(f"Grade {grade}: {count} students")

=== Course statistics: ===
total_students: 10
passed: 4
failed: 6
highest_score: 78.75
lowest_score: 64.5
class_average: 73.1
average_attendance_rate: 70.9

=== top 5 performers: ===
1. S006: 78.75
2. S007: 76.5
3. S009: 76.5
4. S008: 76.25
5. S001: 74.75

=== students who failed: ===
S002 - reason: attendance: 60.0%, average point: 69.25
S004 - reason: attendance: 70.0%, average point: 72.5
S005 - reason: attendance: 70.0%, average point: 69.5
S006 - reason: attendance: 60.0%, average point: 78.75
S007 - reason: attendance: 70.0%, average point: 76.5
S009 - reason: attendance: 33.0%, average point: 76.5

=== grade distribution: ===
Grade A: 0 students
Grade B: 0 students
Grade C: 7 students
Grade D: 3 students
Grade F: 0 students


# Task 2: NumPy Arrays & Operations

In [157]:
np.random.seed(42)


# creating temperature array
temperatures = np.random.uniform(-10.0, 40.0, size=(365, 5))# Generating random temperatures for a year
temperatures = np.round(temperatures, 2) # rounding to 2 decimal places
print(f"shape: {temperatures.shape}")
print(f"dimensions: {temperatures.ndim}")
print(f"data type: {temperatures.dtype}")
print(f"size: {temperatures.size}")

print(temperatures)

# creating sales array
sales = np.random.randint(1000, 5001, size=(12, 4)) # Generating random sales data for a year

#identity matrix
identity_matrix = np.eye(5)

# array of evenly spaced numbers
apart = np.linspace(0, 100, num= 50)
apart = np.round(apart, 2)



shape: (365, 5)
dimensions: 2
data type: float64
size: 1825
[[ 8.73 37.54 26.6  19.93 -2.2 ]
 [-2.2  -7.1  33.31 20.06 25.4 ]
 [-8.97 38.5  31.62  0.62 -0.91]
 ...
 [23.02 28.22  3.25 -8.95 -5.89]
 [38.39  4.77 28.46 21.23  9.1 ]
 [ 0.28 -3.93 20.75 28.73 22.2 ]]


### 1. Basic Slicing:


In [158]:
# january data
january_data = temperatures[0:31, :]  # first 31 days

# summer data
summer_data = temperatures[151:243, :]  # June  to August 

# weekends data
weekends_data = temperatures[5::7, :]  # every 7th day starting from day 5

print(f"=== january data ===")
print(january_data)
print(f"\n=== summer data")
print(summer_data)
print(f"\n===weekends data")
print(weekends_data)


=== january data ===
[[ 8.730e+00  3.754e+01  2.660e+01  1.993e+01 -2.200e+00]
 [-2.200e+00 -7.100e+00  3.331e+01  2.006e+01  2.540e+01]
 [-8.970e+00  3.850e+01  3.162e+01  6.200e-01 -9.100e-01]
 [-8.300e-01  5.210e+00  1.624e+01  1.160e+01  4.560e+00]
 [ 2.059e+01 -3.030e+00  4.610e+00  8.320e+00  1.280e+01]
 [ 2.926e+01 -2.000e-02  1.571e+01  1.962e+01 -7.680e+00]
 [ 2.038e+01 -1.470e+00 -6.750e+00  3.744e+01  3.828e+01]
 [ 3.042e+01  5.230e+00 -5.120e+00  2.421e+01  1.201e+01]
 [-3.900e+00  1.476e+01 -8.280e+00  3.547e+01  2.940e+00]
 [ 2.313e+01  5.590e+00  1.600e+01  1.734e+01 -7.600e-01]
 [ 3.848e+01  2.876e+01  3.697e+01  3.474e+01  1.989e+01]
 [ 3.609e+01 -5.580e+00 -2.000e-01 -7.740e+00  6.270e+00]
 [ 9.430e+00  3.570e+00  3.144e+01  7.840e+00  4.050e+00]
 [ 1.713e+01 -2.950e+00  3.011e+01 -6.270e+00  3.934e+01]
 [ 2.861e+01 -6.000e-02 -9.720e+00  3.077e+01  2.534e+01]
 [ 2.645e+01  2.856e+01 -6.300e+00  7.920e+00 -4.210e+00]
 [ 3.316e+01  2.116e+01  6.540e+00 -6.820e+00  5.55

### 2. Boolean Indexing:

In [159]:
# temperatures above 35 degrees
high_temp = temperatures[temperatures > 35.0]
# freezing temperatures for each city
freezing_temps = {}
for city in range(temperatures.shape[1]):
    freezing_days = temperatures[temperatures[:, city] < 0.0, city]
    freezing_temps[f"City_{city+1}"] = freezing_days
    
# mask for comfortable days
mask = (temperatures > 15.0) & (temperatures < 25.0)

# setting temperatures below -5 to -5.0
temperatures[temperatures < -5.0] = -5.0


print(f"high temperature: {high_temp}")
print(f"freezing temperature: {freezing_temps}")





high temperature: [37.54 38.5  37.44 38.28 35.47 38.48 36.97 36.09 39.34 35.38 36.48 37.15
 38.59 38.12 35.41 39.28 36.84 36.23 35.02 38.65 37.02 37.7  35.74 36.42
 38.33 38.18 36.81 39.5  35.66 38.79 38.13 37.74 37.01 37.07 38.06 35.27
 39.53 36.54 35.13 35.27 37.5  37.53 39.33 37.36 39.3  35.32 38.11 38.16
 38.49 39.65 38.47 35.06 37.52 38.72 39.31 35.55 37.49 36.64 39.36 38.5
 39.99 39.83 37.24 37.7  35.21 37.48 36.33 39.   36.04 39.33 39.12 38.47
 36.47 39.5  36.48 38.25 36.92 35.32 36.99 35.42 37.36 36.61 36.12 36.72
 36.28 39.24 36.04 35.99 35.74 38.05 39.84 37.31 37.42 39.22 35.71 35.72
 36.84 35.87 37.51 39.38 37.2  36.26 35.77 36.22 38.55 37.21 36.95 39.9
 39.2  39.39 37.5  37.24 39.56 37.14 38.46 36.39 39.75 37.78 39.06 38.15
 36.6  36.86 38.67 38.44 39.86 37.54 37.18 39.2  38.41 36.04 39.33 36.81
 38.79 39.82 38.74 39.38 35.04 35.62 35.45 38.57 38.47 38.49 36.83 36.69
 39.33 38.26 37.59 38.5  37.23 36.55 39.97 38.86 37.78 36.35 36.72 38.15
 37.57 39.39 39.56 39.71 37.88 36.3

### 3. Fancy Indexing:


In [160]:
# extracting specific days 
specific_days = temperatures[[0, 100, 200, 300, 364]]

# quarterly averages 
days_per_quarter = 365 // 4
quarterly_averages = []

for i in range(4):
    start_day = i * days_per_quarter
    if i == 3:  # 4 * 91 = 364, so we take till the end of the year
        end_day = 365
    else:
        end_day = start_day + days_per_quarter
    quarter_data = temperatures[start_day:end_day, :]
    quarter_average = np.round(np.mean(quarter_data, axis=0), 2) # calculating each column average
    quarterly_averages.append(quarter_average)
    
# annual average temperatures per city
annual_averages = np.round(np.mean(temperatures, axis=0), 2) # calculating each column average

annual_averages = np.sort(annual_averages)[::-1] # sorting in descending order


print(f"annual averages: {annual_averages}")

annual averages: [15.47 15.42 15.28 14.94 14.49]


### 1. Temperature Analysis:

In [161]:
# mean temperature for each city
annual_averages = np.round(np.mean(temperatures, axis=0), 2) 
# median temperature for each city
annual_medians = np.round(np.median(temperatures, axis=0), 2) 
# standard deviation of temperatures for each city
annual_std_deviation = np.round(np.std(temperatures, axis=0), 2) 
# hottest day of the year
day, city = np.unravel_index(np.argmax(temperatures), temperatures.shape) # getting the index of a maximum temperature
hottest_temp = (temperatures[day, city], day) # temperature, day
# coldest day of the year
day, city = np.unravel_index(np.argmin(temperatures), temperatures.shape)
coldest_temp = (temperatures[day, city], day) # temperature, day
# temperature range for each city
temperature_ranges = np.round(np.ptp(temperatures, axis=0), 2) # ptp calculates max - min for each column
# correlation between cities
correlation_between_cites = np.round(np.corrcoef(temperatures, rowvar=False), 2) 



### 2. Sales Analysis:

In [162]:
# total sales per category
sales_per_category = np.sum(sales, axis=0)
# average monthly sales
average_monthly_sales_per_category = np.round(np.mean(sales, axis=0),2)
# best performing month
best_month = np.argmax(np.sum(sales, axis=1)) # return the index of best month
# best performing category
best_category = np.argmax(np.sum(sales, axis=0)) # return the index of best performing category



### 3. Advanced Computations:

this is the link of where i found moving average formula 

https://www.geeksforgeeks.org/python/how-to-calculate-moving-averages-in-python/

In [163]:
# calculating 7 day moving average

window_size = 7
moving_averages = []


for i in range(temperatures.shape[0] - window_size + 1):
    window = temperatures[i:i+window_size, :]       # select 7 consecutive days for all cities
    window_average = np.round(np.mean(window, axis=0), 2)  # mean per city
    moving_averages.append(window_average)
    


this is the link where i found z score

https://medium.com/@whyamit101/understanding-z-score-with-numpy-bc8b23f81639

In [164]:
mean_of_temperatures = np.mean(temperatures, axis=0)

std = np.std(temperatures,axis=0) # standard deviation

# subtract the mean of each city from each temperature and divide by std from each city
z_score = np.round((temperatures - mean_of_temperatures)/std, 2) 



this is the link of where i found the percentile 

https://www.geeksforgeeks.org/python/numpy-percentile-in-python/

In [165]:
percentiles = [25,50,75] 

city_percentiles = np.percentile(temperatures, percentiles, axis=0) # percentiles per city




# Task 3: Applied Data Analysis 

### Part A: Data Generation & Preparation

In [166]:
# creating an array of random values

dataset = np.empty((100,90,4))

"""
filing in each category:
daily_steps (2000-15000),
calories (1500-3500),
active_minutes (20-180),
avg_heart_rate (60-120)
"""
dataset[:,:,0] = np.random.randint(2000, 15000, size=(100,90))
dataset[:,:,1] = np.random.randint(1500, 3500, size=(100,90))
dataset[:,:,2] = np.random.randint(20, 180, size= (100, 90))
dataset[:,:,3] = np.random.randint(60, 120, size = (100,90))




### Introduce realistic data issues:

In [167]:
shape = np.shape(dataset)
all_elements  = np.size(dataset)
percent_of_nans = 0.05
number_of_nans = int(all_elements * percent_of_nans)

# getting random indices 

nan_indices = np.random.choice(all_elements, number_of_nans, replace=False)

# converting back to i, j, k coordinates

nan_indices = np.unravel_index(nan_indices, shape) 

# assign nans

dataset[nan_indices] = np.nan

# inserting unrealistic data

percent_of_outliers = 0.02
number_of_outliers = int(all_elements*percent_of_outliers)

# getting random indices 
outliers_indices = np.random.choice(all_elements, number_of_outliers, replace=False)

# converting back to i, j, k coordinates
outliers_indices = np.unravel_index(outliers_indices,shape)

# cant have negative values
dataset[outliers_indices] = np.random.randint(-100000, -1, size = number_of_outliers)

# creating meta data

num_users = 100

metadata = np.zeros((num_users, 3))  

# fill metadata
metadata[:, 0] = np.arange(1, num_users + 1)  # user_id
metadata[:, 1] = np.random.randint(18, 71, size=num_users)  # age: 18–70
metadata[:, 2] = np.random.randint(0, 2, size=num_users)    # gender: 0 or 1






### Part B: Data Cleaning & Validation

In [168]:
def handle_missing(data):
    feature_means = np.nanmean(data, axis=(0, 1)) # calculate column mean without nans
    
    # find nan indices
    nan_mask = np.isnan(data)
    
    # replace nan values
    data[nan_mask] = np.take(feature_means, np.where(nan_mask)[2])
    
    return data

def remove_outliers(data, metric_index):
   
    # select the metric 
    metric_data = data[:, :, metric_index]
    
    # compute Q1, Q3, and IQR 
    Q1 = np.percentile(metric_data, 25)
    Q3 = np.percentile(metric_data, 75)
    IQR = Q3 - Q1
    
    # define lower and upper bounds for non-outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # compute median for replacement
    median = np.median(metric_data)
    
    # find outlier positions
    outlier_mask = (metric_data < lower_bound) | (metric_data > upper_bound)
    
    # replace outliers with median
    metric_data[outlier_mask] = median
    
    # put cleaned metric back into dataset
    data[:, :, metric_index] = metric_data
    
    return data


for i in range(dataset.shape[2]):
    dataset= remove_outliers(dataset,i)
    
dataset = handle_missing(dataset)

if np.isnan(dataset).any():
    print("there are still nan values")
else:
    print("no nan values remaining")
    
    



no nan values remaining


### Part C: Comprehensive Analysis

#### 1. User Behavior Patterns:

In [169]:
# average metrics per user over 90 days
user_avg = np.mean(dataset, axis=1) 

print(f"average metrics per user over 90 days: {user_avg}")

# top 10 most active users (by combined z-score of all metrics)
mean_across_users = np.mean(dataset, axis=0)  # mean
std_across_users = np.std(dataset, axis=0)      # std

z_scores = (dataset - mean_across_users) / std_across_users

# combine Z-Scores
combined_z_score = np.sum(z_scores, axis=1) 


# find Top 10 indices
top_10_indices = np.argsort(combined_z_score)[-10:][::-1] # top 10 in decs order

# find users with most consistent activity (lowest std deviation of steps)
steps_data = dataset[:, :, 0] 

# calculate the standard deviation of steps across the 90 days for each user
std_dev_steps_per_user = np.std(steps_data, axis=1) 
print(f"# calculate the standard deviation of steps across the 90 days for each user: {std_dev_steps_per_user}")
# find Top 10 Most Consistent 
most_consistent_indices = np.argsort(std_dev_steps_per_user)[:10]

# classify users into activity levels based on average steps
average_steps = dataset[:, 0] 

# calculate the 25th and 75th percentiles
p25, p75 = np.percentile(average_steps, [25, 75])

# define conditions and choices for classification using np.select
conditions = [
    average_steps < p25,
    (average_steps >= p25) & (average_steps <= p75),
    average_steps > p75
]
choices = ['Low', 'Medium', 'High']

# apply the classification
activity_levels = np.select(conditions, choices, default='Error')

print(f"activity levels: {activity_levels}")




average metrics per user over 90 days: [[ 7.36514814e+03  2.43596550e+03 -1.92026900e+03 -6.53656449e+02]
 [ 8.30524814e+03  2.42489186e+03  4.83931916e+01 -1.84958678e+03]
 [ 6.51716666e+03  7.53952068e+02 -4.65539337e+03  4.18213285e+01]
 [ 7.50750741e+03  9.97049743e+02 -2.84933510e+02 -2.44055945e+03]
 [ 7.49824074e+03  1.51669535e+03 -1.80851559e+03 -2.53604789e+02]
 [ 7.75673333e+03  1.28949303e+03 -3.37942293e+02 -1.76291590e+03]
 [ 7.52579629e+03  9.24383076e+02 -1.86850233e+03 -1.48490779e+03]
 [ 7.51218889e+03  2.51552985e+03 -9.18033510e+02 -8.12776564e+02]
 [ 6.55123333e+03 -2.08774804e+02 -1.71805125e+03  2.04042150e+01]
 [ 6.92382963e+03  2.11881990e+03 -1.23505788e+03 -2.30161558e+02]
 [ 8.45564815e+03  1.31078656e+03 -9.66317919e+02 -1.26613423e+03]
 [ 6.93284444e+03  8.90095350e+02 -1.01824462e+03 -1.27171111e+03]
 [ 6.36612592e+03  1.87318656e+03 -1.01067563e+03 -3.07274457e+02]
 [ 8.08132592e+03  6.00660854e+02 -2.71957348e+03 -4.86953448e+02]
 [ 7.77522222e+03  6.15

### 2. Temporal Trends:

In [170]:
window_size = 7
num_days = dataset.shape[1]
num_metrics = dataset.shape[2]

# population-wide metrics: sum/mean across users
population_daily_avg = np.mean(dataset, axis=0)  # shape: (90, 4)

# 7 day rolling average
rolling_avg = np.empty((num_days - window_size + 1, num_metrics))

for i in range(num_days - window_size + 1):
    rolling_avg[i] = np.mean(population_daily_avg[i:i+window_size], axis=0)

print("7-day rolling average shape:", rolling_avg.shape)

# initialize array: 7 days, metrics
weekly_pattern = np.empty((7, num_metrics))

for day in range(7):
    weekly_pattern[day] = np.mean(population_daily_avg[day::7], axis=0)

print("Weekly pattern (Mon-Sun):\n", weekly_pattern)


# daily difference in total population activity per metric
daily_diff = np.diff(population_daily_avg, axis=0)  # shape: (89, 4)

# positive mean difference -> increasing trend, negative -> decreasing
trend = np.mean(daily_diff, axis=0)
print("Trend per metric (positive=increasing):", trend)


month_days = 30
num_months = num_days // month_days
monthly_avg = np.empty((num_months, num_metrics))

for m in range(num_months):
    start = m * month_days
    end = start + month_days
    monthly_avg[m] = np.mean(population_daily_avg[start:end], axis=0)

# compute growth rates: (current - previous) / previous * 100
growth_rates = (monthly_avg[1:] - monthly_avg[:-1]) / monthly_avg[:-1] * 100

print("Month-over-month growth rates (%):\n", growth_rates)


7-day rolling average shape: (84, 4)
Weekly pattern (Mon-Sun):
 [[ 7628.06384273  1527.36209428  -879.11968807  -954.61822242]
 [ 7173.78307242  1104.62707649 -1076.6581496  -1152.21163546]
 [ 7734.39204738  1401.39209428 -1441.34721917 -1114.70510578]
 [ 6857.40461178  1489.64914274  -765.60676008  -503.78930561]
 [ 7486.33614934  1046.35715697  -828.38659888 -1086.64965929]
 [ 7051.37102162  1305.23270255 -1117.44599085  -920.3681827 ]
 [ 7632.70277361  1465.87669703 -1140.41247149  -991.98852945]]
Trend per metric (positive=increasing): [ -0.55734084   3.85187485 -20.36400143 -22.57567868]
Month-over-month growth rates (%):
 [[ -0.08304088  15.72924555 -38.77139876  35.14798512]
 [  1.05374119  -6.771394    31.12410001  19.39012731]]


### 3. Correlations & Insights:

In [171]:
# average per user over 90 days
user_avg = np.mean(dataset, axis=1)  # shape: (100, 4)

# correlation matrix (4×4) between metrics
corr_matrix = np.corrcoef(user_avg, rowvar=False)

print("Correlation matrix between metrics:\n", np.round(corr_matrix, 2))


# age column from metadata
ages = metadata[:, 1]  

# average steps per user
avg_steps = user_avg[:, 0]

# correlation between age and average steps
age_activity_corr = np.round(np.corrcoef(ages, avg_steps)[0, 1],2)
print("Correlation between age and steps:", age_activity_corr)


genders = metadata[:, 2]

# mean activity per gender (average steps)
male_avg = np.mean(avg_steps[genders == 1])
female_avg = np.mean(avg_steps[genders == 0])

print(f"Male average steps: {male_avg:.2f}")
print(f"Female average steps: {female_avg:.2f}")


weights = np.array([0.4, 0.2, 0.3, 0.1])

# normalize heart rate so higher is better: invert it
norm_hr = np.max(user_avg[:, 3]) - user_avg[:, 3]

# stack metrics with adjusted HR
adjusted_metrics = user_avg.copy()
adjusted_metrics[:, 3] = norm_hr

# health score per user
health_score = np.dot(adjusted_metrics, weights)

# top 5 healthiest users
top5_health = np.argsort(health_score)[-5:][::-1]

print("Top 5 healthiest users:", top5_health)


Correlation matrix between metrics:
 [[ 1.   -0.03  0.11  0.11]
 [-0.03  1.    0.01  0.07]
 [ 0.11  0.01  1.   -0.04]
 [ 0.11  0.07 -0.04  1.  ]]
Correlation between age and steps: 0.18
Male average steps: 7340.03
Female average steps: 7381.64
Top 5 healthiest users: [ 1 15 90 61 19]


### 4. Goal Achievement:

In [172]:
# goal thresholds
goals = np.array([8000, 2000, 60])  # steps, calories, active_minutes

# select only relevant metrics
user_metrics = dataset[:, :, :3]  # shape: (100, 90, 3)

# boolean array: True if goal met, False otherwise
goal_achieved = user_metrics >= goals  
# shape: (100, 90, 3)

# fraction of days each goal is achieved per user
# axis=1 -> average across days
per_user_goal_rate = np.mean(goal_achieved, axis=1)  # shape: (100, 3)

# convert to percentage
per_user_goal_rate_pct = per_user_goal_rate * 100

print("First 5 users' goal achievement rates (%)\n", per_user_goal_rate_pct[:5])

# all goals above 80% achievement
consistent_users_mask = np.all(per_user_goal_rate >= 0.8, axis=1)

# indices of users meeting all goals
consistent_users = np.where(consistent_users_mask)[0]

print("Users consistently meeting all goals (>80% days):", consistent_users)


First 5 users' goal achievement rates (%)
 [[48.88888889 70.         68.88888889]
 [45.55555556 67.77777778 75.55555556]
 [44.44444444 73.33333333 67.77777778]
 [55.55555556 64.44444444 71.11111111]
 [50.         77.77777778 67.77777778]]
Users consistently meeting all goals (>80% days): []


## Part D: Report & Insights

### 1. Executive summary
- Key findings:
  - Most users meet daily goals only sometimes; only a few meet them regularly.
  - Steps and active minutes rise and fall together; calories follow steps but are noisier.
  - Weekly patterns are clear: activity often differs between weekdays and weekends.
  - Older users tend to have slightly fewer steps on average.
- Most surprising discovery:
  - No user hit all goals on more than 80% of days. they lack consistency 

### 2. Detailed analysis
- User behavior patterns:
  - Users fall into three groups by activity: Low, Medium, High. High users not only take more steps but also log more active minutes and calories.
  - Being active and being consistent are not the same. Some users walk a lot but vary day to day. Others walk less but keep a steady routine.
  - A small set of users score well across all metrics. They look like regularly active users, not one-time spikes.

- Time patterns and what they mean:
  - The week shows a repeatable pattern: some days are higher (usually weekdays), some lower (usually weekends). This suggests work days have lower activity and weekdays have higher activity.
  - A 7-day moving average gets reed of the daily noise and shows short-term trends. If this average rises for several weeks, engagement is improving. If it falls, we may be losing users or facing season effects.
  - Monthly averages help spot bigger changes and link them to events such as events and holidays.

- Metric links:
  - Steps and active minutes are strongly linked — both measure movement.
  - Calories link to steps but are more variable because of body differences and measurement error.
  - Heart rate needs to be adjusted (lower is often better) before combining with other metrics.

### 3. Recommendations
- For users:
  - Set small weekly goals (for example, be active 5 of 7 days).
  - Add short term goals for better results (helps with motivation).

- For the company (product ideas):
  - Use adaptive goals that adjust to recent user trends.
  - Add streaks, short challenges, and easy sharing to keep users engaged.
  - Show simple weekly insight cards that explain user trends and give tips.

- Marketing ideas:
  - Focus on middle-age, medium-activity users.
  - Use the highly consistent users as promoters.

### 4. Limitations
- Assumptions made:
  - Missing data was filled using simple methods.

- Extra data that would help:
  - Location, job type, sleep, device wear time, and notes on user goals would improve insights.

- Possible biases:
  - The synthetic process and how we fixed missing or extreme values can shape results.



