In [16]:
import numpy as np
import pandas as pd
import altair as alt

def generate_dataset(num_lists=30, min_len=5, max_len=35, min_val=1, max_val=20, trend_start=4, trend_end=19, seed=None, noise_std=2):
    """Generate lists of integers whose per-list averages follow a noisy linear trend.

    A linear ramp from ``trend_start`` to ``trend_end`` is built with one point
    per list, Gaussian noise is added, and the result is clipped to
    ``[min_val, max_val]``. For every target average a list of random integers
    is drawn and rescaled multiplicatively so its mean lands near the target;
    the rescaled values are rounded and clipped back to ``[min_val, max_val]``,
    so the final mean is only approximately equal to the target.

    Args:
        num_lists: Number of lists to generate.
        min_len: Smallest possible list length (inclusive).
        max_len: Largest possible list length (inclusive).
        min_val: Smallest allowed value (inclusive).
        max_val: Largest allowed value (inclusive).
        trend_start: Target average of the first list before noise.
        trend_end: Target average of the last list before noise.
        seed: Optional seed for a private ``np.random.RandomState``. When
            ``None`` (the default) the global ``np.random`` generator is used,
            exactly as before this parameter existed.
        noise_std: Standard deviation of the Gaussian noise added to the trend.

    Returns:
        A list of ``num_lists`` lists of Python ints.

    Note:
        The rescaling multiplies by ``avg / mean``, so it is intended for
        non-negative value ranges; a negative mean would flip the sign of the
        scaled values.
    """
    # A dedicated RandomState makes a seeded call reproducible without touching
    # NumPy's global state. Without a seed we keep drawing from the global
    # generator so existing callers (including those that call np.random.seed
    # beforehand) see unchanged behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Noisy linear trend, kept inside the allowed value range
    trend = np.linspace(trend_start, trend_end, num=num_lists)
    noise = rng.normal(0, noise_std, num_lists)
    target_means = np.clip(trend + noise, min_val, max_val)

    dataset = []
    for avg in target_means:
        list_len = rng.randint(min_len, max_len + 1)
        int_list = rng.randint(min_val, max_val + 1, size=list_len)

        # The multiplicative rescale is undefined for an empty list or a
        # zero-mean draw (0/0 -> NaN -> garbage after the int cast). In that
        # case fall back to a constant list at the target average.
        base_mean = int_list.mean() if int_list.size else 0
        if base_mean == 0:
            scaled = np.full(int_list.size, avg)
        else:
            scaled = int_list * (avg / base_mean)

        adjusted_list = np.clip(np.round(scaled), min_val, max_val).astype(int)
        dataset.append(adjusted_list.tolist())

    return dataset

# Example usage
# Seed NumPy's global generator first: generate_dataset() draws from np.random,
# so without a seed every Restart Kernel -> Run All would produce different data
# (and a different chart downstream).
np.random.seed(42)
my_dataset = generate_dataset()
print(my_dataset[:5])  # Prints the first 5 lists of the generated dataset
len(my_dataset)

[[3, 1, 2, 8, 7, 9, 4, 1, 8, 9, 8, 8, 6, 2, 7, 7, 6, 4, 5, 5], [7, 8, 1, 4, 1, 1, 4, 5, 4, 6, 5, 4, 2, 2, 6, 8, 4, 2, 4, 2, 1, 6, 8, 8, 2, 6, 4, 5, 2, 4, 8, 4, 5], [5, 7, 4, 7, 3, 2, 6, 2, 6, 3, 1, 4, 2, 2, 7, 8, 2, 2, 7], [8, 6, 3, 7, 5, 1, 3, 1, 1, 7, 2, 3, 2, 8, 1, 6, 5, 9, 7, 6, 6, 3, 7, 5, 2, 3, 9, 5, 9, 7, 8, 3], [11, 2, 7, 10, 5, 7, 6, 9, 11, 8, 6, 8, 1, 6, 11, 10, 5, 7, 5, 3, 5, 10, 1, 6, 5, 4, 3, 1, 4, 2, 6, 11, 6, 9, 3]]


30

In [33]:
import altair as alt
import pandas as pd

# Flatten the nested lists into long-form records: one row per (day, value),
# with days numbered from 1.
data = [
    {"Day": day_number, "Value": observation}
    for day_number, observations in enumerate(my_dataset, start=1)
    for observation in observations
]
data_df = pd.DataFrame(data)

# Mean value per day, returned as a flat two-column frame (Day, Value)
daily_avg = data_df.groupby('Day', as_index=False)['Value'].mean()

# Axis encodings: ordinal day on x, quantitative mean on y
x_encoding = alt.X(
    'Day:O',
    title='Day',
    axis=alt.Axis(labelFontSize=12, titleFontSize=14),
)
y_encoding = alt.Y(
    'Value:Q',
    title='Discussion Complexity',
    axis=alt.Axis(labelFontSize=12, titleFontSize=14),
)

# Line chart of the daily average, with title and dashed light-gray gridlines
base_chart = alt.Chart(daily_avg).mark_line(color='#8023fa', strokeWidth=3)
line = (
    base_chart
    .encode(x=x_encoding, y=y_encoding, tooltip=['Day', 'Value'])
    .properties(
        title='Average Discussion Complexity Over A Month',
        width=600,
        height=400,
    )
    .configure_title(fontSize=16, font='Arial', anchor='middle', color='black')
    .configure_axis(grid=True, gridColor='lightgray', gridDash=[5, 5], tickCount=5)
)

# Render inline, then write the same chart to disk
line.display()
line.save('average_discussion_complexity.png')
