In [1]:
import numpy as np
import pandas as pd

### A. Write down the MAB agent problem formulation in your own words.

- The MAB agent needs to decide which ad to display at each time step to maximize the total number of clicks received over a large number of time steps.
- The agent's objective is to maximize the total reward obtained by displaying ads to users.
- There are multiple ads to choose, each with an unknown click-through distribution that determines the probability of user clicking on the ad.
- The agent must balance the tradeoff between exploration and exploitation to maximize the total reward obtained, taking into account the cost of displaying ads that are less likely to be clicked.
- Various Algorithms are used to solve MAB problem
  - eplison-greedy
  - Upper-Confidence-Bound method


### B.Compute the total rewards after 2000-time steps using the ε-greedy action. for 
- ε=0.01, 
- ε= 0.3

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np


df = pd.read_csv("/content/drive/MyDrive/Artificial_Intelligence/Week-6/Ads_Optimisation.csv")
num_ads = 10
num_rounds = 2000

ads_selected = []
total_rewards = 0
num_selections = np.zeros(num_ads)
sum_rewards = np.zeros(num_ads)

epsilons = [0.01, 0.3]

for eps in epsilons:
    for n in range(num_rounds):

        if np.random.random() > eps:   
            ad = np.argmax(sum_rewards / (num_selections + 1))    # random ad reward selection
        else:
            ad = np.random.randint(num_ads)     # random selection

        ads_selected.append(ad)
        reward = df.values[n, ad]
        total_rewards += reward
        num_selections[ad] += 1
        sum_rewards[ad] += reward
        if n % 100 == 0:
          print(f"Selected ad for round {n}: {ad} (Epsilon={eps})")
    
    print("------------------------------------------------------------------")
    print(f"Total reward for epsilon={eps}: {total_rewards}")
    import statistics
    most_frequent = statistics.mode(ads_selected)
    print(most_frequent)
    max_val = max(ads_selected)
    print(f"Ad selected in epsilon method is:= {max_val}")
    print("_________________________________________________________________________________________________________________________________")
    


Selected ad for round 0: 0 (Epsilon=0.01)
Selected ad for round 100: 0 (Epsilon=0.01)
Selected ad for round 200: 0 (Epsilon=0.01)
Selected ad for round 300: 0 (Epsilon=0.01)
Selected ad for round 400: 0 (Epsilon=0.01)
Selected ad for round 500: 0 (Epsilon=0.01)
Selected ad for round 600: 0 (Epsilon=0.01)
Selected ad for round 700: 0 (Epsilon=0.01)
Selected ad for round 800: 0 (Epsilon=0.01)
Selected ad for round 900: 0 (Epsilon=0.01)
Selected ad for round 1000: 0 (Epsilon=0.01)
Selected ad for round 1100: 0 (Epsilon=0.01)
Selected ad for round 1200: 0 (Epsilon=0.01)
Selected ad for round 1300: 0 (Epsilon=0.01)
Selected ad for round 1400: 0 (Epsilon=0.01)
Selected ad for round 1500: 0 (Epsilon=0.01)
Selected ad for round 1600: 0 (Epsilon=0.01)
Selected ad for round 1700: 0 (Epsilon=0.01)
Selected ad for round 1800: 0 (Epsilon=0.01)
Selected ad for round 1900: 0 (Epsilon=0.01)
------------------------------------------------------------------
Total reward for epsilon=0.01: 337
0
Ad selec

### C. Compute the total rewards after 2000-time steps using the Upper-Confidence-Bound action method for c= 1.5

In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/Artificial_Intelligence/Week-6/Ads_Optimisation.csv")
num_ads = 10
num_rounds = 2000

ads_selected = []
total_rewards = 0
num_selections = np.zeros(num_ads)
sum_rewards = np.zeros(num_ads)

# exploration parameter c
c = 1.5

for n in range(num_rounds):
    # select ad using UCB action method
    ad = 0
    max_upper_bound = 0
    for i in range(num_ads):
        if num_selections[i] > 0:
            average_reward = sum_rewards[i] / num_selections[i]
            delta_i = np.sqrt(2 * np.log(n+1) / num_selections[i])
            upper_bound = average_reward + c * delta_i
        else:
            upper_bound = 1e400
        if upper_bound > max_upper_bound:
            max_upper_bound = upper_bound
            ad = i

    ads_selected.append(ad)

    reward = df.values[n, ad]
    total_rewards += reward
    num_selections[ad] += 1
    sum_rewards[ad] += reward

print(f"Total reward for c={c}: {total_rewards}")
import statistics

most_frequent = statistics.mode(ads_selected)
print(most_frequent)

print(f"Ad selected in UCB method is:= {max_val}")

Total reward for c=1.5: 290
4
Ad selected in UCB method is:= 9


### D. For all approaches, explain how the action value estimated compares to the optimal action

#### For the epsilon-greedy method:

- The estimated action values improve as the number of rounds increases
- As the value of epsilon decreases, the algorithm becomes more greedy and relies more on the estimated action values
- For epsilon=0.01, the algorithm selects the optimal action reward was 502, indicating that it may not have explored enough to estimate the true action values accurately
- For epsilon=0.3, the algorithm selects the optimal action reward was 946, indicating that it may have explored enough to estimate the true action values accurately

#### For the UCB method:

- The estimated action values improve as the number of rounds increases
- The algorithm tends to select the optimal action more often compared to the epsilon-greedy method
- For c=1.5, the algorithm selects the optimal action reward was 290, indicating that it may have explored enough to estimate the true action values accurately

### The UCB approach had a total reward of 290, which was lower than the total reward for the epsilon-greedy approach with ε=0.3, which was 729. This suggests that the epsilon-greedy approach with ε=0.3 may have performed better than the UCB approach.