# Weighted Average Rating Calculation

# Business Problem

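In this section, we compute a weighted average rating from course reviews, weighting each review by how recent it is and by how far the reviewer has progressed through the course.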

# Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide pandas display settings: no column/row truncation, a 500-char
# wide console, and floats rendered with two decimals.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.width", 500),
    ("display.float_format", lambda x: "%.2f" % x),
):
    pd.set_option(option_name, option_value)

# Import Dataset

In [None]:
# Load the course reviews; the relative path is resolved against the current
# working directory.
df = pd.read_csv("course_reviews.csv")
# Preview the first rows of the loaded data.
df.head()

# General Information About the Dataset

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The report is written to stdout as a sequence of titled sections: the
    first and last ``head`` rows, the shape, the column dtypes, the per-column
    null counts, and a transposed ``describe()`` table at a fixed list of
    percentiles.

    Args:
        dataframe: pandas DataFrame to inspect.
        head: Number of rows shown in the "Head" and "Tail" sections.

    Returns:
        None. The function only prints.
    """
    rule = 20 * "#"
    percentiles = [0, 0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                   0.60, 0.70, 0.80, 0.90, 0.95, 0.99, 1]

    # Builders are lazy so each section is computed only after its banner has
    # been printed, exactly as in a straight-line sequence of prints.
    sections = (
        ("Head", lambda: dataframe.head(head)),
        ("Tail", lambda: dataframe.tail(head)),
        ("Shape", lambda: dataframe.shape),
        ("Type", lambda: dataframe.dtypes),
        ("NA", lambda: dataframe.isnull().sum()),
        ("Quartiles", lambda: dataframe.describe(percentiles).T),
    )
    for title, build in sections:
        print(rule, title, rule)
        print(build())

In [None]:
# Print the overview report (head, tail, shape, dtypes, nulls, percentiles).
check_df(df)

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# For each distinct "Questions Asked" value: number of reviews and mean rating.
df.groupby("Questions Asked").agg({"Questions Asked":"count",
                                 "Rating": "mean"})

In [None]:
# Preview the first rows again.
df.head()

# Data Preprocessing

In [None]:
# Parse Timestamp into datetimes so it can be subtracted from a reference date.
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

In [None]:
# Most recent review timestamp in the data.
df["Timestamp"].max()

In [None]:
# Fixed reference date against which each review's age in days is measured.
current_date = pd.to_datetime("2021-02-10 00:00:00")

In [None]:
# Age of each review in whole days relative to current_date.
df["days"] = (current_date - df["Timestamp"]).dt.days

In [None]:
# Check the newly added "days" column.
df.head()

# Time based average

In [None]:
# Recency-weighted rating: mean rating of each age bucket (<=30, (30, 90],
# (90, 180], >180 days) scaled by 28/26/24/22 percent and summed.
(
    df.loc[df["days"] <= 30, "Rating"].mean() * 28 / 100
    + df.loc[(df["days"] > 30) & (df["days"] <= 90), "Rating"].mean() * 26 / 100
    + df.loc[(df["days"] > 90) & (df["days"] <= 180), "Rating"].mean() * 24 / 100
    + df.loc[df["days"] > 180, "Rating"].mean() * 22 / 100
)

In [None]:
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    """Return the recency-weighted mean rating of ``dataframe``.

    Rows are split on the ``days`` column into four buckets -- ``<= 30``,
    ``(30, 90]``, ``(90, 180]`` and ``> 180`` -- and the mean ``Rating`` of
    each bucket is multiplied by its percent weight and summed.

    Args:
        dataframe: DataFrame with ``days`` and ``Rating`` columns.
        w1, w2, w3, w4: Percent weights of the buckets, most recent first.

    Returns:
        The weighted sum of the four bucket means. A bucket with no rows has
        a NaN mean, which makes the result NaN.
    """
    age = dataframe["days"]
    newest = age <= 30
    recent = (age > 30) & (age <= 90)
    older = (age > 90) & (age <= 180)
    oldest = age > 180

    first, second, third, fourth = [
        dataframe.loc[mask, "Rating"].mean() * weight / 100
        for mask, weight in ((newest, w1), (recent, w2), (older, w3), (oldest, w4))
    ]
    # Keep left-to-right addition so floating-point results are unchanged.
    return first + second + third + fourth

In [None]:
# Score with the default 28/26/24/22 weights.
time_based_weighted_average(df)

In [None]:
def time_based_weighted_average(dataframe, day_column="days", rating_column="Rating", w1=28, w2=26, w3=24, w4=22,
                                timestamp_column="Timestamp", current_date="2021-02-10 00:00:00"):
    """Return the recency-weighted mean rating, deriving review age first.

    The timestamp column is parsed to datetimes, each row's age in whole days
    relative to ``current_date`` is stored in ``day_column``, and the rows are
    split into four age buckets -- ``<= 30``, ``(30, 90]``, ``(90, 180]`` and
    ``> 180`` days. The mean rating of each bucket is multiplied by its
    percent weight and the four terms are summed.

    Side effects:
        ``dataframe`` is modified in place: ``timestamp_column`` is replaced
        by its parsed datetimes and ``day_column`` is created or overwritten.

    Args:
        dataframe: DataFrame holding the timestamp and rating columns.
        day_column: Name of the column that receives the computed age in days.
        rating_column: Name of the rating column.
        w1, w2, w3, w4: Percent weights of the buckets, most recent first.
        timestamp_column: Name of the column holding review timestamps.
        current_date: Reference date (anything ``pd.to_datetime`` accepts)
            from which review age is measured.

    Returns:
        The weighted sum of the four bucket means. A bucket with no rows has
        a NaN mean, which makes the result NaN.
    """
    # preprocessing
    dataframe[timestamp_column] = pd.to_datetime(dataframe[timestamp_column])
    reference_date = pd.to_datetime(current_date)
    dataframe[day_column] = (reference_date - dataframe[timestamp_column]).dt.days

    # time based weighted average
    age = dataframe[day_column]
    buckets = (
        (age <= 30, w1),
        ((age > 30) & (age <= 90), w2),
        ((age > 90) & (age <= 180), w3),
        (age > 180, w4),
    )
    first, second, third, fourth = [
        dataframe.loc[mask, rating_column].mean() * weight / 100
        for mask, weight in buckets
    ]
    # Keep left-to-right addition so default results match the inline formula.
    return first + second + third + fourth

In [None]:
# Same score, now computed by the version that derives the day column itself.
time_based_weighted_average(df)

In [None]:
# Alternative weighting: 30/30/20/20 percent instead of the defaults.
time_based_weighted_average(df, w1=30, w2=30, w3=20, w4=20)

# User based average


user_based_weighted_average = sum(similarity user * user preference) / sum(similarity)

In [None]:
# Mean rating for each distinct Progress value.
df.groupby("Progress").agg({"Rating":"mean"})

In [None]:
# Progress-weighted rating: mean rating of each progress bucket (<=10,
# (10, 45], (45, 75], >75) scaled by 22/24/26/28 percent and summed.
(
    df.loc[df["Progress"] <= 10, "Rating"].mean() * 22 / 100
    + df.loc[(df["Progress"] > 10) & (df["Progress"] <= 45), "Rating"].mean() * 24 / 100
    + df.loc[(df["Progress"] > 45) & (df["Progress"] <= 75), "Rating"].mean() * 26 / 100
    + df.loc[df["Progress"] > 75, "Rating"].mean() * 28 / 100
)

In [None]:
def user_based_weighted_average(dataframe, rating_column="Rating", progress_column="Progress", w1=22, w2=24, w3=26, w4=28):
    """Return the progress-weighted mean rating of ``dataframe``.

    Rows are split on ``progress_column`` into four buckets -- ``<= 10``,
    ``(10, 45]``, ``(45, 75]`` and ``> 75`` -- and the mean rating of each
    bucket is multiplied by its percent weight and summed.

    Args:
        dataframe: DataFrame holding the rating and progress columns.
        rating_column: Name of the rating column.
        progress_column: Name of the progress column.
        w1, w2, w3, w4: Percent weights of the buckets, lowest progress first.

    Returns:
        The weighted sum of the four bucket means. A bucket with no rows has
        a NaN mean, which makes the result NaN.
    """
    progress = dataframe[progress_column]
    buckets = (
        (progress <= 10, w1),
        ((progress > 10) & (progress <= 45), w2),
        ((progress > 45) & (progress <= 75), w3),
        (progress > 75, w4),
    )
    first, second, third, fourth = [
        dataframe.loc[mask, rating_column].mean() * weight / 100
        for mask, weight in buckets
    ]
    # Keep left-to-right addition so floating-point results are unchanged.
    return first + second + third + fourth

In [None]:
# Score with the default 22/24/26/28 weights.
user_based_weighted_average(df)

In [None]:
# Blend the two scores: 40% time-based, 60% user-based.
time_based_weighted_average(df) * 40/100 + user_based_weighted_average(df) * 60/100

# Course weighted rating

In [None]:
def course_weighted_rating(dataframe, time_w=40, user_w=60):
    """Blend the time-based and user-based weighted averages into one score.

    Args:
        dataframe: DataFrame passed unchanged to both
            ``time_based_weighted_average`` and ``user_based_weighted_average``.
        time_w: Percent weight of the time-based score.
        user_w: Percent weight of the user-based score.

    Returns:
        ``time_score * time_w / 100 + user_score * user_w / 100``.
    """
    # Time-based score is computed first, matching the original call order.
    time_score = time_based_weighted_average(dataframe)
    user_score = user_based_weighted_average(dataframe)
    return time_score * time_w / 100 + user_score * user_w / 100

In [None]:
# Blended score with the default 40/60 split.
course_weighted_rating(df)

# Pipeline

In [None]:
def _four_bucket_weighted_mean(dataframe, key_column, rating_column, edges, weights):
    """Sum the percent-weighted mean ratings of four buckets of ``key_column``.

    ``edges`` is ``(low, mid, high)``; the buckets are ``<= low``,
    ``(low, mid]``, ``(mid, high]`` and ``> high``, paired in that order with
    the four percent ``weights``.
    """
    low, mid, high = edges
    keys = dataframe[key_column]
    masks = (
        keys <= low,
        (keys > low) & (keys <= mid),
        (keys > mid) & (keys <= high),
        keys > high,
    )
    first, second, third, fourth = [
        dataframe.loc[mask, rating_column].mean() * weight / 100
        for mask, weight in zip(masks, weights)
    ]
    # Left-to-right addition keeps results identical to the inline formula.
    return first + second + third + fourth


def pipeline_average(dataframe, time_w=40, user_w=60, day_column="days", progress_column="Progress", rating_column="Rating", tw1=28, tw2=26, tw3=24, tw4=22,  uw1=22, uw2=24, uw3=26, uw4=28,
                     timestamp_column="Timestamp", current_date="2021-02-10 00:00:00"):
    """Compute the course weighted rating end to end.

    Steps:
        1. Parse ``timestamp_column`` and store each review's age in whole
           days, relative to ``current_date``, in ``day_column``.
        2. Time-based score: mean rating of the age buckets ``<= 30``,
           ``(30, 90]``, ``(90, 180]``, ``> 180`` weighted by ``tw1..tw4``.
        3. User-based score: mean rating of the progress buckets ``<= 10``,
           ``(10, 45]``, ``(45, 75]``, ``> 75`` weighted by ``uw1..uw4``.
        4. Blend the two scores with ``time_w`` and ``user_w``.

    All weights are percentages (divided by 100 when applied).

    Side effects:
        ``dataframe`` is modified in place: ``timestamp_column`` is replaced
        by its parsed datetimes and ``day_column`` is created or overwritten.

    Args:
        dataframe: DataFrame with timestamp, rating and progress columns.
        time_w: Percent weight of the time-based score in the final blend.
        user_w: Percent weight of the user-based score in the final blend.
        day_column: Name of the column that receives the computed age in days.
        progress_column: Name of the progress column.
        rating_column: Name of the rating column.
        tw1, tw2, tw3, tw4: Age-bucket weights, most recent first.
        uw1, uw2, uw3, uw4: Progress-bucket weights, lowest progress first.
        timestamp_column: Name of the column holding review timestamps.
        current_date: Reference date (anything ``pd.to_datetime`` accepts)
            from which review age is measured.

    Returns:
        The blended rating. If any bucket has no rows its mean is NaN and the
        result is NaN.
    """
    # preprocessing
    dataframe[timestamp_column] = pd.to_datetime(dataframe[timestamp_column])
    reference_date = pd.to_datetime(current_date)
    dataframe[day_column] = (reference_date - dataframe[timestamp_column]).dt.days

    # time based weighted average
    time_based_avg = _four_bucket_weighted_mean(
        dataframe, day_column, rating_column, (30, 90, 180), (tw1, tw2, tw3, tw4)
    )

    # user based weighted average
    user_based_average = _four_bucket_weighted_mean(
        dataframe, progress_column, rating_column, (10, 45, 75), (uw1, uw2, uw3, uw4)
    )

    # return course weighted rating
    return time_based_avg * time_w / 100 + user_based_average * user_w / 100

In [None]:
# Run the whole pipeline with default weights.
pipeline_average(df)