# Case Study: DataCamp Courses

In [1]:
import pandas as pd
from sqlalchemy import create_engine
db_engine = create_engine('postgresql+psycopg2://adrik:root1234@localhost:5432/adrik')

### Extract

In [13]:
def extract_table_to_pandas(tablename, db_engine):
    query = "SELECT * FROM {}".format(tablename)
    return pd.read_sql(query, db_engine)

In [15]:
course = extract_table_to_pandas("courses", db_engine)
course.head()

Unnamed: 0,course_id,title,description,programming_language
0,1,Machine Learning with Apache Spark,"Spark is a powerful, general purpose tool for ...",python
1,2,Financial Analytics in Spreadsheets,Monitoring the evolution of traded assets is k...,spreadsheets
2,3,Intermediate R,The intermediate R course is the logical next ...,r
3,4,Data Visualization with ggplot2 (Part 2),This ggplot2 tutorial builds on your knowledge...,r
4,5,Fraud Detection in R,The Association of Certified Fraud Examiners e...,r


In [16]:
rating = extract_table_to_pandas("rating", db_engine)
rating.head()

Unnamed: 0,user_id,course_id,rating
0,1,6,4
1,1,36,5
2,1,37,5
3,1,45,5
4,1,50,5


### Transform 

In [17]:
# Complete the transformation function
def transform_avg_rating(rating):
    # Group by course_id and extract average rating per course
    avg_rating = rating.groupby('course_id')['rating'].mean()
    # Return sorted average ratings per course
    sort_rating = avg_rating.sort_values(ascending=False).reset_index()
    return sort_rating

In [18]:
avg_rating_data = transform_avg_rating(rating)
avg_rating_data.head()

Unnamed: 0,course_id,rating
0,46,4.8
1,23,4.8
2,96,4.692765
3,56,4.661765
4,24,4.653061


In [19]:
course.isna().sum()

course_id               0
title                   0
description             0
programming_language    3
dtype: int64

In [22]:
# The transformation should fill in the missing values
def transform_fill_programming_language(course):
    imputed = course.fillna({"programming_language": "R"})
    return imputed

transformed = transform_fill_programming_language(course)
transformed.isna().sum()

course_id               0
title                   0
description             0
programming_language    0
dtype: int64

In [30]:
courses_to_recommend = rating[['course_id', 'user_id']]
courses_to_recommend.head()

Unnamed: 0,course_id,user_id
0,6,1
1,36,1
2,37,1
3,45,1
4,50,1


In [31]:
def transform_recommendations(avg_rating_data, courses_to_recommend):
    # Merge both DataFrames
    merged = courses_to_recommend.merge(avg_rating_data, how='inner', on='course_id') 
    # Sort values by rating and group by user_id
    grouped = merged.sort_values("rating", ascending=False).groupby("user_id")
    # Produce the top 3 values and sort by user_id
    recommendations = grouped.head(3).sort_values("user_id").reset_index()
    final_recommendations = recommendations[["user_id", "course_id","rating"]]
    # Return final recommendations
    return final_recommendations

# Use the function with the predefined DataFrame objects
recommendations = transform_recommendations(avg_rating_data, courses_to_recommend)

In [32]:
recommendations.head()

Unnamed: 0,user_id,course_id,rating
0,1,37,4.581818
1,1,36,4.54619
2,1,81,4.621339
3,2,50,4.456914
4,2,81,4.621339


### Load

In [33]:
def load_to_dwh(recommendations):
    recommendations.to_sql("recommendations", db_engine, if_exists="replace") # Can use also append

In [None]:
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
# Define the DAG so it runs on a daily basis
dag = DAG(dag_id="recommendations",
          schedule_interval="0 0 * * *")

# Make sure `etl()` is called in the operator. Pass the correct kwargs.
task_recommendations = PythonOperator(
    task_id="recommendations_task",
    python_callable=etl,
    op_kwargs={"db_engines": db_engines},
)

In [None]:
def recommendations_for_user(user_id, threshold=4.5):
    # Join with the courses table
    query = """
    SELECT title, rating FROM recommendations
    INNER JOIN courses ON courses.course_id = recommendations.course_id
    WHERE user_id=%(user_id)s AND rating>%(threshold)s
    ORDER BY rating DESC
    """
    # Add the threshold parameter
    predictions_df = pd.read_sql(query, db_engine, params = {"user_id": user_id, 
                                                           "threshold": threshold})
    return predictions_df.title.values

# Try the function you created
recommendations_for_user(12, 4.65)