# Summarizing Rate Your Music Reviews

> *Advanced Customer Analytics*  
> *MSc in Data Science, Department of Informatics*  
> *Athens University of Economics and Business*

---

Create a second Python notebook with a function called <code>summarize()</code>. The function should accept as a parameter the path to the csv file created by the first notebook. It should then create a 1-page PDF file that includes a summary of all the reviews in the csv. The nature of the summary is entirely up to you. It can be text-based, visual-based, or a combination of both. It is also up to you to define what is important enough to be included in the summary. Focus on creating a summary that you think would be the most informative for customers. The creation of the PDF should be done through the notebook. You can use whatever Python-based library that you want.

---

##### *Libraries*

In [14]:
import pandas as pd
from fpdf import FPDF
import numpy as np
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import tomotopy as tp
from nltk.corpus import stopwords
import re
# English stop-word list from NLTK; LDA() passes it to tokenizer() and
# Generate_Wordcloud_Image() hands it to WordCloud
sw=stopwords.words('english')

import warnings
# Suppress all warnings raised from this point on
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
def tokenizer(doc, sw):
    """
    Turn a raw document into a list of cleaned, lowercase word tokens.

    Parameters:
    - doc: The input document (a string).
    - sw: Collection of stop words to leave out of the result.

    Returns:
    - List of tokens, in the order they appear in the document.
    """
    tokens = []
    for raw_word in doc.strip().split():
        # Lowercase first, then drop every character that is not a plain a-z letter
        cleaned = re.sub('[^a-z]', '', raw_word.lower())

        # Keep only words of 3+ characters that are not stop words
        if len(cleaned) <= 2:
            continue
        if cleaned in sw:
            continue
        tokens.append(cleaned)

    return tokens

In [16]:
def Generate_Wordcloud_Image(df: pd.core.frame.DataFrame, output_path: str = "Wordcloud.png"):
    """
    Generates a word cloud image from the 'Content' column of a DataFrame and saves it to disk.

    Parameters:
    - df: Pandas DataFrame containing a column named 'Content'.
    - output_path: Where the PNG is written. Defaults to 'Wordcloud.png' in the
      current working directory.

    Returns:
    - None

    Side effects:
    - Writes the image file and draws the cloud on a new matplotlib figure.
    """

    # Empty CSV cells are read by pandas as NaN (a float); str.join raises
    # TypeError on non-strings, so drop missing reviews and coerce the rest.
    texts = df['Content'].dropna().astype(str)

    # Create a WordCloud object with specified settings
    wordcloud = WordCloud(stopwords=sw, max_font_size=50, max_words=80, background_color="white") \
                    .generate(' '.join(texts))

    # Plot the word cloud
    plt.figure(figsize=(20, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

    # Save the word cloud as an image
    wordcloud.to_file(output_path)

In [17]:
def LDA(df: pd.core.frame.DataFrame, max_docs: int = 50000):
    """
    Fits a tomotopy MGLDAModel (multi-grain LDA) on the review texts in the DataFrame.

    Parameters:
    - df: Pandas DataFrame containing a column named 'Content'.
    - max_docs: Upper bound on the number of reviews fed to the model.

    Returns:
    - mdl: A trained MGLDAModel from the 'tomotopy' library.
    """

    # Create an MGLDAModel object with specified parameters
    mdl = tp.MGLDAModel(k_g=30, k_l=20)

    # The original counter was never incremented, so its 50000-document cap
    # could not trigger; count the documents actually added instead.
    added = 0

    # Tokenize and add documents to the model
    for review in df['Content']:
        if added >= max_docs:
            break
        # Missing reviews are NaN (float) and have no .split(); skip them
        if not isinstance(review, str):
            continue
        # Only the text before the first tab is used as the document
        mdl.add_doc(tokenizer(review.split('\t')[0], sw))
        added += 1

    # Train the model in batches of 10 for 500 iterations
    for _ in range(0, 500, 10):
        mdl.train(10)

    return mdl

In [18]:
# Rows of the "Ratings Breakdown" panel: (rating, x position of the "count , %" label).
# Rows are stacked top-down starting at y = 70 mm, 14 mm apart.
# NOTE(review): only these rating values are drawn; any other value present in
# the CSV (e.g. 0.5) is counted in the percentages but gets no row — confirm.
_RATING_ROWS = (
    (5.0, 53),
    (4.5, 51),
    (4.0, 46),
    (3.5, 44),
    (3.0, 38),
    (2.5, 35),
    (2.0, 28),
    (1.5, 24),
    (1.0, 17),
)

# Bullets of the "Review Topics" panel: (label, topic id in the model, number of key words).
# NOTE(review): the labels are tied to hard-coded topic ids; they only describe
# the printed key words if LDA() reproduces the topics these labels were
# written for — confirm, since the model is built without a fixed seed.
_TOPIC_ROWS = (
    ("Genre", 7, 15),
    ("Tracks", 10, 15),
    ("Previous Album", 3, 13),
    ("David Bowie", 13, 13),
    ("Emotions", 14, 12),
    ("Production", 6, 11),
)


def _rating_stats(ratings_df, rating):
    """Return (count, percentage) for one rating value, or (0, 0.0) if no review has it."""
    row = ratings_df.loc[ratings_df['Rating'] == rating]
    if row.empty:
        # .item() on an empty selection raises ValueError; report a zero bucket instead
        return 0, 0.0
    return row['RatingCount'].item(), row['Percentage'].item()


def _draw_rating_row(pdf, rating, y, label_x, count, percentage):
    """Draw one breakdown row: star icons for `rating` followed by its 'count , %' label."""
    full_stars = int(rating)

    # Full stars are 6 mm wide and placed every 9 mm starting at x = 3
    for k in range(full_stars):
        pdf.set_xy(3 + 9 * k, y)
        pdf.image(r'./Images/Star.png', w=6, h=6)

    # A fractional rating gets a half star in the next slot
    if rating != full_stars:
        pdf.set_xy(3 + 9 * full_stars, y)
        pdf.image(r'./Images/Half - Star.png', w=3, h=6)

    pdf.set_font('helvetica', 'B', 12)
    pdf.set_xy(label_x, y)
    pdf.cell(w = 6.0, h = 6.0, align='C',
             txt = str(count) + " , " + str(percentage) + " %", border = 0)


def _draw_topic_row(pdf, mdl, y, label, topic_id, top_n):
    """Draw one topic bullet at height `y` with the topic's `top_n` most probable words."""
    pdf.circle(70, y, 1, 'DF')

    pdf.set_font('helvetica', 'B', 11)
    pdf.set_text_color(55, 71, 79)
    pdf.set_xy(72, y - 4)
    # get_topic_words yields (word, probability) pairs; only the words are printed
    key_words = ' '.join([pair[0] for pair in mdl.get_topic_words(topic_id, top_n=top_n)])
    pdf.cell(w = 100.0, h = 10.0, align='L', txt = label + " - Key Words: " + key_words, border = 0)


def Summarize(path : str):
    """
    Generates a summary report based on the reviews in the specified CSV file.

    The page shows a header, a per-rating breakdown (count and percentage),
    key words of selected topics from the model returned by LDA(), and a word
    cloud of the review texts.

    Parameters:
    - path: Path to the CSV file containing reviews. The file must provide the
      'Rating' and 'Content' columns.

    Outputs:
    - A PDF file named 'Review Summary.pdf'.
    - 'Wordcloud.png', regenerated from the CSV and embedded in the PDF.
    """

    # Read the CSV file into a DataFrame
    Reviews_df = pd.read_csv(path)

    # Count and calculate percentages for each rating
    Ratings_df = Reviews_df['Rating'].value_counts().to_frame(name="RatingCount").reset_index()
    # Older pandas names the former index column 'index'; newer pandas already
    # calls it 'Rating', in which case this rename is a no-op.
    Ratings_df.rename(columns={'index': 'Rating'}, inplace=True)
    Ratings_df['Percentage'] = round((Ratings_df['RatingCount'] / np.sum(Ratings_df['RatingCount'])) * 100, 1)

    # Train LDA model on the reviews
    mdl = LDA(Reviews_df)

    # Build the word cloud from this CSV so the image matches the summarized
    # reviews instead of relying on a previously saved picture.
    Generate_Wordcloud_Image(Reviews_df)

    # Create a PDF document (all coordinates below are in millimetres)
    pdf = FPDF(orientation='L', unit='mm', format='A4')

    # A4 in landscape: 297 mm wide, 210 mm tall
    page_w = 297
    page_h = 210
    header_h = 33

    pdf.add_page()

    # --- Header band: album cover, title, site logo ---
    pdf.set_fill_color(55, 71, 79)
    pdf.rect(0, 0, page_w, header_h, 'F')

    pdf.set_xy(1.5,1.5)
    pdf.image(r'./Images/The Car Cover.png', w=30, h=30)

    pdf.set_xy(120,1.5)
    pdf.set_font('helvetica', 'B', 24)
    pdf.set_text_color(236,239,241)
    pdf.cell(w = 50.0, h = 33.0, align='C', txt="Arctic Monkeys - The Car", border = 0)

    pdf.set_xy(page_w - 20,7.5)
    pdf.image(r'./Images/rym.png', w=15, h=15)

    # --- Left column: overall rating ---
    pdf.set_font('helvetica', 'B', 16)
    pdf.set_text_color(38,50,56)
    pdf.set_xy(22.5, 30)
    pdf.cell(w = 15.0, h = 15.0, align='C', txt = "Rating", border = 0)

    pdf.set_line_width(0.5)
    pdf.set_draw_color(55, 71, 79)
    # Vertical divider between the columns. The original end point (265 mm)
    # lay below the 210 mm page, so the line simply runs to the bottom edge.
    pdf.line(65, 32, 65, page_h)
    pdf.line(0,55,65,55)

    # NOTE(review): the overall rating is a fixed row of three stars; it is
    # not derived from the CSV — confirm this is intended.
    for star_x in (10, 25, 40):
        pdf.set_xy(star_x, 42.5)
        pdf.image(r'./Images/Star.png', w=10, h=10)

    # --- Left column: ratings breakdown ---
    pdf.set_font('helvetica', 'B', 12)
    pdf.set_text_color(38,50,56)
    pdf.set_xy(25.5, 55)
    pdf.cell(w = 15.0, h = 15.0, align='C', txt = "Ratings Breakdown (Count, %)", border = 0)

    for row_index, (rating, label_x) in enumerate(_RATING_ROWS):
        count, percentage = _rating_stats(Ratings_df, rating)
        _draw_rating_row(pdf, rating, 70 + 14 * row_index, label_x, count, percentage)

    # --- Right column: horizontal divider halfway below the header ---
    pdf.set_line_width(0.5)
    pdf.set_draw_color(55, 71, 79)
    divider_y = ((page_h - header_h) / 2) + header_h
    pdf.line(65, divider_y, page_w, divider_y)

    # --- Right column: "Review Topics" heading with its boxed caption ---
    pdf.set_font('helvetica', 'B', 14)
    pdf.set_text_color(55, 71, 79)
    pdf.set_xy(80, 32)
    pdf.cell(w = 15.0, h = 15.0, align='C', txt = "Review Topics", border = 0)

    pdf.set_font('helvetica', 'B', 11)
    pdf.set_text_color(55, 71, 79)
    pdf.set_xy(117, 32)
    pdf.cell(w = 15.0, h = 15.0, align='C', txt = "(Topic - Key Words)", border = 0)

    pdf.line(65,45,147,45)
    pdf.line(147,45,147,33)

    # --- Right column, lower half: word cloud written by Generate_Wordcloud_Image ---
    pdf.set_xy(100, 130)
    pdf.image('Wordcloud.png', w=160, h=60)

    # --- Right column, upper half: topic bullets, 10 mm apart starting at y = 55 ---
    for row_index, (label, topic_id, top_n) in enumerate(_TOPIC_ROWS):
        _draw_topic_row(pdf, mdl, 55 + 10 * row_index, label, topic_id, top_n)

    # Save the PDF file
    pdf.output('Review Summary.pdf','F')

In [19]:
# Write 'Review Summary.pdf' for the reviews CSV produced by the first notebook
Summarize('./Data/The Car Reviews.csv')