# Staff-Graded Matrix Completion

### Part(a)

In [87]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Import data as pandas df, convert to np.array
data = np.asarray(pd.read_csv('MovieReviewMat.csv', header=None))
titles = data[0,1:] # Save movie titles
data = data[1:,1:] # Save all rows except movie titles

# Replace string 'NA' and blank with np.nan
data[data == 'NA'] = np.nan
data[data == ''] = np.nan

# Remove header rows and column, and ensure all values are float type
matrix = data[1:,:].astype(float)

# Find values
max_val = np.nanmax(matrix)
min_val = np.nanmin(matrix)
avg_val = np.nanmean(matrix)

print(f'Maximum Value: {max_val}')
print(f'Minimum Value: {min_val}')
print(f'Average Value: {avg_val}')


Maximum Value: 5.0
Minimum Value: 0.5
Average Value: 3.297080189685473


The maximum rating in the data is 5.0, and the minimum rating is 0.5

The average rating is approximately 3.3

### Part (b)

In [88]:
# Find genre labels and scores for individual
headers = data[0,1:]
scores_1460 = matrix[1460,:]
scores_43 = matrix[43,:]

def fivestar_review_count(headers, scores) -> list:
    # Concatenate all genre labels into one string
    string = str()
    for cell in headers:
        string += cell + '|'

    # Create function finds indeces of pipe character in concatenated string,
    # returns indeces in a list
    def find_letter(string, char):
        return [i for i, letter in enumerate(string) if letter == char]

    # Use function to find indeces list
    indeces = find_letter(string, '|')

    # Initialize empty list
    genre_list = list()

    # Fill empty list with non-unique genre names
    for i in range(len(indeces)):
        if i > 0:
            genre_list.append(string[indeces[i-1]+1:indeces[i]])
        else:
            genre_list.append(string[:indeces[i]])
                            
    # Unique-ify the genre list
    unique_genres = set(genre_list)
    unique_genres = list(unique_genres)
    unique_genres.sort()

    # Create dictionary of unique genre names and placeholder zero counts
    reviews_dict = {'Genre': unique_genres,
                    'Score': list(np.repeat(0, len(unique_genres)))}
    # Convert dictionary to data frame
    fivestar_reviews = pd.DataFrame(reviews_dict)

    # Loop through headers and unique genres to find the five-star review count
    # for each unique genre
    name_pair = list()
    i = 0
    for genre in fivestar_reviews['Genre']:
        j = 0
        for genre_combo in headers:
            if genre in genre_combo and scores[j] == 5:
                fivestar_reviews['Score'][i] += 1
                if len(name_pair) < 2:
                    name_pair.append(titles[j])
            j += 1
        i += 1

    # Sort by score (desc) and return
    fivestar_reviews = fivestar_reviews.sort_values(by=['Score'], ascending=False)
    return [fivestar_reviews, name_pair]


print(f'Five-star reviews for individual indexed at 1460, by genre:\n {fivestar_review_count(headers, scores_1460)[0]}')
print(f'\nTwo movies rated five stars by individual indexed at 1460:\n{fivestar_review_count(headers, scores_1460)[1]}')
print('\n\n')
print(f'Five-star reviews for individual indexed at 43, by genre:\n {fivestar_review_count(headers, scores_43)[0]}')
print(f'\nTwo movies rated five stars by individual indexed at 43:\n{fivestar_review_count(headers, scores_43)[1]}')

Five-star reviews for individual indexed at 1460, by genre:
                  Genre  Score
8                Drama      6
5               Comedy      5
2            Adventure      3
6                Crime      3
1               Action      3
15             Romance      2
19             Western      1
18                 War      1
17            Thriller      1
16              Sci-Fi      1
9              Fantasy      1
11              Horror      1
14             Mystery      0
0   (no genres listed)      0
13             Musical      0
12                IMAX      0
7          Documentary      0
4             Children      0
3            Animation      0
10           Film-Noir      0

Two movies rated five stars by individual indexed at 1460:
['Taxi Driver (1976)', 'Rob Roy (1995)']



Five-star reviews for individual indexed at 43, by genre:
                  Genre  Score
8                Drama      5
19             Western      1
18                 War      1
5               Comedy    

The first table above, which counts only five-star reviews, shows that indvidual 1460 tends to give the most five-star reviews to movies in the Drama genre. Two movies that this individual rates with a 5 are *Taxi Driver* and *Rob Roy*.

The second table shows that individual 43 gives the most five-star reviews to movies in the Drama genre, similar to the other individual. However, individual 43 seems to have given far fewer five-star reviews than individual 1460. Two movies that this individual rates with a 5 are *The Sting* and *Jack*.

### Part (c)

I estimate that we will find two broad classes of individuals in the rating data: Those who give five-star reviews to many different movies, and those who give five-star reviews to only one or two genres of movies.

This phenomenon could be explained by the proclivity of some people to rather freely give a five-star rating to a film. Additionally, some people are movie enthusiasts, tending to watch more movies than the average individual and thus having more opportunity to give any review at all.

### Part (d)

In [None]:
k = 2
