In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


#Read csv with movie ratings into a pandas df:
movie_rating_data_df = pd.read_csv("/home/lv24252260/Documents/Assignment_7/movieratings.csv")

#Check the df
movie_rating_data_df.head()

Unnamed: 0,User,American Sniper,Edge of Tomorrow,Groundhog Day,Jurassic World,Lost in Translation,Lucy
0,Laurie,,3,4,4.0,5.0,2.0
1,Lisa,4.0,2,3,,5.0,3.0
2,Hope,,3,5,4.0,4.0,
3,Lucas,3.0,5,4,5.0,,3.0
4,Joe,4.0,4,3,4.0,,


In [13]:
#Average ratings for each user
ratings_sorted_df = movie_rating_data_df.copy()
ratings_sorted_df['User Average'] = ratings_sorted_df.mean(numeric_only = True, axis = 1)
ratings_sorted_df

Unnamed: 0,User,American Sniper,Edge of Tomorrow,Groundhog Day,Jurassic World,Lost in Translation,Lucy,User Average
0,Laurie,,3,4,4.0,5.0,2.0,3.6
1,Lisa,4.0,2,3,,5.0,3.0,3.4
2,Hope,,3,5,4.0,4.0,,4.0
3,Lucas,3.0,5,4,5.0,,3.0,4.0
4,Joe,4.0,4,3,4.0,,,3.75


In [14]:
#Average ratings for each movie
movie_average_ratings_df = ratings_sorted_df.copy()
movie_average_ratings_df.loc['Movie Average'] = movie_average_ratings_df.mean(skipna=True, numeric_only=True)
movie_average_ratings_df

Unnamed: 0,User,American Sniper,Edge of Tomorrow,Groundhog Day,Jurassic World,Lost in Translation,Lucy,User Average
0,Laurie,,3.0,4.0,4.0,5.0,2.0,3.6
1,Lisa,4.0,2.0,3.0,,5.0,3.0,3.4
2,Hope,,3.0,5.0,4.0,4.0,,4.0
3,Lucas,3.0,5.0,4.0,5.0,,3.0,4.0
4,Joe,4.0,4.0,3.0,4.0,,,3.75
Movie Average,,3.666667,3.4,3.8,4.25,4.666667,2.666667,3.75


### Normalized vs. Standardized Ratings

Normalizing data typically means to scale it to a specific range of [0,1] 

Standardizing typically means rescaling data to have a mean of 0 and a standard deviation of 1.

Normalizing or standardizing can have advantages and disadvantages depending on the goals and context of the analysis being performed.

Advantages of normalized ratings are that it allows for fair comparisons, since it puts all ratings on the same scale. It can simplify interpretation of data and help mitigate the efects of outliers or extreme values.

Disadvantages of normalization are that it can distort the original ratings by rescaling them, such as the magnitude, which may be important depending on the context.

In [15]:
#Normalized User Ratings

# Create a copy of the original dataframe
ratings_normalized_df = ratings_sorted_df.copy()

# Normalize the ratings for each user by subtracting the minimum and dividing by the range of values
ratings_normalized_df.iloc[:,1:] = (ratings_normalized_df.iloc[:,1:] - ratings_normalized_df.iloc[:,1:].min()) / (ratings_normalized_df.iloc[:,1:].max() - ratings_normalized_df.iloc[:,1:].min())

# View the normalized dataframe
ratings_normalized_df

Unnamed: 0,User,American Sniper,Edge of Tomorrow,Groundhog Day,Jurassic World,Lost in Translation,Lucy,User Average
0,Laurie,,0.333333,0.5,0.0,1.0,0.0,0.333333
1,Lisa,1.0,0.0,0.0,,1.0,1.0,0.0
2,Hope,,0.333333,1.0,0.0,0.0,,1.0
3,Lucas,0.0,1.0,0.5,1.0,,1.0,1.0
4,Joe,1.0,0.666667,0.0,0.0,,,0.583333


In [16]:
#Normalized Movie Ratings

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the ratings data and transform it
ratings_standardized = scaler.fit_transform(ratings_sorted_df.iloc[:, 1:])

# Create a new dataframe with the standardized ratings and user names
ratings_standardized_df = pd.DataFrame(data=ratings_standardized, index=ratings_sorted_df.iloc[:,0], columns=ratings_sorted_df.select_dtypes(include=np.number).columns)

# Compute the mean of each standardized column
movie_average_ratings_standardized = ratings_standardized_df.mean()

# View the standardized average ratings for all movies
print(movie_average_ratings_standardized)


American Sniper        2.960595e-16
Edge of Tomorrow       6.661338e-17
Groundhog Day          2.220446e-16
Jurassic World        -2.775558e-17
Lost in Translation   -5.921189e-16
Lucy                   2.960595e-16
User Average           0.000000e+00
dtype: float64
