In [14]:
import pandas as pd
import numpy as np
# Column layouts of the three '::'-delimited files; header=None below means
# the files carry no header row, so the names are supplied here.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# engine='python' is used because the separator is more than one character long.
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=unames, engine='python')
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=mnames, engine='python')

# No join keys are given, so each merge joins on the column names the two
# frames share: ratings+users first, then the result with movies.
data = ratings.merge(users).merge(movies)

# Exercises: 

**MovieLens database**

1- Filter the films that have received at least 250 ratings. **Use only these films in the rest of the exercises.**

In [15]:
def ex1(min_ratings=250):
    """Count the ratings of every movie title and keep the well-rated ones.

    Reads the module-level ``data`` frame.

    Parameters
    ----------
    min_ratings : int, default 250
        Minimum number of ratings (inclusive) a title needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``title`` with a single ``rating`` column holding the
        number of ratings, sorted from most to least rated.
    """
    # One row per title; ``count`` tallies the non-null ratings of each title.
    ratings_per_title = data[['title', 'rating']].groupby('title').count()
    # ">=" because the requirement is "at least" min_ratings ratings.
    popular = ratings_per_title[ratings_per_title['rating'] >= min_ratings]
    return popular.sort_values(by="rating", ascending=False)
def getBest250():
    """Return the rows of ``data`` whose title is among those returned by ``ex1()``.

    Helper shared by the exercises that must work only on the titles
    selected by ``ex1()``.
    """
    # Only the titles are needed for the join, so the count column is dropped;
    # what remains is a frame whose index level is named "title".
    qualifying_titles = ex1().drop(columns=["rating"])
    # Inner join (the merge default) on "title" keeps just the matching rows.
    return data.merge(qualifying_titles, on="title")

2- For each movie that has at least 250 ratings, obtain the mean rating grouped by gender.

In [16]:
def ex2():
    """Mean rating of each movie returned by ``getBest250()``, split by gender.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``title``, with one column per value of ``gender``
        holding the mean ``rating``.
    """
    # Work only on the subset of movies selected by getBest250().
    selected = getBest250()
    # Titles become the index and each gender value becomes a column.
    # The title is deliberately left as the index: no reset_index() is applied.
    return pd.pivot_table(selected, values="rating", index="title",
                          columns="gender", aggfunc="mean")

3- Show the films most highly rated by women.

In [17]:
def ex3():
    """Movies from ``ex2()`` ordered by the mean rating given by women.

    Returns the ``ex2()`` table without its ``M`` column, sorted by the
    ``F`` column from highest to lowest.
    """
    by_gender = ex2()
    # Same table as ex2(), minus the male column.
    without_men = by_gender.drop(columns=["M"])
    return without_men.sort_values("F", ascending=False)
    

4- Now we wonder which movies are rated most differently by men and women. Which films have the largest rating difference and are more highly rated by women?

In [18]:
def ex4():
    """Movies ordered by how much higher women rate them than men.

    Adds a ``diff`` column (male mean minus female mean) to the ``ex2()``
    table and sorts it in ascending order, so the most negative values,
    i.e. the movies women rate higher than men, come first.
    """
    by_gender = ex2()
    # Negative gap: women rated the movie higher than men did.
    gap = by_gender["M"] - by_gender["F"]
    with_gap = by_gender.assign(diff=gap)
    return with_gap.sort_values(by="diff")
    

5- And which films have the largest rating difference and are more highly rated by men?

In [19]:
def ex5():
    """Movies ordered by how much higher men rate them than women.

    Takes the ``ex4()`` table and re-sorts it by ``diff`` in descending
    order, so the largest positive gaps (men rated higher) come first.
    """
    gaps = ex4()
    return gaps.sort_values(by="diff", ascending=False)

6- What are the films that have generated the most discordant ratings, regardless of gender?

In [20]:
def ex6():
    """Movies ordered by how much their ratings disagree.

    "Discordant" is measured as the standard deviation of the ratings of
    each title (the variance would give the same ordering). Returns a
    frame indexed by ``title`` with the deviation in the ``rating``
    column, largest first.
    """
    ratings_by_title = getBest250()[['title', 'rating']].groupby('title')
    spread = ratings_by_title.std()
    return spread.sort_values(by="rating", ascending=False)

7- Which movie has the highest average rating?

In [21]:
def ex7():
    """The single movie with the highest mean rating.

    Returns a one-row frame indexed by ``title`` whose ``rating`` column
    holds the mean rating of that movie.
    """
    mean_by_title = getBest250()[['title', 'rating']].groupby('title').mean()
    ranked = mean_by_title.sort_values(by="rating", ascending=False)
    # Only the top row is wanted.
    return ranked.head(1)

***From here on, use all the ratings***

8- Calculate the average rating of each user. 

In [22]:
def ex8():
    """Mean rating given by each user, computed over the whole ``data`` frame.

    Returns a frame indexed by ``user_id`` with the mean in the ``rating``
    column.
    """
    # One row per user; the ratings of each user are averaged.
    return pd.pivot_table(data, values="rating", index="user_id", aggfunc="mean")

9- Define a function called <b>top_movies</b> that, given a DataFrame, a user and a number n, returns the n movies rated highest by that user.

def top_movies(data,user,n)


In [23]:
def top_movies(data_, usr, n_rows=10):
    """Return the movies a user rated highest.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Ratings table; must contain ``user_id``, ``title`` and ``rating``.
    usr : value compared against the ``user_id`` column
        The user whose ratings are wanted.
    n_rows : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``title`` and ``rating`` columns of that user's rows, sorted by
        rating in descending order and cut to the first ``n_rows`` rows.
    """
    # Filter the frame that was passed in (not the module-level ``data``),
    # keeping only the two columns of interest.
    user_ratings = data_.loc[data_['user_id'] == usr, ['title', 'rating']]
    ranked = user_ratings.sort_values(by='rating', ascending=False)
    return ranked.head(n_rows)
def ex9():
    """Top 30 movies of user 1, taken from the module-level ``data`` frame."""
    return top_movies(data, usr=1, n_rows=30)


**Data from CSV**

10- Read the data from the CSV file `ma-ba.csv`. Count the number of times `Barça` beats `Madrid` and compute the statistics: % win, % lose and % draw.

In [24]:
def ex10():
    """Load ``ma-ba.csv`` and summarise wins, losses and draws per team.

    The file is read without promoting any column to the index, then passed
    through ``dataCleaning`` (column names and split columns),
    ``selectWinner`` (adds the winner of each match) and ``countData``
    (builds the summary frame that is returned).
    """
    # index_col=None: the file already carries its own match-number column.
    raw_matches = pd.read_csv('ma-ba.csv', engine='python', index_col=None)
    return countData(selectWinner(dataCleaning(raw_matches)))

def dataCleaning(df):
    """Normalise the column names and split the score and fixture columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw matches table. Its leading columns are renamed positionally to
        ``N, Fecha, Lugar, Partidos, resultado, Competicion``
        (assumes the CSV has these six columns in this order; TODO confirm).

    Returns
    -------
    pandas.DataFrame
        The same frame with four extra columns: ``golesLocal`` and
        ``golesVisitante`` (numeric goals parsed from ``resultado``), and
        ``local`` and ``visitante`` (team names parsed from ``Partidos``).
    """
    # Replace the original headers (which may carry stray spaces) by clean ones.
    rightNames = ["N", "Fecha", "Lugar", "Partidos", "resultado", "Competicion"]
    df.rename(columns=dict(zip(df.columns, rightNames)), inplace=True)

    # "3-4" -> 3 and 4. The goals are converted to numbers so that later
    # comparisons are numeric: as text, "10" would sort before "2".
    # n=1 always yields at most two pieces; unparseable scores become NaN.
    goals = df['resultado'].str.split("-", n=1, expand=True)
    df["golesLocal"] = pd.to_numeric(goals[0].str.strip(), errors="coerce")
    df["golesVisitante"] = pd.to_numeric(goals[1].str.strip(), errors="coerce")

    # "Barcelona-Madrid" -> "Barcelona" and "Madrid" (split on the first "-").
    # Surrounding spaces are left untouched here.
    teams = df['Partidos'].str.split("-", n=1, expand=True)
    df["local"] = teams[0]
    df["visitante"] = teams[1]

    return df

def selectWinner(df):
    """Add a ``Ganador`` column naming the winner of each match.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``golesLocal``, ``golesVisitante`` (goals, as numbers
        or numeric text), ``local`` and ``visitante`` (team names).

    Returns
    -------
    pandas.DataFrame
        The same frame with ``Ganador`` set to ``"Barcelona"``,
        ``"R. Madrid"``, ``"Empate"`` (draw), or a missing value when the
        score could not be read.
    """
    # Compare goals as numbers: as text, "10" would sort before "2".
    # astype(str) lets numeric and text columns share one code path;
    # anything that is not a number becomes NaN.
    home_goals = pd.to_numeric(df["golesLocal"].astype(str).str.strip(), errors="coerce")
    away_goals = pd.to_numeric(df["golesVisitante"].astype(str).str.strip(), errors="coerce")

    # Strip stray spaces so the team names can be matched exactly below.
    home_team = df["local"].str.strip()
    away_team = df["visitante"].str.strip()

    # Home name where the home side scored more, away name where the away
    # side scored more, "Empate" on equal goals, missing otherwise.
    winner = home_team.where(home_goals > away_goals,
                             away_team.where(away_goals > home_goals))
    winner = winner.mask(home_goals == away_goals, "Empate")

    # Every winner that is neither "Barcelona" nor a draw is filed under the
    # single name "R. Madrid", because the Madrid side appears under several
    # spellings. Rows with no determinable winner stay missing instead of
    # being counted as a Madrid win.
    # NOTE(review): this presumes Barcelona is always spelled exactly
    # "Barcelona" in the data; confirm against the CSV.
    is_madrid = winner.notna() & ~winner.isin(["Barcelona", "Empate"])
    df["Ganador"] = winner.mask(is_madrid, "R. Madrid")

    return df

def countData(df):
    """Summarise wins, losses and draws for Barcelona and R. Madrid.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches table with a ``Ganador`` column whose values are
        ``"Barcelona"``, ``"R. Madrid"`` or ``"Empate"``.

    Returns
    -------
    pandas.DataFrame
        Two rows (Barcelona, R. Madrid) with the columns ``Local``, ``Win``,
        ``Lose``, ``Draw``, ``% Wins``, ``% Lose`` and ``% Draw``. The three
        "%" columns hold fractions of the total number of rows (0 to 1),
        or NaN when the frame has no rows.
    """
    # Occurrences of each distinct value of "Ganador".
    tally = df["Ganador"].value_counts()
    # .get with a default: an outcome that never happened counts as 0
    # instead of raising KeyError.
    BWins = int(tally.get("Barcelona", 0))
    MWins = int(tally.get("R. Madrid", 0))
    Draws = int(tally.get("Empate", 0))
    # Every row is one match.
    totalMatches = df.shape[0]

    def share(count):
        # Fraction of all matches; undefined (NaN) when there are no matches.
        return count / totalMatches if totalMatches else float("nan")

    # One team's win is the other team's loss, hence the mirrored columns.
    return pd.DataFrame({'Local': ["Barcelona", "R. Madrid"],
                         'Win': [BWins, MWins],
                         'Lose': [MWins, BWins],
                         'Draw': [Draws, Draws],
                         '% Wins': [share(BWins), share(MWins)],
                         '% Lose': [share(MWins), share(BWins)],
                         '% Draw': [share(Draws), share(Draws)]})


In [13]:
# Run ex1 .. ex10 in order and export each result to CSV. A failing exercise
# is reported and leaves an empty placeholder file so the loop keeps going.
n = 11
for i in range(1, n):
    try:
        # Look the exercise function up by name ("ex1", "ex2", ...) and call it.
        df = globals()["ex{}".format(i)]()
        df.to_csv("{}as.csv".format(i), header=False)
    except Exception as e:
        print(i, repr(e))
        # Opening in append mode creates the file if missing without
        # truncating an existing one.
        # NOTE(review): results go to "<i>as.csv" but the placeholder is
        # "<i>.csv"; confirm the differing names are intended.
        with open("{}.csv".format(i), "a"):
            pass

