# Import necessary dependencies and settings

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as spstats

%matplotlib inline
mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
mpl.rcParams['figure.figsize'] = [6.0, 4.0]
mpl.rcParams['figure.dpi'] = 100

# Raw Measures

## Values

In [2]:
# Read Pokemon.csv into a DataFrame


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False
3,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
4,5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False


In [3]:
# Show the HP, Attack and Defense columns


Unnamed: 0,HP,Attack,Defense
0,45,49,49
1,60,62,63
2,80,82,83
3,39,52,43
4,58,64,58


In [4]:
# Show a description of those columns


Unnamed: 0,HP,Attack,Defense
count,151.0,151.0,151.0
mean,64.211921,72.549669,68.225166
std,28.590117,26.596162,26.916704
min,10.0,5.0,5.0
25%,45.0,51.0,50.0
50%,60.0,70.0,65.0
75%,80.0,90.0,84.0
max,250.0,134.0,180.0


## Counts

Load the song_views.csv dataset and understand the features.

In [5]:
# Read song_views.csv and display it as a DataFrame


Unnamed: 0,user_id,song_id,title,listen_count
0,b6b799f34a204bd928ea014c243ddad6d0be4f8f,SOBONKR12A58A7A7E0,You're The One,2
1,b41ead730ac14f6b6717b9cf8859d5579f3f8d4d,SOBONKR12A58A7A7E0,You're The One,0
2,4c84359a164b161496d05282707cecbd50adbfc4,SOBONKR12A58A7A7E0,You're The One,0
3,779b5908593756abb6ff7586177c966022668b06,SOBONKR12A58A7A7E0,You're The One,0
4,dd88ea94f605a63d9fc37a214127e3f00e85e42d,SOBONKR12A58A7A7E0,You're The One,0


# Binarization

Raw frequencies or counts are often not relevant for building a model, depending on the problem being solved. For instance, if I’m building a recommendation system for songs, I just want to know whether a person is interested in, or has listened to, a particular song. This doesn’t require the number of times a song has been listened to, since I am more concerned with the variety of songs they have listened to. In this case, a binary feature is preferred over a count-based feature. Add this information to the dataset as a new column, listened, that takes the value 1 (True) when the listen count is > 0.


In [6]:
# In the songs DataFrame, add a column that indicates with the value 1 whether the song has ever been listened to


In [7]:
# Show the head to check your results


Unnamed: 0,user_id,song_id,title,listen_count,listened
0,b6b799f34a204bd928ea014c243ddad6d0be4f8f,SOBONKR12A58A7A7E0,You're The One,2,True
1,b41ead730ac14f6b6717b9cf8859d5579f3f8d4d,SOBONKR12A58A7A7E0,You're The One,0,False
2,4c84359a164b161496d05282707cecbd50adbfc4,SOBONKR12A58A7A7E0,You're The One,0,False
3,779b5908593756abb6ff7586177c966022668b06,SOBONKR12A58A7A7E0,You're The One,0,False
4,dd88ea94f605a63d9fc37a214127e3f00e85e42d,SOBONKR12A58A7A7E0,You're The One,0,False


## Binarization with sklearn

Look at the documentation for sklearn.preprocessing, specifically the Binarizer class. Try to use it to obtain a binarization of the song_views dataset.

In [8]:
# Look up the documentation on sklearn preprocessing (specifically, Binarizer)
from sklearn.preprocessing import Binarizer

# Binarizer has its threshold at 0 by default, but we set it explicitly for practice


Binarizer(copy=True, threshold=0)

# Rounding

Load the item_popularity.csv dataset and understand the features.

Unnamed: 0,item_id,pop_percent
0,it_01345,0.98324
1,it_03431,0.56123
2,it_04572,0.12098
3,it_98021,0.35476
4,it_01298,0.92101


Add two new columns to the dataset that express the popularity on a scale of 100 and on a scale of 1000, both stored as integers.

In [17]:
# Display item_df; the rendered output below includes the pop_100 and pop_1000 columns.
# NOTE(review): item_df is not created in any cell visible here — confirm it is defined earlier on a fresh kernel.
item_df

Unnamed: 0,item_id,pop_percent,pop_100,pop_1000
0,it_01345,0.98324,98.32,983.24
1,it_03431,0.56123,56.12,561.23
2,it_04572,0.12098,12.1,120.98
3,it_98021,0.35476,35.48,354.76
4,it_01298,0.92101,92.1,921.01
5,it_90120,0.81212,81.21,812.12
6,it_10123,0.56502,56.5,565.02


# Interactions

Load the pokemon dataset. Build a new data set including only 'Attack' and 'Defense'.

In [18]:
# Load the Pokemon table from disk, decoding the file as Latin-1,
# then preview its first five rows.
pokemon_df = pd.read_csv(filepath_or_buffer='Pokemon.csv', encoding='latin-1')
pokemon_df.head(n=5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Stage,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,2,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,3,False
3,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
4,5,Charmeleon,Fire,,405,58,64,58,80,65,80,2,False


In [20]:
# Preview the first rows of pokemon_df_ad.
# NOTE(review): pokemon_df_ad is not defined in any visible cell (In [19] is missing);
# presumably it is the Attack/Defense subset of pokemon_df — confirm this runs after Restart & Run All.
pokemon_df_ad.head()

Unnamed: 0,Attack,Defense
0,49,49
1,62,63
2,82,83
3,52,43
4,64,58


Build a new DataFrame using the PolynomialFeatures class in sklearn.preprocessing. Use a degree-2 polynomial. Try to understand what is happening.

In [21]:
from sklearn.preprocessing import PolynomialFeatures


['1', 'x0', 'x1', 'x0 x1']

In [25]:
# What we are computing is Attack x Defense, i.e., a measure of the Pokemon's strength