# Cas Kaggle: 
## Preferències d'una persona en pel·lícules a partir de la seva personalitat.

Nom: Gerard Asbert Marcos

Niu: 1603295

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer

# Personality survey: read the CSV and discard its trailing 26 columns.
dataset0 = pd.read_csv("2018-personality-data.csv").pipe(
    lambda frame: frame.drop(columns=frame.columns[-26:])
)

# Ratings table: read the CSV and discard its last column.
dataset1 = pd.read_csv("2018_ratings.csv").pipe(
    lambda frame: frame.drop(columns=frame.columns[-1])
)

# Cell output: first rows of the ratings table.
dataset1.head()


In [None]:
# Cell output: first five rows of the personality table.
dataset0.head(n=5)

In [None]:
# Columns whose header carries a leading space that only has to be removed.
column_fixes = {
    f' {name}': name
    for name in (
        'movie_id',
        'rating',
        'openness',
        'agreeableness',
        'emotional_stability',
        'extraversion',
        'assigned metric',
        'assigned condition',
    )
}
# NOTE(review): the target name below is misspelled ('conscentiousness').
# It is kept as-is because other cells use the same spelling.
column_fixes[' conscientiousness'] = 'conscentiousness'

# Join every rating with the personality data of the user who gave it.
dataset = dataset1.merge(dataset0, on='userid').rename(columns=column_fixes)
dataset.head()

In [None]:
# Show the column names of the merged frame as a plain Python list.
print(list(dataset.columns))

Ara que ja tenim les dades que considerem necessàries en un sol dataframe, el primer que farem serà passar els atributs 'assigned metric' i 'assigned condition' a valors numèrics.

assigned metric: serendipity, popularity, diversity, all
assigned condition: high, medium, low, default

In [None]:
# Binarize 'assigned metric' into one indicator column per category.
encoder = LabelBinarizer()
transformed = encoder.fit_transform(dataset['assigned metric'])
data0 = pd.DataFrame(transformed)

# Swap the original text column for the indicator columns, which arrive
# labelled 0..3 and are given readable names here.
dataset = pd.concat(
    [dataset.drop(columns=['assigned metric']), data0],
    axis=1,
)
dataset = dataset.rename(
    columns=dict(enumerate(['all', 'diversity', 'popularity', 'serendipity']))
)
dataset.head()

In [None]:
# Binarize 'assigned condition' into one indicator column per category.
encoder = LabelBinarizer()
transformed = encoder.fit_transform(dataset['assigned condition'])
data0 = pd.DataFrame(transformed)

# Swap the original text column for the indicator columns, which arrive
# labelled 0..3 and are given readable names here.
dataset = pd.concat(
    [dataset.drop(columns=['assigned condition']), data0],
    axis=1,
)
dataset = dataset.rename(
    columns=dict(enumerate(['default', 'high', 'low', 'medium']))
)

# Discard the 'default' indicator and the user identifier.
dataset = dataset.drop(columns=["default", "userid"])

dataset.head()

Com que l'únic que ens interessa de la persona són les dades sobre la seva personalitat, ens hem desfet del userid. També ens hem desfet de l'atribut default, ja que és el mateix que l'atribut all.

També, passem l'atribut objectiu 'rating' a ints, per poder-los utilitzar com a classes, i així convertint-ho en un problema de classificació.

In [None]:
# Ratings come in half-star steps from 0.5 to 5, so doubling them gives the
# integer class labels 1..10 used to treat this as a classification problem.
dataset['rating'] = (dataset['rating'] * 2).astype(int)

# Split the frame into feature matrix and target vector. The split is derived
# from the column name rather than from a hard-coded positional mask, so it
# stays correct if columns are added, removed or reordered.
data = dataset.values
x = data[:, dataset.columns != 'rating']
y = data[:, dataset.columns.get_loc('rating')]

dataset.head()

In [None]:
# Print the pandas summary (columns, dtypes, non-null counts) of the prepared frame.
dataset.info()

In [None]:
# Descriptive statistics with every value rendered to two decimals.
dataset.describe().apply(
    lambda column: column.map('{0:.2f}'.format)
)

### Heatmaps + Pairplots

In [None]:
# Pairwise correlation between all columns; the heatmap shows absolute
# values so that strong negative and positive correlations look alike.
co = dataset.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    co.abs(),
    annot=True,
    linewidths=0.5,
    annot_kws={"size": 8},
)

In [None]:
# One histogram per column of the prepared frame.
dataset.hist(figsize=(12, 12), ylabelsize=8)

In [None]:
# Pairwise scatter plots of every column against every other column.
rel = sns.pairplot(data=dataset)

### Normalització + Crossvalidation

Primer escollirem el tipus de normalització que volem utilitzar en el nostre dataset entre aquestes 8 opcions: Sense Normalització, Minmax Scaler, Standard Scaler, Robust Scaler, Yeo-Johnson Scaler (PowerTransformer), Quantile Transformer (Uniform), Quantile Transformer (Gaussian), Normalizer. Per mesurar quina normalització és la millor, hi aplicarem una regressió logística simple i veurem quina dona una millor accuracy.

In [None]:
# Number of samples per rating class.
dataset.loc[:, 'rating'].value_counts()

Com es pot veure, les dades estan desbalancejades, ja que hi ha aproximadament 11 vegades més instàncies amb rating 8 que amb rating 1, i per tant utilitzaré un stratified k-fold.

In [None]:
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
from statistics import mean

# Cast features and labels to float arrays.
x = x.astype(float)
y = y.astype(float)

# Candidate feature transformations to compare.
minmax = sklearn.preprocessing.MinMaxScaler()

standard = sklearn.preprocessing.StandardScaler()

robust = sklearn.preprocessing.RobustScaler()

yeoJohnson = sklearn.preprocessing.PowerTransformer()

quantileUniform = sklearn.preprocessing.QuantileTransformer()

quantileGaussian = sklearn.preprocessing.QuantileTransformer(output_distribution='normal')

normalizer = sklearn.preprocessing.Normalizer()


# Baseline classifier used to score each transformation.
logReg = sklearn.linear_model.LogisticRegression(max_iter=1000)

# Stratified 5-fold split. The fixed seed makes every `skf.split(x, y)` call
# produce the same folds, so all scalers and models are scored on identical
# splits and the results are reproducible between runs.
skf = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold accuracies collected for each transformation.
accuracy_sense = []
accuracy_minmax = []
accuracy_standard = []
accuracy_robust = []
accuracy_yeoJohnson = []
accuracy_quantUniform = []
accuracy_quantGaussian = []
accuracy_normalizer = []

# Disabled experiments, kept inside a string literal so they are not executed:
# cross-validated logistic-regression accuracy without scaling and with the
# MinMax, Standard and Robust scalers.
'''
#Sense normalització
for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_sense.append(logReg.score(x_test, y_test))

print('\nSense Scaler:')
print('\nMaximum Accuracy that can be obtained from this model is:',
      max(accuracy_sense)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_sense)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_sense)*100, '%')
    

#MinMax

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    x_train = minmax.fit_transform(x_train)
    x_test = minmax.transform(x_test)
    
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_minmax.append(logReg.score(x_test, y_test))

print('\nMinMax Scaler')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_minmax)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_minmax)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_minmax)*100, '%')


#Standard_Scaler

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_train = standard.fit_transform(x_train)
    x_test = standard.transform(x_test)
    logReg.fit(x_train, y_train)
    accuracy_standard.append(logReg.score(x_test, y_test))

print('\nStandard Scaler:')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_standard)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_standard)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_standard)*100, '%')

#Robust_Scaler

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    
    x_train = robust.fit_transform(x_train)
    x_test = robust.transform(x_test)
    
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_robust.append(logReg.score(x_test, y_test))

print('\nRobust Scaler:')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_robust)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_robust)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_robust)*100, '%')
'''

# Yeo-Johnson (PowerTransformer): the transformer is fitted on each training
# fold only and then applied to the matching held-out fold.

for train_index, test_index in skf.split(x, y):
    y_train = y[train_index]
    y_test = y[test_index]

    x_train = yeoJohnson.fit_transform(x[train_index])
    x_test = yeoJohnson.transform(x[test_index])

    # Train on this fold and record the accuracy on its held-out part.
    accuracy_yeoJohnson.append(
        logReg.fit(x_train, y_train).score(x_test, y_test)
    )

# Best, worst and mean fold accuracy, reported as percentages.
print('\nYeo-Johnson Scaler:')
print('\nMaximum Accuracy That can be obtained from this model is:', max(accuracy_yeoJohnson)*100, '%')
print('\nMinimum Accuracy:', min(accuracy_yeoJohnson)*100, '%')
print('\nOverall Accuracy:', mean(accuracy_yeoJohnson)*100, '%')

# Disabled experiments, kept inside a string literal so they are not executed:
# cross-validated logistic-regression accuracy with the uniform and Gaussian
# QuantileTransformer and with the Normalizer.
'''
#Quantile Transformer (Uniform)

for train_index, test_index in skf.split(x, y):   
    x_train, x_test = x[train_index], x[test_index]
    
    x_train = quantileUniform.fit_transform(x_train)
    x_test = quantileUniform.transform(x_test)
    
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_quantUniform.append(logReg.score(x_test, y_test))

print('\nQuantile Transformer (Uniform):')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_quantUniform)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_quantUniform)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_quantUniform)*100, '%')


#Quantile Transformer (Gaussian)

for train_index, test_index in skf.split(x, y):   
    x_train, x_test = x[train_index], x[test_index]
    
    x_train = quantileGaussian.fit_transform(x_train)
    x_test = quantileGaussian.transform(x_test)
    
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_quantGaussian.append(logReg.score(x_test, y_test))

print('\nQuantile Transformer (Gaussian):')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_quantGaussian)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_quantGaussian)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_quantGaussian)*100, '%')



#Normalizer

for train_index, test_index in skf.split(x, y):   
    x_train, x_test = x[train_index], x[test_index]
    
    x_train = normalizer.fit_transform(x_train)
    x_test = normalizer.transform(x_test)
    
    y_train, y_test = y[train_index], y[test_index]
    logReg.fit(x_train, y_train)
    accuracy_normalizer.append(logReg.score(x_test, y_test))

print('\nNormalizer:')
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(accuracy_normalizer)*100, '%')
print('\nMinimum Accuracy:',
      min(accuracy_normalizer)*100, '%')
print('\nOverall Accuracy:',
      mean(accuracy_normalizer)*100, '%')
'''

La que ha donat millor resultat ha sigut la normalització Yeo-Johnson; per tant, comentaré totes les altres perquè s'executi més ràpid, i mostraré els resultats obtinguts amb els altres tipus de normalització a la memòria.

### PCA

In [None]:
from sklearn.decomposition import PCA 

# Full PCA: as many components as the frame has columns.
# NOTE(review): the decomposition is fitted on `dataset` as it stands, i.e.
# unscaled and still containing the 'rating' target, so columns with a large
# numeric range can dominate the first component — confirm this is intended.
pca = PCA(n_components=len(dataset.columns)).fit(dataset)

# Variance share of each component, followed by the component loadings.
print(pca.explained_variance_ratio_, pca.components_, sep="\n")

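El que podem observar en els resultats del PCA és que el primer Principal Component és bàsicament l'únic que importa, ja que té moltíssima més variància que el segon i els altres, i que l'atribut que aporta la majoria de variància a aquest primer principal component és el movie_id. Cal tenir en compte que el PCA s'ha ajustat sobre les dades sense escalar (i incloent-hi el rating), de manera que aquest domini del movie_id pot ser degut simplement al fet que és l'atribut amb el rang de valors més gran.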

### Feature Selection

In [None]:
# Disabled experiment, kept inside a string literal so it is not executed:
# recursive feature elimination (RFE) down to 10 features around logReg.
# NOTE(review): as written, the selector is fitted on the full x/y and then
# applied to whatever x_train/x_test an earlier cell left behind.
'''
from sklearn.feature_selection import RFE

selector = RFE(logReg, n_features_to_select=10)
selector = selector.fit(x, y)
selector.support_
x_train = selector.transform(x_train)
x_test = selector.transform(x_test)
logReg.fit(x_train, y_train)
print(logReg.score(x_test, y_test))
'''

He provat de fer feature selection amb el wrapper RFE, però el resultat ha sigut pitjor que el que teníem abans (per tant ho deixem comentat, ja que tarda 12 minuts a acabar). També provaré intrinsic feature selection, però ho faré en la model selection provant el random forest classifier.

### Outlier Removal

Hem provat diferents algoritmes d'outlier removal amb l'esperança de millorar l'accuracy i de reduir la mida de les dades. Aquests algoritmes són: Isolation Forest, OneClassSvm, OneClassSvm(polynomial), LocalOutlierFactor, LocalOutlierFactor(BallTree) i Elliptic Envelope. (Com que han tardat molt a executar-se i tots han donat una accuracy pitjor que la que teníem amb outliers, els deixem comentats i mostrem els resultats a la memòria.)

Primer observem les dades amb histogrames ara que ja estan normalitzades.

In [None]:
# Wrap the training matrix left by the last cross-validation fold in a
# labelled frame so each feature gets its own titled histogram.
df2 = pd.DataFrame(
    x_train,
    columns=[
        'movie_id',
        'openness',
        'agreeableness',
        'emotional_stability',
        'conscentiousness',
        'extraversion',
        'all',
        'diversity',
        'popularity',
        'serendipity',
        'high',
        'low',
        'medium',
    ],
)
df2.hist(figsize=(12, 12), ylabelsize=8)

In [None]:

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Outlier detectors that were compared.
isoForest = IsolationForest()
svm = OneClassSVM()
svmPoly = OneClassSVM(kernel="poly")
localOutlier = LocalOutlierFactor()
localBallTree = LocalOutlierFactor(algorithm="ball_tree")
ellipEnvelope = EllipticEnvelope()

# Disabled experiment, kept inside a string literal so it is not executed:
# each detector labels the training rows, and the rows labelled 1 are kept
# as a filtered training set. The last two statements overwrite the training
# data with the EllipticEnvelope result; the label assignment used to be
# misspelled ('y_trian'), which would have left y_train unfiltered and out of
# step with x_train once the code was re-enabled.
'''
outliersIso = isoForest.fit_predict(x_train)
outliersSvm = svm.fit_predict(x_train)
outliersSvmPoly = svmPoly.fit_predict(x_train)
outliersLocal = localOutlier.fit_predict(x_train)
outliersBallTree = localBallTree.fit_predict(x_train)
outliersElliptic = ellipEnvelope.fit_predict(x_train)


print(f"Shape amb outliers: {x_train.shape}")
x_trainIso = x_train[outliersIso == 1]
y_trainIso = y_train[outliersIso == 1]
print(f"Shape després de IsolationForest: {x_trainIso.shape}")
x_trainSvm = x_train[outliersSvm == 1]
y_trainSvm = y_train[outliersSvm == 1]
print(f"Shape després de OneClassSVM(rbf): {x_trainSvm.shape}")
x_trainSvmPoly = x_train[outliersSvmPoly == 1]
y_trainSvmPoly = y_train[outliersSvmPoly == 1]
print(f"Shape després de OneClassSVM(poly): {x_trainSvmPoly.shape}")
x_trainLocal = x_train[outliersLocal == 1]
y_trainLocal = y_train[outliersLocal == 1]
print(f"Shape després de LocalOutlierFactor(auto): {x_trainLocal.shape}")
x_trainBallTree = x_train[outliersBallTree == 1]
y_trainBallTree = y_train[outliersBallTree == 1]
print(f"Shape després de LocalOutlierFactor(ball_tree): {x_trainBallTree.shape}")
x_trainElliptic = x_train[outliersElliptic == 1]
y_trainElliptic = y_train[outliersElliptic == 1]
print(f"Shape després de EllipticEnvelope: {x_trainElliptic.shape}")

x_train= x_trainElliptic
y_train = y_trainElliptic
'''

In [None]:
# Disabled experiment, kept inside a string literal so it is not executed:
# refits logReg on each outlier-filtered training set and prints its accuracy
# on x_test/y_test.
'''
logReg.fit(x_trainIso, y_trainIso)
print(f"Accuracy amb IsolationForest: {logReg.score(x_test, y_test)}")

logReg.fit(x_trainSvm, y_trainSvm)
print(f"Accuracy amb OneClassSVM(rbf): {logReg.score(x_test, y_test)}")

logReg.fit(x_trainSvmPoly, y_trainSvmPoly)
print(f"Accuracy amb OneClassSVM(poly): {logReg.score(x_test, y_test)}")

logReg.fit(x_trainLocal, y_trainLocal)
print(f"Accuracy amb LocalOutlierFactor(auto): {logReg.score(x_test, y_test)}")

logReg.fit(x_trainBallTree, y_trainBallTree)
print(f"Accuracy amb LocalOutlierFactor(ball_tree): {logReg.score(x_test, y_test)}")

logReg.fit(x_trainElliptic, y_trainElliptic)
print(f"Accuracy amb EllipticEnvelope: {logReg.score(x_test, y_test)}")
'''

In [None]:
# Disabled code, kept inside a string literal so it is not executed: builds
# the per-detector filtered training sets from the `outliers*` label arrays
# and prints the resulting shapes.
# NOTE(review): these statements also appear in the disabled part of the
# detector cell — presumably a leftover duplicate; confirm before re-enabling.
'''
print(f"Shape amb outliers: {x_train.shape}")
x_trainIso = x_train[outliersIso == 1]
y_trainIso = y_train[outliersIso == 1]
print(f"Shape després de IsolationForest: {x_trainIso.shape}")
x_trainSvm = x_train[outliersSvm == 1]
y_trainSvm = y_train[outliersSvm == 1]
print(f"Shape després de OneClassSVM(rbf): {x_trainSvm.shape}")
x_trainSvmPoly = x_train[outliersSvmPoly == 1]
y_trainSvmPoly = y_train[outliersSvmPoly == 1]
print(f"Shape després de OneClassSVM(poly): {x_trainSvmPoly.shape}")
x_trainLocal = x_train[outliersLocal == 1]
y_trainLocal = y_train[outliersLocal == 1]
print(f"Shape després de LocalOutlierFactor(auto): {x_trainLocal.shape}")
x_trainBallTree = x_train[outliersBallTree == 1]
y_trainBallTree = y_train[outliersBallTree == 1]
print(f"Shape després de LocalOutlierFactor(ball_tree): {x_trainBallTree.shape}")
x_trainElliptic = x_train[outliersElliptic == 1]
y_trainElliptic = y_train[outliersElliptic == 1]
print(f"Shape després de EllipticEnvelope: {x_trainElliptic.shape}")
'''

Com que dona millor accuracy sense Outlier Removal, no aplicarem cap dels mètodes provats.

### Model Selection

He provat els models classificadors: Logistic Regression, Svc, Linear Svc, Svc(rbf), Svc(Polynomial), KNeighbors Classifier, Perceptron i Random Forest Classifier.

Petit comentari: els models que he deixat comentats són els que tarden uns dies, per si ho has d'executar. A la memòria ja mostro els resultats i el temps que han tardat.

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Candidate classifiers compared in the cells below.

# Linear models.
percep = Perceptron()
linearsvc = LinearSVC(max_iter=100_000)

# Kernel support vector machines.
svc = SVC(kernel="linear")
svcRBF = SVC(kernel="rbf")
svcPoly = SVC(kernel="poly")

# Nearest neighbours with the default number of neighbours.
nn = KNeighborsClassifier()

In [None]:

# Per-fold scores of the logistic regression on Yeo-Johnson-transformed data.
# NOTE(review): with average="micro" on a single-label multiclass target,
# precision, recall and F1 are expected to coincide with accuracy; a macro or
# weighted average may be more informative — confirm against the sklearn docs.
accs, f1s, precs, recs = [], [], [], []

for train_index, test_index in skf.split(x, y):
    y_train, y_test = y[train_index], y[test_index]
    x_train = yeoJohnson.fit_transform(x[train_index])
    x_test = yeoJohnson.transform(x[test_index])

    preds = logReg.fit(x_train, y_train).predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))

# Average every metric over the folds.
acc, f1, prec, rec = map(mean, (accs, f1s, precs, recs))

print(f"Logistic Regression: Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")


In [None]:
# Disabled experiment, kept inside a string literal so it is not executed:
# stratified cross-validation of the linear-kernel SVC on Yeo-Johnson data.
'''
accs = []
f1s = []
precs = []
recs = []

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    x_train = yeoJohnson.fit_transform(x_train)
    x_test = yeoJohnson.transform(x_test)
    y_train, y_test = y[train_index], y[test_index]
        
    svc.fit(x_train, y_train)
    preds = svc.predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))
    
acc = mean(accs)
f1 = mean(f1s)
prec = mean(precs)
rec = mean(recs)

print(f"SVC: Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")
'''

In [None]:

# Per-fold scores of LinearSVC on Yeo-Johnson-transformed data.
accs, f1s, precs, recs = [], [], [], []

for train_index, test_index in skf.split(x, y):
    y_train, y_test = y[train_index], y[test_index]
    x_train = yeoJohnson.fit_transform(x[train_index])
    x_test = yeoJohnson.transform(x[test_index])

    preds = linearsvc.fit(x_train, y_train).predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))

# Average every metric over the folds.
acc, f1, prec, rec = map(mean, (accs, f1s, precs, recs))

print(f"linearSVC: Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")


In [None]:
# Disabled experiment, kept inside a string literal so it is not executed:
# stratified cross-validation of the RBF-kernel SVC on Yeo-Johnson data.
'''
accs = []
f1s = []
precs = []
recs = []

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    x_train = yeoJohnson.fit_transform(x_train)
    x_test = yeoJohnson.transform(x_test)
    y_train, y_test = y[train_index], y[test_index]
        
    svcRBF.fit(x_train, y_train)
    preds = svcRBF.predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))
    
acc = mean(accs)
f1 = mean(f1s)
prec = mean(precs)
rec = mean(recs)
    
print(f"SVC(rbf): Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")
'''

In [None]:
# Disabled experiment, kept inside a string literal so it is not executed:
# stratified cross-validation of the polynomial-kernel SVC on Yeo-Johnson data.
'''
accs = []
f1s = []
precs = []
recs = []

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    x_train = yeoJohnson.fit_transform(x_train)
    x_test = yeoJohnson.transform(x_test)
    y_train, y_test = y[train_index], y[test_index]
        
    svcPoly.fit(x_train, y_train)
    preds = svcPoly.predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))
    
acc = mean(accs)
f1 = mean(f1s)
prec = mean(precs)
rec = mean(recs)
    
print(f"SVC(Polynomial): Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")
'''

In [None]:

# Cross-validate a k-nearest-neighbours classifier for k = 1..11 and keep
# each classifier (as fitted on its last fold) indexed by k.
knns = {}
for k in range(1, 12):
    accs, f1s, precs, recs = [], [], [], []
    nn = KNeighborsClassifier(n_neighbors=k)

    for train_index, test_index in skf.split(x, y):
        y_train, y_test = y[train_index], y[test_index]
        x_train = yeoJohnson.fit_transform(x[train_index])
        x_test = yeoJohnson.transform(x[test_index])

        preds = nn.fit(x_train, y_train).predict(x_test)

        accs.append(accuracy_score(y_test, preds))
        f1s.append(f1_score(y_test, preds, average="micro"))
        precs.append(precision_score(y_test, preds, average="micro"))
        recs.append(recall_score(y_test, preds, average="micro"))

    # Average every metric over the folds for this k.
    acc, f1, prec, rec = map(mean, (accs, f1s, precs, recs))
    print(f"Nearest Neighbors(k = {k}): Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
    print("")

    knns[k] = nn
    

In [None]:

# Per-fold scores of the Perceptron on Yeo-Johnson-transformed data.
accs, f1s, precs, recs = [], [], [], []

for train_index, test_index in skf.split(x, y):
    y_train, y_test = y[train_index], y[test_index]
    x_train = yeoJohnson.fit_transform(x[train_index])
    x_test = yeoJohnson.transform(x[test_index])

    preds = percep.fit(x_train, y_train).predict(x_test)

    accs.append(accuracy_score(y_test, preds))
    f1s.append(f1_score(y_test, preds, average="micro"))
    precs.append(precision_score(y_test, preds, average="micro"))
    recs.append(recall_score(y_test, preds, average="micro"))

# Average every metric over the folds.
acc, f1, prec, rec = map(mean, (accs, f1s, precs, recs))

print(f"Perceptron: Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
print("")


In [None]:

from sklearn.ensemble import RandomForestClassifier

# No k-fold here because it is extremely slow: every forest is trained and
# scored on the single train/test split left by the previous cell.
rfcs = {}
for t in range(10, 201, 20):
    rfc = RandomForestClassifier(n_estimators=t)
    preds = rfc.fit(x_train, y_train).predict(x_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="micro")
    prec = precision_score(y_test, preds, average="micro")
    rec = recall_score(y_test, preds, average="micro")

    print(f"Random Forest(trees = {t}): Accuracy = {acc:.3f}; f1_score = {f1:.3f}; Precision = {prec:.3f}; Recall = {rec:.3f};")
    print("")

    # Keep the fitted forest, indexed by its number of trees.
    rfcs[t] = rfc
