# Project in Machine Learning

Loading packages

In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import os
from os.path import join as pjoin

import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
def discretizeAttribution(df, features, excluders):
    """Multi-hot encode comma-separated string columns.

    For every column named in *features*, each cell is split on "," and
    replaced by a float vector with one slot per distinct token found in
    that column (slots are numbered in order of first appearance). A slot
    holds 1 when the token is present in the cell and 0 otherwise. Tokens
    listed in *excluders* get no slot and are ignored.

    Returns a modified copy; *df* itself is left untouched.
    """
    encoded = df.copy()
    for feature in features:
        column = df.get(feature)

        # First pass: assign a slot to every token that is not excluded.
        slots = {}
        for cell in column:
            for token in cell.split(","):
                if token in slots or token in excluders:
                    continue
                slots[token] = len(slots)

        # Second pass: build one indicator vector per row.
        vectors = []
        for cell in column:
            vector = np.zeros(len(slots))
            for token in cell.split(","):
                if token not in excluders:
                    vector[slots[token]] = 1
            vectors.append(vector)

        encoded[feature] = vectors
    return encoded

In [3]:
def strToFloatArray(df, *features):
    """Parse string-encoded vectors into float arrays.

    Each cell of the given columns is expected to be a string whose first
    and last characters are delimiters (they are stripped) and whose
    content is a comma-separated list of numbers. The cell is replaced by
    the corresponding float numpy array.

    Returns a modified copy; *df* itself is left untouched.
    """
    parsed = df.copy()
    for feature in features:
        parsed[feature] = [
            np.array(text[1:-1].split(",")).astype(float)
            for text in df[feature]
        ]
    return parsed
        

def conserveNPC(df, feature, N):
    """Reduce an array-valued column to its first N principal components.

    The per-row vectors of *feature* are stacked into a matrix, passed
    through ``preprocessing.scale`` and projected with a PCA fitted on
    that same scaled matrix. Each cell is replaced by the list of its N
    projected coordinates.

    Returns a modified copy; *df* itself is left untouched.
    """
    reduced = df.copy()
    scaled = preprocessing.scale(np.vstack(reduced[feature]))
    # fit() returns the estimator itself, so fit and projection can be chained.
    projected = PCA(n_components = N).fit(scaled).transform(scaled)
    reduced[feature] = projected.tolist()
    return reduced

def splitArrayIntoColumns(df, *features):
    """Expand array-valued columns into one scalar column per component.

    For every column named in *features*, component ``i`` of the per-row
    vectors is stored in a new column named ``<feature><i>`` and the
    original array-valued column is then removed.

    Returns a modified copy; *df* itself is left untouched.
    """
    expanded = df.copy()
    for feature in features:
        matrix = np.vstack(np.array(expanded[feature]))
        # Iterating over the transpose yields the matrix column by column.
        for position, values in enumerate(matrix.T):
            expanded[feature + str(position)] = values
        expanded = expanded.drop(feature, axis = 1)
    return expanded

In [4]:
def standardize(df, *excludedFeatures):
    """Center and scale every column that is not excluded.

    A column is skipped when any of the *excludedFeatures* strings occurs
    as a substring of its name. Every other column has its mean
    subtracted and is divided by its standard deviation (pandas ``std``).

    Returns a modified copy; *df* itself is left untouched.
    """
    scaled = df.copy()
    for feature in df.keys():
        if any(tag in feature for tag in excludedFeatures):
            continue
        column = scaled[feature]
        scaled[feature] = (column - column.mean()) / column.std()
    return scaled

def getCorrelatedFeatures(df, threshold = 0.8):
    """Print the strongly correlated feature pairs of *df*.

    Every pair of distinct columns is visited once. Two lines are
    printed: the list of ``(feature, earlier_feature)`` pairs whose
    absolute correlation exceeds *threshold*, then the largest absolute
    correlation seen over all visited pairs. Nothing is returned.
    """
    corr = df.corr()
    names = df.keys()
    strong_pairs = []
    strongest = 0
    for current in names:
        for other in names:
            # Only columns that come before `current` are visited, so each
            # unordered pair is inspected once and the diagonal is skipped.
            if current == other:
                break

            strength = abs(corr[current][other])
            if strength > threshold:
                strong_pairs.append((current, other))
            if strength > strongest:
                strongest = strength

    print(strong_pairs)
    print(strongest)
    
def removeFeatureWithMI(df, target, N, worst=True):
    """Select features by their mutual information with *target*.

    Columns are ranked by ``mutual_info_regression`` score, highest
    first, and the returned frame has its columns in that ranked order.

    Parameters
    ----------
    df : DataFrame of features.
    target : target values; a Series or a single-column DataFrame.
    N : non-negative number of features to remove (``worst=True``) or to
        keep (``worst=False``).
    worst : when True, drop the N lowest-scoring features; when False,
        keep only the N highest-scoring ones.

    Returns a new DataFrame; *df* itself is left untouched.
    """
    # Flatten the target so a single-column DataFrame is handed over as a
    # 1-D array instead of a column vector.
    MI = mutual_info_regression(df, np.ravel(target))
    ranking = np.argsort(MI)[::-1]
    if worst:
        # `ranking[:-N]` would be empty for N == 0 (because -0 == 0) and
        # silently drop every feature; count the features to keep instead.
        n_keep = max(len(ranking) - N, 0)
    else:
        n_keep = N
    return df[df.keys()[ranking[:n_keep]]].copy()

In [5]:
def removeOutliers(df, target, *excludedFeatures, threshold=1.5):
    """Drop the rows that contain an IQR outlier, from *df* and *target* alike.

    For every column whose name contains none of the *excludedFeatures*
    substrings, a value is an outlier when it lies more than
    ``threshold * IQR`` above the third quartile or below the first
    quartile. Rows holding at least one outlier are removed. Rows with a
    missing value in *df* or in *target* are removed as well.

    A single row mask is applied to both objects, so the two results
    always share the same index, and surviving cells are never rewritten
    (column dtypes are preserved).

    Parameters
    ----------
    df : DataFrame of features.
    target : Series or DataFrame indexed like *df*.
    *excludedFeatures : substrings of the column names to leave unchecked.
    threshold : multiple of the IQR beyond which a value is an outlier.

    Returns
    -------
    (filtered_df, filtered_target) : copies; the inputs are not modified.
    """
    # Start from the rows that are complete in both inputs.
    keep = df.notna().all(axis=1)
    target_present = target.notna()
    if target_present.ndim > 1:
        target_present = target_present.all(axis=1)
    keep = keep & target_present

    for feature in df.keys():
        if any(excluded in feature for excluded in excludedFeatures):
            continue
        column = df[feature]
        qi, qf = np.quantile(column, [0.25, 0.75])
        IQR = abs(qf - qi)
        is_outlier = (column > qf + threshold * IQR) | (column < qi - threshold * IQR)
        keep = keep & ~is_outlier

    return df.loc[keep].copy(), target.loc[keep].copy()

In [6]:
def open_data(path):
    """Read the three project CSV files found in the folder *path*.

    ``X1.csv`` and ``X2.csv`` are read with their first line as header;
    ``Y1.csv`` is read without a header line. In all three files the
    literal ``\\N`` is treated as a missing value.

    Returns the tuple ``(X1, X2, Y1)`` of DataFrames.
    """
    missing_marker = "\\N"
    X1 = pd.read_csv(pjoin(path, "X1.csv"), na_values=missing_marker)
    X2 = pd.read_csv(pjoin(path, "X2.csv"), na_values=missing_marker)
    Y1 = pd.read_csv(pjoin(path, "Y1.csv"), na_values=missing_marker, header=None)
    return X1, X2, Y1

Loading data and first visualization

In [7]:
# The data folder is machine-specific: try the first location and fall back
# to the second one when it cannot be read.
try:
    X1, X2, Y1 = open_data("C:\\Users\\Louis Lovat\\Desktop\\UNIF\\MachineLearning\\Project\\data")
except OSError:
    # Only a missing or unreadable folder triggers the fallback; any other
    # error (malformed CSV, keyboard interrupt, ...) is no longer swallowed.
    X1, X2, Y1 = open_data("C:\\Users\\hchri\\Jupyter\\Machine learning\\project_data")


# Replacing NA values

# Missing-value count per column of X2 before imputation.
for feature in X2.keys():
    print(feature, "\t", X2[feature].isna().sum())

print("Y1 ", Y1.isna().sum())

# Each frame is imputed with its own statistics: missing runtimes get the
# frame's mean runtime, missing genres get the "Unknown" placeholder.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the column selection, which is a chained
# assignment that is not guaranteed to write through to the frame.
for frame in (X1, X2):
    frame["runtime"] = frame["runtime"].fillna(frame["runtime"].mean())
    print("runtime mean = ", frame["runtime"].mean())
    frame["genres"] = frame["genres"].fillna("Unknown")

# Missing-value count per column of X2 after imputation.
for feature in X2.keys():
    print(feature, "\t", X2[feature].isna().sum())
    

# Get rid of useless/unusable features

# Both frames lose the same four columns, in place.
for frame in (X1, X2):
    frame.drop(["img_url", "description", "title", "is_adult"], axis=1, inplace=True)

# Re-working studio, genres and embeddings to make them usable

# Each step takes the frame produced by the previous one:
# multi-hot encoding -> embedding parsing -> PCA on both embeddings ->
# one column per vector component -> standardization.
X1 = (
    X1.pipe(discretizeAttribution, ["genres", "studio"], ["Unknown"])
    .pipe(strToFloatArray, "img_embeddings", "text_embeddings")
    .pipe(conserveNPC, "img_embeddings", 100)
    .pipe(conserveNPC, "text_embeddings", 9)
    .pipe(splitArrayIntoColumns, "img_embeddings", "text_embeddings", "studio", "genres")
    .pipe(standardize, 'is_adult', 'studio', 'genre')
)
# Report strongly correlated pairs (printed only, nothing is removed here).
getCorrelatedFeatures(X1, threshold = 0.8)
# Drop the 50 features sharing the least mutual information with the target,
# then remove outlier rows from features and target together.
X1 = X1.pipe(removeFeatureWithMI, Y1, 50)
X1, Y1 = removeOutliers(X1, Y1, 'studio', 'genre', 'text_embeddings', 'img_embeddings')

Unnamed: 0 	 0
title 	 0
img_url 	 0
description 	 0
ratings 	 0
n_votes 	 0
is_adult 	 0
production_year 	 0
runtime 	 106
genres 	 4
release_year 	 0
studio 	 0
img_embeddings 	 0
text_embeddings 	 0
Y1  0    0
dtype: int64
runtime mean =  92.29334554334555
runtime mean =  92.01983002832863
Unnamed: 0 	 0
title 	 0
img_url 	 0
description 	 0
ratings 	 0
n_votes 	 0
is_adult 	 0
production_year 	 0
runtime 	 0
genres 	 0
release_year 	 0
studio 	 0
img_embeddings 	 0
text_embeddings 	 0
[]
0.7069068923398097


  return f(**kwargs)


In [8]:
# Bare expression: the notebook renders the preprocessed X1 frame as the cell output.
X1

Unnamed: 0.1,n_votes,Unnamed: 0,release_year,production_year,runtime,text_embeddings7,ratings,img_embeddings81,img_embeddings24,img_embeddings78,...,studio265,studio261,studio260,studio255,studio247,studio171,studio169,studio168,studio350,studio352
1,-0.271738,0.640351,1.366766,-0.073379,-4.528743e-16,0.069660,1.263649,1.732888,-0.751494,0.853321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.215444,-0.711136,-0.798529,-0.242604,4.049382e-01,-0.021932,0.148729,0.546635,0.157666,1.269518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.265480,-0.332204,-2.097705,-1.257954,5.005428e-01,1.287306,-1.995349,-1.034679,-1.038747,0.338826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-0.271075,-1.718208,-0.148940,-2.019467,-1.029131e+00,-0.364239,1.949754,0.811371,-0.064291,-2.302796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.264364,0.915351,-1.231588,-0.581054,3.412018e-01,0.716340,-1.223481,-0.433518,0.469388,0.761941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3532,-0.266178,1.140662,0.392383,0.603521,8.625620e-02,-1.200148,0.577544,1.198356,-1.554367,-1.084819,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3533,-0.266771,-0.985378,0.284118,0.603521,-9.348394e-03,-0.274005,-0.537376,-0.195285,-0.055535,0.573543,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3534,-0.223237,1.443732,0.717177,0.941972,5.642792e-01,-1.338271,0.406018,1.095136,1.085463,-0.738080,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3535,-0.260549,-0.668653,-0.257205,0.011234,5.438800e-02,0.457046,-0.108561,0.367848,-0.628609,-0.881996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:

        
"""#sns.heatmap(abs(corr))

#print(X1['ratings'].to_frame())
corr = X1['ratings'].to_frame().corrwith(Y1)
sorted_arg = np.flip(np.array(np.argsort(corr)))
# print(sorted_arg)
print(corr)
# print(Y1)"""


"#sns.heatmap(abs(corr))\n\n#print(X1['ratings'].to_frame())\ncorr = X1['ratings'].to_frame().corrwith(Y1)\nsorted_arg = np.flip(np.array(np.argsort(corr)))\n# print(sorted_arg)\nprint(corr)\n# print(Y1)"