In [2]:
import pandas as pd
# DATA MANIPULATION
import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
import datetime

# STATS
from statsmodels.graphics.gofplots import qqplot

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# ML
## PREPROC
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
## METRICS
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
## SUPERVISED MODEL
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
## UNSUPERVISED
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
## TUNING
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

df = pd.read_csv("/home/clement/code/Fholklo/GameForecast/raw_data/games_infos.csv")

In [3]:
import os

##################  VARIABLES  ##################
MODEL_TARGET = os.environ.get("MODEL_TARGET")
GCP_PROJECT = os.environ.get("GCP_PROJECT")
GCP_PROJECT_WAGON = os.environ.get("GCP_PROJECT_WAGON")
GCP_REGION = os.environ.get("GCP_REGION")
BQ_DATASET = os.environ.get("BQ_DATASET")
BQ_REGION = os.environ.get("BQ_REGION")
BUCKET_NAME = os.environ.get("BUCKET_NAME")
INSTANCE = os.environ.get("INSTANCE")

PREFECT_FLOW_NAME = os.environ.get("PREFECT_FLOW_NAME")
PREFECT_LOG_LEVEL = os.environ.get("PREFECT_LOG_LEVEL")

GAR_IMAGE = os.environ.get("GAR_IMAGE")
GAR_MEMORY = os.environ.get("GAR_MEMORY")

############## CONSTANTS ###################
#Path
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".lewagon", "gameforecast", "data")
LOCAL_REGISTRY_PATH =  os.path.join(os.path.expanduser('~'), ".lewagon", "gameforecast", "training_outputs")



#Data
genre_options = ['Action', 'Casual', 'Indie', 'RPG', 'Simulation', 'Adventure',
                 'Strategy', 'Design & Illustration', 'Video Production',
                 'Early Access', 'Massively Multiplayer', 'Free to Play', 'Sports',
                 'Animation & Modeling', 'Utilities', 'Game Development',
                 'Photo Editing', 'Software Training', 'Nudity', 'Violent',
                 'Racing', 'Gore', 'Sexual Content', 'Audio Production',
                 'Web Publishing', 'Movie', 'Education', 'Accounting']
category_options = ['Single-player', 'Steam Cloud', 'Family Sharing', 'Steam Achievements',
                    'Partial Controller Support', 'Full controller support', 'Multi-player',
                    'Steam Trading Cards', 'Steam Workshop', 'Co-op', 'Online Co-op',
                    'Steam Leaderboards', 'PvP', 'Online PvP', 'Remote Play on Phone',
                    'Remote Play on Tablet', 'Remote Play on TV', 'In-App Purchases',
                    'Tracked Controller Support', 'VR Only', 'MMO', 'Cross-Platform Multiplayer',
                    'Stats', 'Includes level editor', 'Shared/Split Screen',
                    'Remote Play Together', 'No', 'VR Supported', 'Captions available',
                    'VR Support', 'Shared/Split Screen PvP', 'Shared/Split Screen Co-op',
                    'Valve Anti-Cheat enabled', 'LAN Co-op', 'Steam Turn Notifications',
                    'HDR available', 'LAN PvP', 'Commentary available', 'Includes Source SDK',
                    'SteamVR Collectibles', 'Mods', 'Mods (require HL2)']

languages_options = ["German", "French", "Italian", 'Spanish - Spain', "Portuguese - Portugal", 'English',
                     'Simplified Chinese', 'Russian', 'Japanese', 'Korean', 'Traditional Chinese',
                     'Portuguese - Brazil', 'Polish', 'Turkish']

required_fields = ['App_ID', 'Developers', 'Publishers', 'Achievements', 'Price']


#Preprocess
FEATURE_SELECTION_V1 = ["App_ID","Release_Date","Price","Supported_Languages","Support_URL","Windows","Mac","Linux","Achievements","Developers","Publishers","Categories","Genres","Positive","Negative"]
FEATURE_SELECTION_V2 = ["App_ID","Name","Release_Date","Price","About_The_Game","Supported_Languages","Header_Image","Support_URL","Windows","Mac","Linux","Positive","Negative","Achievements","Developers","Publishers","Categories","Genres","Tags","Screenshots","Movies"]

UNIQUE_LANGUAGE = ["english",'French', 'german', 'italian', 'spanish - spain',
       'spanish - latin america', 'simplified chinese',
       'traditional chinese', 'russian', 'japanese', 'korean',
       'portuguese - brazil', 'turkish', 'welsh', 'vietnamese', 'danish',
       'portuguese - portugal', 'dutch', 'polish', 'czech', 'ukrainian',
       'arabic', 'bulgarian', 'hungarian', 'greek', 'norwegian',
       'romanian', 'thai', 'finnish', 'swedish', 'croatian', 'estonian',
       'hebrew', 'icelandic', 'latvian', 'lithuanian', 'maori', 'slovak',
       'slovenian', 'indonesian', 'serbian', 'uzbek', 'urdu', 'armenian',
       'igbo', 'sindhi', 'sinhala', 'cherokee', 'galician', 'catalan',
       'afrikaans', 'kannada', 'luxembourgish', 'hindi', 'gujarati',
       'kyrgyz', 'kazakh', 'turkmen', 'kinyarwanda',
       'tajik', 'odia', 'konkani', 'bangla', 'nepali', 'basque',
       'tigrinya', 'swahili', 'punjabi (gurmukhi)', 'punjabi (shahmukhi)',
       'georgian', 'wolof', 'bosnian', 'persian', 'telugu', 'tamil',
       'irish', 'valencian', 'belarusian', 'quechua', 'zulu', 'xhosa',
       'sotho', 'sorani', 'yoruba', 'uyghur', 'scots', 'tswana',
       'filipino', 'mongolian', 'hausa', 'dari', 'azerbaijani', 'amharic',
       'albanian', 'assamese', 'tatar', 'macedonian', 'marathi',
       'malayalam', 'malay', 'maltese', 'khmer', 'german;',
       'hungarian,polish', 'english dutch',
       'traditional chinese (text only)', 'lang_slovakian']

EUROPEAN_LANGUAGES = ["German", "French", "Italian", 'Spanish - Spain', "Portuguese - Portugal"]

TOP_LANGUAGES = 10

percent_categories = 0.1


In [4]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from package.scripts.params import *

def clean_target(data:pd.DataFrame) -> pd.DataFrame :
    '''clean and cut the target'''
    data = data[data['Month'] != 'Last 30 Days']
    data['Month'] = pd.to_datetime(data['Month'])
    counts = data['App_ID'].value_counts()
    data = data[data['App_ID'].isin(counts[counts > 1].index)]
    data = data[(data['Month'] >= '2012-07-01') & (data['Month'] <= '2024-01-31')]

    return data

def only_last_month_v1_target(data:pd.DataFrame) -> pd.DataFrame :
    '''V1 : select only the last 2 month to predict the avg # of players'''

    data = data.groupby('App_ID',sort=False).last().reset_index()
    data.drop(columns='Month',inplace=True)
    return data

def transform_language_features(data_X: pd.DataFrame) -> pd.DataFrame:
    assert isinstance(data_X, pd.DataFrame)

    unique_languages = [lang.title() for lang in UNIQUE_LANGUAGE]
    # initialize lists to store languages and their proportions
    language_proportions = {}

    # calculate proportion for each language
    for lang in unique_languages:
        lang_count = (data_X['Supported_Languages'].str.contains(lang).sum()) / len(data_X)
        language_proportions[lang] = lang_count

    # order depending on language proportions
    sorted_languages = sorted(language_proportions.items(), key=lambda x: x[1], reverse=True)

    # define european language
    language_proportions['European'] = language_proportions['German'] + language_proportions['French'] + language_proportions['Italian'] + language_proportions['Spanish - Spain'] + language_proportions['Portuguese - Portugal']

    # Initialize top languages
    top_languages = []

    # iterate over the languages to add them to the list
    for lang, proportion in sorted_languages:
        if lang not in EUROPEAN_LANGUAGES:
            top_languages.append(lang)
        if len(top_languages) == TOP_LANGUAGES:
            break

    # make every language a colomn
    for lang in top_languages:
        if lang == 'European':  # Utilisez '==' pour la comparaison d'égalité, pas '='
            data_X[lang] = data_X['Supported_Languages'].str.contains("German|French|Italian|Spanish - Spain|Portuguese - Portugal", case=False, regex=True)
        else:
            data_X[lang] = data_X['Supported_Languages'].str.contains(lang, case=False, regex=True)

        data_X[lang] = data_X[lang].astype(int)

    data_X['other_lang'] = ~data_X['Supported_Languages'].str.contains('|'.join(top_languages), case=False, regex=True)
    data_X['other_lang'] = data_X['other_lang'].astype(int)

    data_X = data_X[data_X['English'] != 0]

    return data_X

def assign_category_developer(count:int)-> int:
    if count == 0:
        return "0"
    elif count == 1:
        return "1"
    elif count == 2:
        return "2"
    elif count == 3:
        return "3"
    elif count == 4:
        return "4"
    elif count == 5:
        return "5"
    elif 6 <= count <= 10:
        return "6"
    elif 11 <= count <= 20:
        return "7"
    else:  # Plus de 20
        return "8"

def assign_category_publisher(count:int)-> int:
    if count == 0:
        return "0"
    elif count == 1:
        return "1"
    elif count == 2:
        return "2"
    elif count == 3:
        return "3"
    elif count == 4:
        return "4"
    elif count == 5:
        return "5"
    elif 6 <= count <= 10:
        return "6"
    else:  # Plus de 10
        return "7"

def clean_data(data_X:pd.DataFrame,data_Y:pd.DataFrame) :
    '''clean the features before entering pipelines'''

    Y_clean = clean_target(data_Y)
    y = only_last_month_v1_target(Y_clean)

    data_X = data_X[FEATURE_SELECTION_V1]

    # consistent features - target
    data_X = data_X[data_X['App_ID'].isin(y['App_ID'])]
    data_X['Release_Date'] = pd.to_datetime(data_X['Release_Date'])
    
    data_X['day_sin'] = np.sin(2 * np.pi * data_X['Release_Date'].dt.dayofyear / days_in_year)
    data_X['day_cos'] = np.cos(2 * np.pi * data_X['Release_Date'].dt.dayofyear / days_in_year)
    data_X['month_sin'] = np.sin(2 * np.pi * data_X['Release_Date'].dt.month / months_in_year)
    data_X['month_cos'] = np.cos(2 * np.pi * data_X['Release_Date'].dt.month / months_in_year)
    data_X['year'] = data_X['Release_Date'].dt.year
    data_X.drop(columns="Release_Date",inplace=True)
    
    # keep only games with at least english language
    data_X = transform_language_features(data_X)

    #catégories pour dévelopers
    developer_counts = data_X['Developers'].groupby(data_X['Developers']).transform('count')
    data_X["dev_category"] = developer_counts.apply(assign_category_developer)
    data_X.drop(columns="Developers", inplace=True)

    #catégories pour publisher
    developer_counts = data_X['Publishers'].groupby(data_X['Publishers']).transform('count')
    data_X["publi_category"] = developer_counts.apply(assign_category_developer)
    data_X.drop(columns="Publishers", inplace=True)

    # transform support url with 1 if contains something, 0 otherwise
    data_X.Support_URL = data_X['Support_URL'].apply(lambda x: 0 if x!=x else 1)

    # encode bool values
    data_X.Windows = data_X.Windows.apply(lambda x: 1 if x==True else 0)
    data_X.Linux = data_X.Linux.apply(lambda x: 1 if x==True else 0)
    data_X.Mac = data_X.Mac.apply(lambda x: 1 if x==True else 0)

    # handle categorical columns before encoding
    data_X.Genres.fillna('No',inplace=True)
    data_X.Genres = data_X.Genres.apply(lambda x: ''.join(x).split(','))
    data_X.Categories.fillna('No', inplace=True)
    data_X.Categories = data_X.Categories.apply(lambda x: ''.join(x).split(','))
    # handle numerical columns before encoding
    data_X.loc[:, 'Achievements'] = data_X['Achievements'].fillna(0)

    # Compute Rating for Y_rating target
    data_X['TotalReviews'] = data_X['Positive'] + data_X['Negative']
    data_X['ReviewScore'] = data_X['Positive'] / data_X['TotalReviews']
    data_X['Rating'] = data_X['ReviewScore'] - (data_X['ReviewScore'] - 0.5) * 2 ** (- np.log10(data_X['TotalReviews']) + 1)
    data_X.Rating.fillna(0,inplace=True)

    Y_rating = data_X[['App_ID','Rating']]
    data_X.drop(columns=['TotalReviews', 'ReviewScore','Positive','Negative','Supported_Languages',"Rating"],inplace=True)

    data_X = data_X[data_X.Price != 'None']
    data_X.Price = data_X.Price.astype(dtype='float64')

    data_X.Achievements.replace('None',0,inplace=True)
    data_X.Achievements = data_X.Achievements.astype(dtype='int64')

    exploded_data = data_X.Genres.explode()
    one_hot_encoded_df = pd.get_dummies(exploded_data).groupby(level=0).sum()
    data_X = pd.concat([data_X,one_hot_encoded_df],axis=1)
    data_X.drop(columns='Genres',inplace=True)

    categories = data_X.Categories.explode().value_counts()/len(data_X.Categories) > percent_categories
    true_categories = categories[categories].index.tolist()
    data_X['autre_cat'] = data_X.Categories.apply(lambda x: [c for c in x if c not in true_categories])
    # Filtrer les catégories pour encoder seulement celles présentes dans true_categories
    data_X['Categories'] = data_X.Categories.apply(lambda x: [c for c in x if c in true_categories])
    # Encoder les catégories
    exploded_data = data_X.Categories.explode()
    one_hot_encoded_df = pd.get_dummies(exploded_data).groupby(level=0).sum()

    #mise en forme de "autre"
    data_X["autre_cat"] = data_X["autre_cat"].apply(lambda x: 1 if x else 0)

    # Ajouter les catégories encodées à data_X
    data_X = pd.concat([data_X, one_hot_encoded_df], axis=1)
    data_X.drop(columns=["Categories"],inplace = True)

    data_X.sort_values(by='App_ID',inplace=True)
    Y_rating.sort_values(by='App_ID',inplace=True)
    y.sort_values(by='App_ID',inplace=True)

    y = y[y['App_ID'].isin(data_X['App_ID'])]

    data_X.reset_index(drop=True,inplace=True)
    Y_rating.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)
    data_X.drop(columns="App_ID",inplace=True)
    Y_rating.drop(columns="App_ID",inplace=True)
    y.drop(columns="App_ID",inplace=True)

    return data_X, Y_rating, y

def full_preprocessor():
    """Create a pipeline to preprocess data"""

    # numerical features
    robust_features = ["Price", "Achievements"]
    # numerical pipeline
    scalers = ColumnTransformer([
        ("rob", RobustScaler(), robust_features), # Robust
    ], remainder="passthrough")

    numerical_pipeline = Pipeline([
        ("imputer", KNNImputer()),
        ("scalers", scalers)
    ])
    # categorical features
    ordinal_features = ["publi_category", "dev_category"]
    # categorical pipeline
    encoders = ColumnTransformer([
        ("ordinal",OrdinalEncoder(categories="auto", handle_unknown="use_encoded_value",unknown_value=-1)
         ,ordinal_features)
    ], remainder="passthrough")

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant")),
        ("encoders", encoders)
    ])
    # Full_preprocessor
    preprocessor = ColumnTransformer([
        ("num_pipeline", numerical_pipeline, make_column_selector(dtype_include="number")), # num_features # type: ignore
        ("cat_pipeline", categorical_pipeline, make_column_selector(dtype_exclude="number")) # cat_features # type: ignore
    ], remainder="passthrough").set_output(transform="pandas")


    return preprocessor


In [5]:
y=pd.read_csv("/home/clement/code/Fholklo/GameForecast/raw_data/player_history.csv")

In [6]:
data_X,y_rating,y2 = clean_data(df,y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Month'] = pd.to_datetime(data['Month'])
  lang_count = (data_X['Supported_Languages'].str.contains(lang).sum()) / len(data_X)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
preprocessor = full_preprocessor()

In [8]:
data_X_preprocess = preprocessor.fit_transform(data_X)

In [9]:
data_X_preprocess

Unnamed: 0,num_pipeline__rob__Price,num_pipeline__rob__Achievements,num_pipeline__remainder__Support_URL,num_pipeline__remainder__Windows,num_pipeline__remainder__Mac,num_pipeline__remainder__Linux,num_pipeline__remainder__day_sin,num_pipeline__remainder__day_cos,num_pipeline__remainder__month_sin,num_pipeline__remainder__month_cos,num_pipeline__remainder__year,num_pipeline__remainder__English,num_pipeline__remainder__Simplified Chinese,num_pipeline__remainder__Russian,num_pipeline__remainder__Japanese,num_pipeline__remainder__Korean,num_pipeline__remainder__Traditional Chinese,num_pipeline__remainder__Portuguese - Brazil,num_pipeline__remainder__Polish,num_pipeline__remainder__Turkish,num_pipeline__remainder__Spanish - Latin America,num_pipeline__remainder__other_lang,num_pipeline__remainder__Accounting,num_pipeline__remainder__Action,num_pipeline__remainder__Adventure,num_pipeline__remainder__Animation & Modeling,num_pipeline__remainder__Audio Production,num_pipeline__remainder__Casual,num_pipeline__remainder__Design & Illustration,num_pipeline__remainder__Early Access,num_pipeline__remainder__Education,num_pipeline__remainder__Free to Play,num_pipeline__remainder__Game Development,num_pipeline__remainder__Gore,num_pipeline__remainder__Indie,num_pipeline__remainder__Massively Multiplayer,num_pipeline__remainder__No,num_pipeline__remainder__Nudity,num_pipeline__remainder__Photo Editing,num_pipeline__remainder__RPG,num_pipeline__remainder__Racing,num_pipeline__remainder__Sexual Content,num_pipeline__remainder__Simulation,num_pipeline__remainder__Software Training,num_pipeline__remainder__Sports,num_pipeline__remainder__Strategy,num_pipeline__remainder__Utilities,num_pipeline__remainder__Video Production,num_pipeline__remainder__Violent,num_pipeline__remainder__Web Publishing,num_pipeline__remainder__autre_cat,num_pipeline__remainder__Co-op,num_pipeline__remainder__Family Sharing,num_pipeline__remainder__Full controller support,num_pipeline__remainder__Multi-player,num_pipeline__remainder__Online PvP,num_pipeline__remainder__Partial Controller Support,num_pipeline__remainder__PvP,num_pipeline__remainder__Remote Play Together,num_pipeline__remainder__Single-player,num_pipeline__remainder__Steam Achievements,num_pipeline__remainder__Steam Cloud,num_pipeline__remainder__Steam Leaderboards,num_pipeline__remainder__Steam Trading Cards,cat_pipeline__ordinal__publi_category,cat_pipeline__ordinal__dev_category
0,-0.141667,-0.486486,1.0,1.0,1.0,1.0,-0.851712,0.524010,-5.000000e-01,0.866025,2000.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0
1,-0.408333,-0.486486,0.0,1.0,1.0,1.0,0.999986,0.005376,8.660254e-01,-0.500000,1999.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0
2,-0.408333,-0.486486,0.0,1.0,1.0,1.0,0.872404,-0.488785,5.000000e-01,-0.866025,2003.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0
3,-0.408333,-0.486486,0.0,1.0,1.0,1.0,0.502791,-0.864408,1.224647e-16,-1.000000,2001.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0
4,-0.408333,-0.486486,1.0,1.0,1.0,1.0,-0.860600,0.509282,-5.000000e-01,0.866025,1999.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,7.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13930,-0.250000,0.648649,1.0,1.0,0.0,0.0,0.321058,0.947060,5.000000e-01,0.866025,2024.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,7.0,7.0
13931,-0.296667,-0.081081,0.0,1.0,0.0,0.0,0.337301,0.941397,5.000000e-01,0.866025,2024.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,2.0
13932,-0.599167,-0.351351,1.0,1.0,0.0,0.0,0.103031,0.994678,5.000000e-01,0.866025,2024.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
13933,-0.333333,0.297297,0.0,1.0,0.0,0.0,0.369484,0.929237,5.000000e-01,0.866025,2024.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
data_X_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13935 entries, 0 to 13934
Data columns (total 63 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   num_pipeline__rob__Price                             13935 non-null  float64       
 1   num_pipeline__rob__Achievements                      13935 non-null  float64       
 2   num_pipeline__remainder__App_ID                      13935 non-null  float64       
 3   num_pipeline__remainder__Support_URL                 13935 non-null  float64       
 4   num_pipeline__remainder__Windows                     13935 non-null  float64       
 5   num_pipeline__remainder__Mac                         13935 non-null  float64       
 6   num_pipeline__remainder__Linux                       13935 non-null  float64       
 7   num_pipeline__remainder__English                     13935 non-null  float64       
 

In [21]:
y2

Unnamed: 0,Avg. Players,Peak Players
0,34139.20,53967.0
1,98.17,183.0
2,331.84,552.0
3,8.61,21.0
4,79.06,137.0
...,...,...
13930,7.65,16.0
13931,9.89,32.0
13932,20.40,198.0
13933,26.90,78.0
