In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [3]:
data = pd.read_csv('../scraping/all_data_batch1_30to60k')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29959 entries, 0 to 29958
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               29959 non-null  object 
 1   release_date        29959 non-null  object 
 2   plays               29959 non-null  object 
 3   playing             29959 non-null  object 
 4   backlogs            29959 non-null  object 
 5   wishlist            29959 non-null  object 
 6   developers          29959 non-null  object 
 7   avg_review          29959 non-null  float64
 8   genres              29959 non-null  object 
 9   platforms           29959 non-null  object 
 10  description         29134 non-null  object 
 11  total_reviews       29959 non-null  object 
 12  total_lists         29959 non-null  object 
 13  category            29959 non-null  object 
 14  main                29959 non-null  object 
 15  ratings_zero_five   29959 non-null  int64  
 16  rati

In [5]:
genre_df = data[['title', 'genres']]
developer_df = data[['title', 'developers']]
platform_df = data[['title', 'platforms']]

In [6]:
genre_df

Unnamed: 0,title,genres
0,Soul Axiom Rebooted,"['Adventure', 'Indie', 'Puzzle']"
1,Panic Station VR,['Indie']
2,LoveR Kiss,"['Adventure', 'Visual Novel']"
3,Boxman's Struggle,"['Adventure', 'Indie', 'Platform']"
4,Kemono Heroes,"['Adventure', 'Indie', 'Platform']"
...,...,...
29954,Distraint,"['Adventure', 'Indie', 'Puzzle']"
29955,Murder,"['Adventure', 'Indie', 'Point-and-Click', 'Puz..."
29956,Yo-Kai Watch Puni-puni,"['Puzzle', 'RPG']"
29957,Playing History - The Plague,"['Adventure', 'Indie']"


In [7]:
row = data['genres'][0]

In [8]:
def make_stringlist_list(string):
    '''This removes square brackets, and splits the string by comma to the create a list'''
    list_of_strings = string[2:-2].replace("'", '').split(',')
    return list_of_strings

def remove_whitespace(list_):
    '''This removes whitesapces from items within lists'''
    empty = []
    
    for i in range(len(list_)):
        item = list_[i].strip()
        empty.append(item)
    return empty

def clean_stringlists(df):
    '''For lists that were imported as strings, this removes square brackers and cleans up trailing whitespaces'''
    string_to_list_df = df.apply(make_stringlist_list)
    remove_whitespace_df = string_to_list_df.apply(remove_whitespace)
    
    return remove_whitespace_df

In [9]:
def make_list_columns_to_lists(df, columns):
    '''This returns a dataframe of columns where lists where imported as strings, and returns them to their list state'''
    cleaned_df = pd.DataFrame()
    
    for col in columns:
        cleaned = clean_stringlists(df[col])
        cleaned_df[col] = cleaned
    return cleaned_df

In [10]:
updated_df = make_list_columns_to_lists(data, ['genres', 'platforms', 'developers'])

In [11]:
updated_df

Unnamed: 0,genres,platforms,developers
0,"[Adventure, Indie, Puzzle]","[Windows PC, Mac, Nintendo Switch]",[Wales Interactive]
1,[Indie],[Windows PC],[]
2,"[Adventure, Visual Novel]","[PlayStation 4, Nintendo Switch]",[Kadokawa Games]
3,"[Adventure, Indie, Platform]",[Windows PC],[OneBlock]
4,"[Adventure, Indie, Platform]","[Nintendo Switch, Google Stadia]","[NIS America, MadGearGames]"
...,...,...,...
29954,"[Adventure, Indie, Puzzle]",[Windows PC],[Jesse Makkonen]
29955,"[Adventure, Indie, Point-and-Click, Puzzle, St...","[Windows PC, Mac, Linux]","[Peter Moorhead, Curve Digital]"
29956,"[Puzzle, RPG]","[Android, iOS]",[]
29957,"[Adventure, Indie]","[Windows PC, Mac]",[Serious Games Interactive]


In [12]:
mlb_genre = MultiLabelBinarizer()
mlb_platform = MultiLabelBinarizer()
mlb_developer = MultiLabelBinarizer()


transfomed_genre = mlb_genre.fit_transform(updated_df['genres'])
transformed_platform = mlb_platform.fit_transform(updated_df['platform'])
transfomed_developer = mlb_developer.fit_transform(updated_df['developer'])

In [14]:
genre_encoded = pd.DataFrame(transfomed, columns=mlb.classes_)

In [16]:
def categorical_encoder(category):
    mlb = MultiLabelBinarizer()
    transformed = mlb.fit_transform(category)
    
    df = pd.DataFrame(transformed, columns=mlb.classes_)
    
    return df, mlb


In [22]:
df, mlb_thing = categorical_encoder(updated_df['developers'])

In [24]:
df.shape

(29959, 10255)

In [15]:
genre_encoded

Unnamed: 0,Unnamed: 1,Adventure,Arcade,Brawler,Card & Board Game,Fighting,Indie,MOBA,Music,Pinball,...,RPG,Racing,Real Time Strategy,Shooter,Simulator,Sport,Strategy,Tactical,Turn Based Strategy,Visual Novel
0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29954,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29955,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
29956,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
29957,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
