# Import libraries

In [None]:
import requests
import pandas as pd
import pymongo

# Set up the MongoDB client and prepare a collection for storing the API responses

In [None]:
# Create the MongoDB client. No host or port is passed, so pymongo's defaults
# apply — presumably a local mongod instance; confirm for your environment.
mc = pymongo.MongoClient()

In [None]:
# Handle to the 'games' database (attribute access is equivalent to mc['games']).
db = mc.games


In [None]:
# Collection that will hold the raw game documents (same as db['ig_data']).
ig_data = db.ig_data

In [None]:
# Number of documents currently stored in the collection; the empty filter
# matches every document.
ig_data.count_documents({})

# Load the IGDB API key from a local secrets file

In [None]:
# Read the key from the local secrets file; strip() drops the trailing
# newline (and any other surrounding whitespace) so it is safe to use as a header value.
with open('.secrets/igdb-api-key') as key_file:
    api_key = key_file.read().strip()


# Prepare variables: the endpoint URL, the request headers carrying the API key, and the years 2007–2018 as Unix timestamp ranges

In [None]:
# Endpoint that every request in this notebook is sent to.
url = "https://api-v3.igdb.com/games"
# The API key read from the secrets file is sent in the 'user-key' header.
headers = {'user-key': api_key}

# One [start, end] pair of Unix timestamps (seconds, UTC) per year, 2007-2018.
# start is Jan 1 00:00:00 and end is Dec 31 00:00:00 of the same year.
dates = [
    [1167609600, 1199059200],  # 2007
    [1199145600, 1230681600],  # 2008
    [1230768000, 1262217600],  # 2009
    [1262304000, 1293753600],  # 2010
    [1293840000, 1325289600],  # 2011
    [1325376000, 1356912000],  # 2012
    [1356998400, 1388448000],  # 2013
    [1388534400, 1419984000],  # 2014
    [1420070400, 1451520000],  # 2015
    [1451606400, 1483142400],  # 2016
    # 2017: the start used to be 1485820800 (Jan 31), which silently dropped
    # every January 2017 release; 1483228800 is Jan 1 like all other years.
    [1483228800, 1514678400],  # 2017
    [1514764800, 1546214400],  # 2018
]

Send a query for the 50 most popular games of a year, repeated 4 times with increasing offsets, for each year.
The results are the top 200 games of each year.

In [None]:

# For every year range, page through the most popular games 50 at a time
# (4 pages -> up to 200 games per year) and store the raw documents in MongoDB.
# NOTE: re-running this cell inserts the same games again; nothing here
# de-duplicates against documents already in the collection.
for year_start, year_end in dates:
    for offset in (0, 50, 100, 150):
        query = {'fields': f"""*; \
              where themes != (42) & first_release_date > {year_start} & first_release_date < {year_end}; \
              sort popularity desc; \
              limit: 50; \
              offset: {offset}"""}
        # A timeout keeps a stalled connection from hanging the notebook forever.
        result = requests.get(url, headers=headers, params=query, timeout=30)
        # Fail loudly on HTTP errors instead of trying to store an error payload.
        result.raise_for_status()
        games = result.json()
        # insert_many rejects an empty list, so skip pages that returned no rows.
        if games:
            ig_data.insert_many(games)

# Load the data into a dataframe and drop rows with null values, keeping only two features: genres and themes.

In [None]:
# Materialise every stored document from the collection, then build the frame from that list.
stored_games = list(ig_data.find())
df = pd.DataFrame(stored_games)

In [None]:
# Keep only the two columns used as features/labels. The explicit copy makes
# clean_df independent of df, so later modifications of clean_df neither
# touch df nor trigger pandas' SettingWithCopyWarning.
clean_df = df[['genres', 'themes']].copy()

In [None]:
# Drop rows where genres or themes is missing. Rebinding to the returned frame
# (instead of inplace=True) avoids mutating a frame derived from df.
clean_df = clean_df.dropna()

In [None]:
def change_genre(values):
    """Return a new list with each item of `values` rendered as 'genre<item>'.

    Every item is converted with str() and prefixed with 'genre', preserving
    the input order, e.g. [1, 4] -> ['genre1', 'genre4'].
    """
    return [f'genre{code!s}' for code in values]
def change_theme(values):
    """Return a new list with each item of `values` rendered as 'theme<item>'.

    Every item is converted with str() and prefixed with 'theme', preserving
    the input order, e.g. [1, 4] -> ['theme1', 'theme4'].
    """
    return [f'theme{code!s}' for code in values]

The default values are stored as lists of numbers, such as [1, 3, 6, 4], where each number represents a genre or theme code. They are converted to lists of strings, such as ['genre1', 'genre3', 'genre6', 'genre4'], for use in one-hot encoding later.

In [None]:
# Replace the numeric codes with prefixed strings so genre and theme labels
# stay distinguishable after encoding. assign() returns a new frame, which
# avoids attribute-style column assignment on a frame derived from df.
# NOTE: re-running this cell prefixes the already-prefixed values again.
clean_df = clean_df.assign(
    genres=clean_df['genres'].apply(change_genre),
    themes=clean_df['themes'].apply(change_theme),
)

In [None]:
clean_df.head()

# Visualize the timeline of release dates

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Interpret the stored release values as seconds since the Unix epoch.
release_seconds = df['first_release_date']
df_release = pd.to_datetime(release_seconds, unit='s', origin='unix')

In [None]:
# Histogram of release dates; title and axis labels let the figure stand
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(df_release, bins=150)
ax.set(title='Release dates of collected games',
       xlabel='First release date',
       ylabel='Number of games');

# Split the dataset for testing, and one hot encode the features

In [None]:
from sklearn.model_selection import train_test_split

# Themes are the features and genres the labels. A fixed random_state makes
# the split (and every metric computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    clean_df['themes'], clean_df['genres'], random_state=42
)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One binarizer per column so the genre and theme vocabularies stay separate.
mlb_themes = MultiLabelBinarizer()
mlb_genres = MultiLabelBinarizer()

# Fit on the training split only, then wrap each indicator matrix in a
# DataFrame whose columns are the labels the binarizer learned.
genre_indicators = mlb_genres.fit_transform(y_train)
y_train_binarizered = pd.DataFrame(genre_indicators, columns=mlb_genres.classes_)
theme_indicators = mlb_themes.fit_transform(X_train)
X_train_binarizered = pd.DataFrame(theme_indicators, columns=mlb_themes.classes_)

# Fit a RandomForest on the training data and make predictions on the test data

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic; a fixed random_state makes the fitted model
# and the reported metrics reproducible across runs.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Train on the one-hot theme features against the one-hot genre labels.
rfc.fit(X=X_train_binarizered, y=y_train_binarizered)

In [None]:
# Encode the test split with the binarizers already fitted on the training
# split (transform only, no refit), so columns line up with the training frames.
X_test_binarizered = pd.DataFrame(
    mlb_themes.transform(X_test),
    columns=mlb_themes.classes_,
)
y_test_binarizered = pd.DataFrame(
    mlb_genres.transform(y_test),
    columns=mlb_genres.classes_,
)

In [None]:
# Predicted genre indicators for the test features, displayed as the cell output.
rfc.predict(X=X_test_binarizered)

# See results of the model built

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-genre precision/recall/F1 on the test split. target_names labels each
# row with its genre string instead of a bare column index.
print(classification_report(
    y_test_binarizered,
    rfc.predict(X_test_binarizered),
    target_names=list(mlb_genres.classes_),
))