In [None]:
#import libraries/modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
#set dataframe visualization options: show up to 1000 columns and 100 rows
for option_name, option_value in (("display.max_columns", 1000), ("display.max_rows", 100)):
    pd.set_option(option_name, option_value)

Part 1 - Number of Owners

In [None]:
#load the dataset from csv into df1
df1 = pd.read_csv(filepath_or_buffer="data_with_codes.csv")
#preview the first row as the cell output
df1.head(n=1)

In [None]:
#use the steam_appid column as the row index of df1
df1 = df1.set_index('steam_appid')

In [None]:
#collect the names of all object-dtype columns ...
object_columns = df1.select_dtypes(include=['object']).columns
#... and remove those columns from df1
df1 = df1.drop(columns=object_columns)

In [None]:
#select features for use in the decision tree:
#every column of df1 except the class column (owners_code) and the columns listed below
x = df1.drop(
    columns=[
        "owners_code",
        "total_recommendations",
        "total_user_reviews",
        "median_playtime",
        "average_playtime",
        "reviews_proportion_negative",
        "reviews_proportion_positive_bin_code",
    ]
)

In [None]:
#class column to predict: owners_code
y = df1.loc[:, "owners_code"]

In [None]:
#split into training (70%) and testing (30%) data, stratified on the class column
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
#entropy-based decision tree (fixed seed) fitted to the training data
clf = DecisionTreeClassifier(random_state=1, criterion="entropy")
clf.fit(X=x_train, y=y_train)

In [None]:
#get list of feature names and class names
feature_names = list(x_train.columns)
#plot_tree expects class names in ascending label order, which is the order of
#clf.classes_; y_train.unique() returns labels in order of first appearance and
#could therefore attach the wrong class name to tree nodes
class_names = clf.classes_
#class labels converted to strings for use as plot_tree's class_names
lista = [str(label) for label in class_names]

In [None]:
#draw the fitted tree on a 60x60 figure, colouring the nodes
plt.figure(figsize=(60, 60))
plot_tree(
    clf,
    filled=True,
    class_names=lista,
    feature_names=feature_names,
)
plt.show()

In [None]:
#render the tree rules as text and save them to a file
file_path = 'decision_tree_owners.txt'
tree_text = export_text(clf, feature_names=feature_names)
with open(file_path, mode='w') as out_file:
    out_file.write(tree_text)

In [None]:
#get feature importances and sort them in descending order (most important first)
importances = clf.feature_importances_
#argsort is ascending, so reverse it to get the descending order
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]
sorted_features = [feature_names[position] for position in indices]

In [None]:
#bar chart of the sorted feature importances, one bar per feature
plt.figure(figsize=(60, 60))
plt.bar(
    range(len(sorted_importances)),
    sorted_importances,
    tick_label=sorted_features,
)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
#rotate the feature names on the x axis by 90 degrees
plt.xticks(rotation=90)
plt.show()

In [None]:
#print the 10 most important features
#(slicing instead of range(10) avoids an IndexError when there are fewer than 10 features)
for feature_name, feature_importance in zip(sorted_features[:10], sorted_importances[:10]):
    print(f"{feature_name}: {feature_importance}")

In [None]:
#predict the test set and measure the share of correct predictions
y_predict = clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
#show the accuracy as the cell output
accuracy

In [None]:
#5-fold stratified cross-validation (shuffled, fixed seed) of the tree on the full data
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)
scores = cross_val_score(clf, x, y, scoring='accuracy', cv=stratified_kfold)
#mean and standard deviation of the fold accuracies
(scores.mean(), scores.std())

In [None]:
#create the grid of hyperparameters to search over
grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 5, 10],
)

In [None]:
#try every configuration in the grid with 5-fold cross-validation on the training data
grid_search = GridSearchCV(estimator=clf, param_grid=grid, cv=5)
grid_search.fit(x_train, y_train)

In [None]:
#get the hyperparameter combination that scored best in the grid search
best_parameters = grid_search.best_params_
#display the selected parameters as the cell output
best_parameters

In [None]:
#fit model with the best parameters found by the grid search
#(unpacked from best_parameters instead of hard-coded values, so the pruned tree
#always matches the current grid-search result)
pruned_dt = DecisionTreeClassifier(criterion="entropy", random_state=1, **best_parameters)
pruned_dt.fit(x_train, y_train)

In [None]:
#draw the pruned tree on a 60x60 figure, colouring the nodes
plt.figure(figsize=(60, 60))
plot_tree(
    pruned_dt,
    filled=True,
    class_names=lista,
    feature_names=feature_names,
)
plt.show()

In [None]:
#render the pruned tree rules as text and save them to a file
file_path = 'decision_tree_owners_pruned.txt'
tree_text = export_text(pruned_dt, feature_names=feature_names)
with open(file_path, mode='w') as out_file:
    out_file.write(tree_text)


In [None]:
#get the pruned tree's feature importances, sorted with the most important first
importances2 = pruned_dt.feature_importances_
#argsort is ascending, so reverse it to get the descending order
indices2 = np.argsort(importances2)[::-1]
sorted_importances2 = importances2[indices2]
sorted_features2 = [feature_names[position] for position in indices2]

In [None]:
#bar chart of the pruned tree's sorted feature importances
plt.figure(figsize=(60, 60))
plt.bar(
    range(len(sorted_importances2)),
    sorted_importances2,
    tick_label=sorted_features2,
)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
#rotate the feature names on the x axis by 90 degrees
plt.xticks(rotation=90)
plt.show()

In [None]:
#print the 10 most important features of the pruned tree
#(slicing instead of range(10) avoids an IndexError when there are fewer than 10 features)
for feature_name, feature_importance in zip(sorted_features2[:10], sorted_importances2[:10]):
    print(f"{feature_name}: {feature_importance}")

In [None]:
#predict the test set with the pruned tree and measure its accuracy
y_predict = pruned_dt.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
#show the accuracy as the cell output
accuracy

In [None]:
#cross-validated accuracy of the pruned tree, reusing the stratified folds defined earlier
scores = cross_val_score(pruned_dt, x, y, scoring='accuracy', cv=stratified_kfold)
#mean and standard deviation of the fold accuracies
(scores.mean(), scores.std())

Part 2 - Proportion of Positive Ratings

In [None]:
#feature frame for the rating model:
#every column of df1 except the class column and the columns listed below
x2 = df1.drop(
    columns=[
        "reviews_proportion_positive",
        "reviews_proportion_positive_bin_code",
        "reviews_proportion_negative",
        "total_recommendations",
        "proportion_recommended",
    ]
)

In [None]:
#class column to predict: reviews_proportion_positive_bin_code
y2 = df1.loc[:, "reviews_proportion_positive_bin_code"]

In [None]:
#split into training (70%) and testing (30%) data, stratified on the class column
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x2,
    y2,
    test_size=0.3,
    stratify=y2,
    random_state=1,
)

In [None]:
#entropy-based decision tree (fixed seed) fitted to the rating training data
clf2 = DecisionTreeClassifier(random_state=1, criterion="entropy")
clf2.fit(X=x_train2, y=y_train2)

In [None]:
#get list of feature names and class names
feature_names2 = list(x_train2.columns)
#plot_tree expects class names in ascending label order, which is the order of
#clf2.classes_; y_train2.unique() returns labels in order of first appearance and
#could therefore attach the wrong class name to tree nodes
class_names2 = clf2.classes_
#class labels converted to strings for use as plot_tree's class_names
lista2 = [str(label) for label in class_names2]

In [None]:
#draw the fitted rating tree on a 60x60 figure, colouring the nodes
plt.figure(figsize=(60, 60))
plot_tree(
    clf2,
    filled=True,
    class_names=lista2,
    feature_names=feature_names2,
)
plt.show()

In [None]:
#export the rating tree as text and save it to a file
tree_text2 = export_text(clf2, feature_names=feature_names2)
file_path = 'decision_tree_rating.txt'
with open(file_path, 'w') as file:
    #write tree_text2 (this tree); tree_text holds the owners tree from Part 1
    file.write(tree_text2)

In [None]:
#get feature importances and sort them in descending order (most important first)
importances3 = clf2.feature_importances_
#argsort is ascending, so reverse it to get the descending order
indices3 = np.argsort(importances3)[::-1]
sorted_importances3 = importances3[indices3]
sorted_features3 = [feature_names2[position] for position in indices3]

In [None]:
#bar chart of the rating tree's sorted feature importances
plt.figure(figsize=(60, 60))
plt.bar(
    range(len(sorted_importances3)),
    sorted_importances3,
    tick_label=sorted_features3,
)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
#rotate the feature names on the x axis by 90 degrees
plt.xticks(rotation=90)
plt.show()

In [None]:
#print the 10 most important features of the rating tree
#(slicing instead of range(10) avoids an IndexError when there are fewer than 10 features)
for feature_name, feature_importance in zip(sorted_features3[:10], sorted_importances3[:10]):
    print(f"{feature_name}: {feature_importance}")

In [None]:
#predict the rating test set and measure the share of correct predictions
y_predict2 = clf2.predict(x_test2)
accuracy = accuracy_score(y_true=y_test2, y_pred=y_predict2)
#show the accuracy as the cell output
accuracy

In [None]:
#5-fold stratified cross-validation (shuffled, fixed seed) of the rating tree on the full data
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)
scores = cross_val_score(clf2, x2, y2, scoring='accuracy', cv=stratified_kfold)
#mean and standard deviation of the fold accuracies
(scores.mean(), scores.std())

In [None]:
#grid of hyperparameters to search over for the rating tree
grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 5, 10],
)

In [None]:
#try every configuration in the grid with 5-fold cross-validation on the rating training data
grid_search = GridSearchCV(estimator=clf2, param_grid=grid, cv=5)
grid_search.fit(x_train2, y_train2)

In [None]:
#get the hyperparameter combination that scored best in the grid search
best_parameters = grid_search.best_params_
#display the selected parameters as the cell output
best_parameters

In [None]:
#fit a tree that uses the best grid-search parameters to the rating training data
pruned_dt2 = DecisionTreeClassifier(random_state=1, criterion="entropy", **best_parameters)
pruned_dt2.fit(X=x_train2, y=y_train2)

In [None]:
#draw the pruned rating tree on a 60x60 figure, colouring the nodes
plt.figure(figsize=(60, 60))
plot_tree(
    pruned_dt2,
    filled=True,
    class_names=lista2,
    feature_names=feature_names2,
)
plt.show()

In [None]:
#export the pruned rating tree as text and save it to a file
#(export pruned_dt2 here; clf2 is the unpruned tree, already saved to decision_tree_rating.txt)
tree_text2 = export_text(pruned_dt2, feature_names=feature_names2)
file_path = 'decision_tree_rating_pruned.txt'
with open(file_path, 'w') as file:
    #write tree_text2 (this tree); tree_text holds the owners tree from Part 1
    file.write(tree_text2)

In [None]:
#get the pruned rating tree's feature importances, sorted with the most important first
importances4 = pruned_dt2.feature_importances_
#argsort is ascending, so reverse it to get the descending order
indices4 = np.argsort(importances4)[::-1]
sorted_importances4 = importances4[indices4]
sorted_features4 = [feature_names2[position] for position in indices4]

In [None]:
#bar chart of the pruned rating tree's sorted feature importances
plt.figure(figsize=(60, 60))
plt.bar(
    range(len(sorted_importances4)),
    sorted_importances4,
    tick_label=sorted_features4,
)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
#rotate the feature names on the x axis by 90 degrees
plt.xticks(rotation=90)
plt.show()

In [None]:
#print the 10 most important features of the pruned rating tree
#(slicing instead of range(10) avoids an IndexError when there are fewer than 10 features)
for feature_name, feature_importance in zip(sorted_features4[:10], sorted_importances4[:10]):
    print(f"{feature_name}: {feature_importance}")

In [None]:
#predict the rating test set with the pruned tree and measure its accuracy
y_predict2 = pruned_dt2.predict(x_test2)
accuracy = accuracy_score(y_true=y_test2, y_pred=y_predict2)
#show the accuracy as the cell output
accuracy

In [None]:
#cross-validated accuracy of the pruned rating tree, reusing the stratified folds defined earlier
scores = cross_val_score(pruned_dt2, x2, y2, scoring='accuracy', cv=stratified_kfold)
#mean and standard deviation of the fold accuracies
(scores.mean(), scores.std())