In [465]:
import pandas as pd
import os.path
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


data = pd.read_csv(os.getcwd() + '/steam_games.csv')

data = data.iloc[:5000, :]
print(len(data))
list(data.columns)



5000


['url',
 'types',
 'name',
 'desc_snippet',
 'recent_reviews',
 'all_reviews',
 'release_date',
 'developer',
 'publisher',
 'popular_tags',
 'game_details',
 'languages',
 'achievements',
 'genre',
 'game_description',
 'mature_content',
 'minimum_requirements',
 'recommended_requirements',
 'original_price',
 'discount_price']

In [466]:
# Remove all invalid games, non english games, and non games
data = data[data['name'].str.contains('', na=False)] 
data = data[data['types'].str.contains('app')]
data = data[data['languages'].str.contains('English', na=False)]


# Filtering popular_tags feature
data = data.dropna(subset=['popular_tags'])
# count the number of tags for each row
tag_count = data['popular_tags'].str.count(',')

# remove rows with 3 or less tags
data = data[tag_count > 3]

# Replace all null genres and tags with 'Other'
data['genre'] = data['genre'].fillna('Other')
data['popular_tags'] = data['popular_tags'].fillna('Other')
data['game_details'] = data['game_details'].fillna('Other')

# Replace all no review games with 'Mixed'
data['all_reviews'] = data['all_reviews'].fillna('Mixed')

# Change reviews from strings to numerical values
def check_tuple(tuple_value):
    if 'overhwelmingly positive' in tuple_value.lower():
        return 4
    elif 'very positive' in tuple_value.lower():
        return 3
    elif 'mostly positive' in tuple_value.lower():
        return 1
    elif 'mixed' in tuple_value.lower():
        return 0
    elif 'overwhelmingly negative' in tuple_value.lower():
        return -4
    elif 'very negative' in tuple_value.lower():
        return -3
    elif 'mostly negative' in tuple_value.lower():
        return -1
    elif 'negative' in tuple_value.lower():
        return -2
    elif 'positive' in tuple_value.lower():
        return 2
    else:
        return 0
      
def check_tuple2(tuple_value):
    if 'overwhelmingly positive' in tuple_value.lower():
        return 'Overwhelmingly Positive'
    elif 'very positive' in tuple_value.lower():
        return 'Very Positive'
    elif 'mostly positive' in tuple_value.lower():
        return 'Mostly Positive'
    elif 'mixed' in tuple_value.lower():
        return 'Mixed'
    elif 'overwhelmingly negative' in tuple_value.lower():
        return 'Overwhelmingly Negative'
    elif 'very negative' in tuple_value.lower():
        return 'Very Negative'
    elif 'mostly negative' in tuple_value.lower():
        return 'Mostly Negative'
    elif 'negative' in tuple_value.lower():
        return 'Negative'
    elif 'positive' in tuple_value.lower():
        return 'Positive'
    else:
        return 'No Data'
      
data['review_scores'] = data['all_reviews']
data['review_scores'] = data['review_scores'].apply(check_tuple)
data['all_reviews'] = data['all_reviews'].apply(check_tuple2)

# Remove 12 irrelevant features to the training set 
data.drop(['url', 'types', 'recent_reviews', 'achievements', 'original_price', 'recommended_requirements', 'discount_price', 'minimum_requirements', 'mature_content', 'game_description',
           'release_date', 'languages'], axis=1, inplace=True)

print(len(data))

4096


In [467]:
le = LabelEncoder()
data["popular_tags"] = data["popular_tags"].str.split(",")
data["genre"] = data["genre"].str.split(",")
data["game_details"] = data["game_details"].str.split(",")
data["encoded_publisher"]= le.fit_transform(data["publisher"])
data["encoded_developer"]= le.fit_transform(data["developer"])
data["encoded_reviews"]= le.fit_transform(data["all_reviews"])

len(data)

# Apply one-hot encoding to the Genre feature
mlb = MultiLabelBinarizer()
tags_encoded = pd.DataFrame(mlb.fit_transform(data["popular_tags"]), columns=mlb.classes_)
genre_encoded = pd.DataFrame(mlb.fit_transform(data["genre"]), columns=mlb.classes_)
details_encoded = pd.DataFrame(mlb.fit_transform(data["game_details"]), columns=mlb.classes_)

# Combine the encoded features with the original dataset
data = pd.concat([data, tags_encoded, genre_encoded, details_encoded], axis=1)

data.dropna(inplace=True)

X_training_dataset = data.copy()

X_training_dataset.drop(['encoded_publisher', 'encoded_developer', 'desc_snippet', 'all_reviews', 'developer', 'publisher', 'popular_tags', 'game_details', 'genre', 'review_scores'], axis=1, inplace=True)
print(data.iloc[350:400, data.columns.get_loc("name")])
#print(data["name"].head(300))


393                                       BATTALION 1944
394                                     Cities: Skylines
395                                            Stellaris
396                             American Truck Simulator
397                         Sid Meier’s Civilization® VI
398                              DARK SOULS™: REMASTERED
399                              Total War: WARHAMMER II
400                                       Hell Let Loose
401                                  Black Desert Online
403          The Elder Scrolls V: Skyrim Special Edition
404                                            Fallout 4
405                                                 Rust
406                                   Grand Theft Auto V
407                                             Warframe
408                                              MORDHAU
409                            The Witcher® 3: Wild Hunt
410                            Total War: THREE KINGDOMS
411                            

In [468]:
mygames = pd.DataFrame()
"""
ratings = [
    ("DayZ", 9.4),
    ("Grand Theft Auto V", 9.0),
    ("EVE Online", 6.0),
    ("TERA", 7.5),
    ("Call of Duty®: Black Ops", 9.0),
    ("Grand Theft Auto IV", 8.5),
    ("Portal", 2.0),
    ("PLAYERUNKNOWN'S BATTLEGROUNDS", 9.5),
    ("Total War Saga: THRONES OF BRITANNIA", 4.0),
    ("Battle Brothers", 8.5),
    ("The Sims™ 3", 2.0),
    ("The Elder Scrolls V: Skyrim VR", 8.0),
    ("Void Bastards", 1.0),
    ("Fishing Planet", 1.5),
    ("SPORE™", 2.0),
    ("FINAL FANTASY X/X-2 HD Remaster", 6.7),
    ("Dungeon Munchies", 2.5),
    ("Assassin's Creed® Origins", 7.0),
    ("Bloons TD 6", 2.5),
    ("MapleStory 2", 6.0),
    ("Arma 3", 10.0),
    ("Call of Duty®: Modern Warfare® 2", 9.5),
    ("DCS World Steam Edition", 3.0),
    ("Black Desert Online", 9.5),
    ("Rust", 10.0),
    ("Mortal Online", 10.0),
    ("SCUM", 9.0),
    ("UNO", 1.0),
    ("Conan Exiles", 9.5),
    ("Plants vs. Zombies GOTY Edition", 1.5),
    ("Miscreated", 9.5),
]

games = [game for game, rating in ratings]
ratings = [rating for game, rating in ratings]

mygames = X_training_dataset.loc[X_training_dataset["name"].isin(games)]

mygames['ratings'] = ratings
"""
row1 = X_training_dataset.loc[X_training_dataset["name"] == "DayZ"]
row2 = X_training_dataset.loc[X_training_dataset["name"] == "Grand Theft Auto V"]
row3 = X_training_dataset.loc[X_training_dataset["name"] == "EVE Online"]
row4 = X_training_dataset.loc[X_training_dataset["name"] == "TERA"]
row5 = X_training_dataset.loc[X_training_dataset["name"] == "Call of Duty®: Black Ops"]
row6 = X_training_dataset.loc[X_training_dataset["name"] == "Grand Theft Auto IV"]
row7 = X_training_dataset.loc[X_training_dataset["name"] == "Portal"]
row8 = X_training_dataset.loc[X_training_dataset["name"] == "PLAYERUNKNOWN'S BATTLEGROUNDS"]
row9 = X_training_dataset.loc[X_training_dataset["name"] == "Total War Saga: THRONES OF BRITANNIA"]
row10 = X_training_dataset.loc[X_training_dataset["name"] == "Battle Brothers"]
row11 = X_training_dataset.loc[X_training_dataset["name"] == "The Sims™ 3"]
row12 = X_training_dataset.loc[X_training_dataset["name"] == "The Elder Scrolls V: Skyrim VR"]
row13 = X_training_dataset.loc[X_training_dataset["name"] == "Void Bastards"]
row14 = X_training_dataset.loc[X_training_dataset["name"] == "Fishing Planet"]
row15 = X_training_dataset.loc[X_training_dataset["name"] == "SPORE™"]
row16 = X_training_dataset.loc[X_training_dataset["name"] == "FINAL FANTASY X/X-2 HD Remaster"]
row17 = X_training_dataset.loc[X_training_dataset["name"] == "Dungeon Munchies"]
row18 = X_training_dataset.loc[X_training_dataset["name"] == "Assassin's Creed® Origins"]
row19 = X_training_dataset.loc[X_training_dataset["name"] == "Bloons TD 6"]
row20 = X_training_dataset.loc[X_training_dataset["name"] == "MapleStory 2"]
row21 = X_training_dataset.loc[X_training_dataset["name"] == "Arma 3"]
row22 = X_training_dataset.loc[X_training_dataset["name"] == "Call of Duty®: Modern Warfare® 2"]
row23 = X_training_dataset.loc[X_training_dataset["name"] == "DCS World Steam Edition"]
row24 = X_training_dataset.loc[X_training_dataset["name"] == "Black Desert Online"]
row25 = X_training_dataset.loc[X_training_dataset["name"] == "Rust"]
row26 = X_training_dataset.loc[X_training_dataset["name"] == "Mortal Online"]
row27 = X_training_dataset.loc[X_training_dataset["name"] == "SCUM"]
row28 = X_training_dataset.loc[X_training_dataset["name"] == "UNO"]
row29 = X_training_dataset.loc[X_training_dataset["name"] == "Conan Exiles"]
row30 = X_training_dataset.loc[X_training_dataset["name"] == "Plants vs. Zombies GOTY Edition"]
row31 = X_training_dataset.loc[X_training_dataset["name"] == "Miscreated"]
row32 = X_training_dataset.loc[X_training_dataset["name"] == "Counter-Strike: Source"]
row33 = X_training_dataset.loc[X_training_dataset["name"] == "Puzzle Pirates"]



ratings = [9.4, 9.0, 6.0, 7.5, 9.0, 8.5, 2.0, 9.5, 4.0, 8.5, 2.0, 8.0, 1.0, 1.5, 2.0, 6.7, 2.5, 7.0, 2.5, 6.0, 10, 9.5, 3.0, 9.5, 10.0, 10.0, 9.0, 1.0, 9.5, 1.5, 9.5, 9.0, 1.0]
#ratings = [9.4, 8.0, 6.0, 7.5, 8.0, 8.5, 5.0, 8.5, 6.0, 6.5, 2.0]
#mygames = pd.concat([row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, ])

mygames = pd.concat([row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12, row13, row14, row15, row16, row17, row18, row19, row20, row21, row22, row23, row24, row25, row26, row27, row28, row29, row30, row31, row32, row33])

mygames['ratings'] = ratings

#mygames.to_csv('mygames2.csv', index=False)
#print(mygames)

In [469]:
#Training linear regression model
X = mygames.drop(['name', 'ratings'], axis = 1)
y = mygames['ratings']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model object
model = Ridge()

# Train the model using the training set
model.fit(X_train, y_train)

# Evaluate the model's performance on the testing set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")

"""
#Training linear regression model
X = mygames.drop(['name', 'ratings'], axis = 1)
y = mygames['ratings']

# Create a linear regression model
model = LinearRegression()

# Train the model on the dataset
model.fit(X, y)

ratings = model.predict(X_training_dataset.drop('name', axis = 1))

sorted = pd.DataFrame({
    'name': X_training_dataset['name'],
    'tags': data['popular_tags'],
    'rating': ratings
})
sorted = sorted.sort_values(by='rating', ascending=False)
print(sorted.head(10))
"""

Mean Squared Error: 16.2208
Mean Absolute Error: 3.0512
R_2 score: -4.6394


"\n#Training linear regression model\nX = mygames.drop(['name', 'ratings'], axis = 1)\ny = mygames['ratings']\n\n# Create a linear regression model\nmodel = LinearRegression()\n\n# Train the model on the dataset\nmodel.fit(X, y)\n\nratings = model.predict(X_training_dataset.drop('name', axis = 1))\n\nsorted = pd.DataFrame({\n    'name': X_training_dataset['name'],\n    'tags': data['popular_tags'],\n    'rating': ratings\n})\nsorted = sorted.sort_values(by='rating', ascending=False)\nprint(sorted.head(10))\n"