# Beyond The Crosshair

In [19]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt

## Datasets

### Counter-Strike Playerbase Data from SteamDB

In [46]:
cs_data = pd.read_csv("../assets/SteamDB Counter-Strike.csv")
cs2_data = pd.read_csv("../assets/SteamDB Counter-Strike 2.csv")
css_data = pd.read_csv("../assets/SteamDB Counter-Strike Source.csv")
cscz_data = pd.read_csv("../assets/SteamDB Counter-Strike Condition Zero.csv")

In [47]:
cs_data_with_dash = cs_data[cs_data["gain"] == "-"]
cs_data_with_dash

Unnamed: 0,month,peak,gain,%gain,average,average % gain
241,Jan-04,23241,-,-,-,-


In [48]:
cs_data[['peak', 'gain', 'average']] = cs_data[['peak', 'gain', 'average']].replace({",": "", "-": "0"}, regex=True).astype(float)
cs2_data[['peak', 'gain', 'average']] = cs2_data[['peak', 'gain', 'average']].replace({",": "", "-": "0"}, regex=True).astype(float)
css_data[['peak', 'gain', 'average']] = css_data[['peak', 'gain', 'average']].replace({",": "", "-": "0"}, regex=True).astype(float)
cscz_data[['peak', 'gain', 'average']] = cscz_data[['peak', 'gain', 'average']].replace({",": "", "-": "0"}, regex=True).astype(float)

In [49]:
fields_game_versions = ['Counter-Strike', 'Counter-Strike 2', 'Counter-Strike Source', 'Counter-Strike Condition Zero']
fields_to_keep = ['month', 'peak', 'gain', 'average'] + fields_game_versions
cs_data['Counter-Strike'] = 'yes'
cs2_data['Counter-Strike 2'] = 'yes'
css_data['Counter-Strike Source'] = 'yes'
cscz_data['Counter-Strike Condition Zero'] = 'yes'
cs_merged = pd.concat([cs_data.iloc[1:], cs2_data.iloc[1:], css_data.iloc[1:], cscz_data.iloc[1:]], ignore_index=True)
cs_merged = cs_merged[fields_to_keep]
cs_merged.head()

Unnamed: 0,month,peak,gain,average,Counter-Strike,Counter-Strike 2,Counter-Strike Source,Counter-Strike Condition Zero
0,Aug-25,12680.0,1946.0,7007.0,yes,,,
1,Jul-25,14626.0,1104.0,7412.0,yes,,,
2,Jun-25,15730.0,81.0,8251.0,yes,,,
3,May-25,15811.0,2394.0,9224.0,yes,,,
4,Apr-25,18205.0,430.0,9653.0,yes,,,


In [50]:
SteamDB_merged_grouped = cs_merged.groupby(cs_merged.columns[0]).sum()
SteamDB_merged_grouped[fields_game_versions] = SteamDB_merged_grouped[fields_game_versions].replace(0, "no")
SteamDB_merged_grouped

Unnamed: 0_level_0,peak,gain,average,Counter-Strike,Counter-Strike 2,Counter-Strike Source,Counter-Strike Condition Zero
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Apr-04,84182.0,35217.0,0.0,yes,no,no,yes
Apr-05,144177.0,24782.0,0.0,yes,no,yes,yes
Apr-07,131302.0,114236.0,0.0,yes,no,yes,yes
Apr-09,115364.0,64362.0,0.0,yes,no,yes,yes
Apr-10,167923.0,8719.0,0.0,yes,no,yes,yes
...,...,...,...,...,...,...,...
Sep-20,1003755.0,49306.0,0.0,yes,yes,yes,yes
Sep-21,970788.0,146610.0,0.0,yes,yes,yes,yes
Sep-22,1124191.0,63558.0,0.0,yes,yes,yes,yes
Sep-23,1580157.0,23118.0,986669.0,yes,yes,yes,yes


In [51]:
SteamDB_merged_grouped.to_csv("../assets/SteamDB_merged_cleaned.csv",encoding="utf-8")

### Rival Game Data from IGDB

In [None]:
game_data = pd.read_csv("../assets/igdb_shooters.csv")

# splitting all genres csv into separate csv files

# Replace this with the path to your big CSV
input_file = "../assets/igdb_games.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(input_file)

# Calculate the size of each split
n = len(df)
split_size = n // 3

# Split into 3 parts
df1 = df.iloc[:split_size]
df2 = df.iloc[split_size:2*split_size]
df3 = df.iloc[2*split_size:]

# Save to separate CSV files
df1.to_csv("../assets/igdb_part1.csv", index=False)
df2.to_csv("../assets/igdb_part2.csv", index=False)
df3.to_csv("../assets/igdb_part3.csv", index=False)

print("Splitting complete! Files saved as part1.csv, part2.csv, part3.csv")

In [71]:
igdb_data_1 = pd.read_csv("../assets/igdb_part1.csv")
igdb_data_2 = pd.read_csv("../assets/igdb_part2.csv")
igdb_data_3 = pd.read_csv("../assets/igdb_part3.csv")
igdb_data_merged = pd.concat([igdb_data_1, igdb_data_2, igdb_data_3], ignore_index=True)

  igdb_data_1 = pd.read_csv("../assets/igdb_part1.csv")
  igdb_data_2 = pd.read_csv("../assets/igdb_part2.csv")
  igdb_data_3 = pd.read_csv("../assets/igdb_part3.csv")


In [72]:
IGDB_fields = ['id', 'name', 'first_release_date', 'genres', 'rating', 'rating_count', 'total_rating', 'total_rating_count', 'aggregated_rating',
       'aggregated_rating_count']

igdb_data_merged_filtered = igdb_data_merged[IGDB_fields]
igdb_data_merged_filtered["first_release_date"] = pd.to_datetime(igdb_data_merged_filtered["first_release_date"], unit="s") # Formatting date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  igdb_data_merged_filtered["first_release_date"] = pd.to_datetime(igdb_data_merged_filtered["first_release_date"], unit="s")


In [128]:
vanilla_wow = igdb_data_merged_filtered[igdb_data_merged_filtered['id']==123] # Vanilla WoW
cata_wow = igdb_data_merged_filtered[igdb_data_merged_filtered['id']==229] # Cataclysm
cata_wow

Unnamed: 0,id,name,first_release_date,genres,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count
327330,229,World of Warcraft: Cataclysm,2010-12-07,[12],76.861351,198.0,83.097342,201.0,89.333333,3.0


In [143]:
import json
with open("../data/genres.json", "r", encoding="utf-8") as f:
    genres_list = json.load(f)
genres_list = {int(item['id']): item["name"] for item in genres_list}
with open("../data/genres.json", "w", encoding="utf-8") as f:
    json.dump(genres_list, f, indent=4, ensure_ascii=False)

genres_list

{31: 'Adventure',
 33: 'Arcade',
 35: 'Card & Board Game',
 4: 'Fighting',
 25: "Hack and slash/Beat 'em up",
 32: 'Indie',
 36: 'MOBA',
 7: 'Music',
 30: 'Pinball',
 8: 'Platform',
 2: 'Point-and-click',
 9: 'Puzzle',
 26: 'Quiz/Trivia',
 10: 'Racing',
 11: 'Real Time Strategy (RTS)',
 12: 'Role-playing (RPG)',
 5: 'Shooter',
 13: 'Simulator',
 14: 'Sport',
 15: 'Strategy',
 24: 'Tactical',
 16: 'Turn-based strategy (TBS)',
 34: 'Visual Novel'}

In [154]:
import ast
igdb_data_merged_filtered_converted = igdb_data_merged_filtered.copy()
igdb_data_merged_filtered_converted['genres'] = igdb_data_merged_filtered['genres'].map(lambda x: [] if pd.isna(x) else[genres_list[g] for g in x] if isinstance(x, list) else [genres_list[g] for g in ast.literal_eval(x)])
igdb_data_merged_filtered_converted.head()

Unnamed: 0,id,name,first_release_date,genres,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count
0,165499,^_^,2012-01-14,[Adventure],,,,,,
1,141830,||[}}}°.•°.°•..°•°[|||{{{,2020-02-02,[Indie],,,,,,
2,191680,_____,2014-12-11,"[Shooter, Indie]",,,,,,
3,176875,__________,2018-07-05,"[Adventure, Indie]",,,,,,
4,92204,_-_,2018-03-14,"[Puzzle, Arcade]",90.0,0.0,90.0,0.0,,


In [156]:
igdb_data_merged_filtered_converted.to_csv("../assets/IGDB_merged_cleaned.csv", index=False)

In [131]:
testx = cata_wow['genres']
type(testx.iloc[0][0])

if isinstance(genres_list, list):
    genres_dict = {int(item["id"]): item["name"] for item in genres_list}
else:  # dict keyed by id strings
    genres_dict = {int(k): v["name"] for k, v in genres_list.items()}

# --- parser that handles list/int/str/NaN safely ---
def parse_genre_ids(val):
    if pd.isna(val):
        return []
    # convert to python object
    try:
        obj = ast.literal_eval(str(val))
    except Exception:
        # not a literal (already a list? or plain str "12") – try to coerce
        obj = val

    # normalize to list of ints
    if isinstance(obj, (list, tuple, set)):
        ids = []
        for g in obj:
            try:
                ids.append(int(g))
            except Exception:
                continue
    elif isinstance(obj, (int, str)):
        try:
            ids = [int(obj)]
        except Exception:
            ids = []
    else:
        ids = []

    # map to names (skip unknown ids)
    return [genres_dict[i] for i in ids if i in genres_dict]

# --- apply to your DataFrame column ---
# assumes cata_wow['genres'] contains things like "[12, 14]" or 12, or NaN
cata_wow = cata_wow.copy()  # ensure it's not a view

cata_wow.loc[:, "genres_names"] = cata_wow["genres"].map(parse_genre_ids)
cata_wow

Unnamed: 0,id,name,first_release_date,genres,rating,rating_count,total_rating,total_rating_count,aggregated_rating,aggregated_rating_count,genres_names
327330,229,World of Warcraft: Cataclysm,2010-12-07,[12],76.861351,198.0,83.097342,201.0,89.333333,3.0,[]


### Twitch Viewership Data from TwitchTracker

In [25]:
cs_twitch_data = pd.read_csv("../assets/TwitchTracker Counter-Strike.csv")
cs2_twitch_data = pd.read_csv("../assets/TwitchTracker Counter-Strike 2.csv")
cscz_twitch_data = pd.read_csv("../assets/TwitchTracker Counter-Strike Condition Zero.csv")
css_twitch_data = pd.read_csv("../assets/TwitchTracker Counter-Strike Source.csv")

twitch_fields = ['Month', 'Avg Viewers', 'Gain', 'Peak Viewers', 'Avg Streams', 'Gain.1', 'Peak Streams', 'Hours Watched']

In [34]:
twitch_data_merged = pd.concat([cs_twitch_data, cs2_twitch_data, cscz_twitch_data, css_twitch_data], ignore_index=True)
twitch_data_merged = twitch_data_merged[twitch_fields]
twitch_data_merged

Unnamed: 0,Month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,Nov-16,18.0,-,164,2.0,-,7.0,3.2K
1,Dec-16,20.0,2,509,2.0,-,10.0,8.0K
2,Jan-17,19.0,-1,543,4.0,2,13.0,8.8K
3,Feb-17,30.0,11,5782,2.0,-2,11.0,16.0K
4,Mar-17,18.0,-12,815,2.0,-,10.0,6.9K
...,...,...,...,...,...,...,...,...
316,Jan-17,18,-1,215,1,-,3,357
317,Dec-16,19,4,216,1,-,3,946
318,Nov-16,15,-,147,1,-,3,336
319,Aug-25,14,-,1213,2,-,8,4.4K


In [42]:
# Transforming Hours Watched format
def convert_k(val):
    if isinstance(val, str):
        if val.endswith("K"):
            return float(val[:-1].replace(".", ""))* 1000
        elif val.endswith("M"):
            return float(val[:-1].replace(".", "")) * 1000000
    return float(val)

twitch_data_merged["Hours Watched"] = twitch_data_merged["Hours Watched"].map(convert_k)
twitch_data_merged

Unnamed: 0,Month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,Nov-16,18.0,-,164,2.0,-,7.0,32000.0
1,Dec-16,20.0,2,509,2.0,-,10.0,80000.0
2,Jan-17,19.0,-1,543,4.0,2,13.0,88000.0
3,Feb-17,30.0,11,5782,2.0,-2,11.0,160000.0
4,Mar-17,18.0,-12,815,2.0,-,10.0,69000.0
...,...,...,...,...,...,...,...,...
316,Jan-17,18,-1,215,1,-,3,357.0
317,Dec-16,19,4,216,1,-,3,946.0
318,Nov-16,15,-,147,1,-,3,336.0
319,Aug-25,14,-,1213,2,-,8,44000.0


In [43]:
numeric_fields = [x for x in twitch_fields if x != "Month"]
twitch_data_merged[numeric_fields] = twitch_data_merged[numeric_fields].replace({",": "", "-": "0"}, regex=True).astype(float)
twitch_data_merged.head()

Unnamed: 0,Month,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
0,Nov-16,18.0,0.0,164.0,2.0,0.0,7.0,32000.0
1,Dec-16,20.0,2.0,509.0,2.0,0.0,10.0,80000.0
2,Jan-17,19.0,1.0,543.0,4.0,2.0,13.0,88000.0
3,Feb-17,30.0,11.0,5782.0,2.0,2.0,11.0,160000.0
4,Mar-17,18.0,12.0,815.0,2.0,0.0,10.0,69000.0


In [52]:
twitch_data_merged_grouped = twitch_data_merged.groupby(twitch_data_merged.columns[0]).sum()
twitch_data_merged_grouped.to_csv("../assets/TwitchTracker_merged_cleaned.csv", encoding="utf-8")
twitch_data_merged_grouped

Unnamed: 0_level_0,Avg Viewers,Gain,Peak Viewers,Avg Streams,Gain.1,Peak Streams,Hours Watched
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Apr-17,31841.0,19065.0,194939.0,837.0,44.0,1700.0,231142383.0
Apr-18,35550.0,3608.0,226561.0,828.0,97.0,1595.0,264160163.0
Apr-19,40003.0,2811.0,236322.0,1163.0,19.0,2309.0,297409000.0
Apr-20,103850.0,17377.0,502637.0,2419.0,355.0,4898.0,767338000.0
Apr-21,102192.0,4706.0,611693.0,1883.0,182.0,3685.0,756406604.0
...,...,...,...,...,...,...,...
Sep-20,78858.0,4374.0,465673.0,1557.0,110.0,3046.0,585133314.0
Sep-21,94510.0,8971.0,689355.0,1307.0,115.0,2625.0,702959449.0
Sep-22,73816.0,7049.0,693262.0,994.0,102.0,1973.0,514899411.0
Sep-23,86787.0,24030.0,546449.0,1773.0,804.0,3829.0,641661584.0


### Youtube Video Data from Youtube Data API

In [None]:
youtube_data = pd.read_csv("../assets/yt_counter_strike.csv")