# Analyze Genre Co-Occurrence Networks and Dependency Patterns

#### Import and Configuration



In [1]:
import pandas as pd
import numpy as np
import itertools
import networkx as nx

# Let pandas render up to 100 rows and 100 columns before truncating output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


#### Load Data

In [3]:
# Relative path: resolved against the kernel's current working directory.
csv_path = 'steam.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then show the first five records as the cell output.
print(df.shape)
df.head()


(27075, 18)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


#### Clean and Extract Genres

In [4]:
# Fail fast when the loaded frame lacks the column every later cell relies on
# (membership on a DataFrame tests its column labels).
assert 'genres' in df, "Expected a 'genres' column in the dataset."

# Parse genres into list
def parse_genres(x):
    """Split a ';'-delimited genre string into a list of trimmed labels.

    Parameters
    ----------
    x : object
        Raw cell value. Missing values (as detected by ``pd.isna``) produce
        an empty list; anything else is converted with ``str`` before
        splitting.

    Returns
    -------
    list of str
        The labels in their original order, with surrounding whitespace
        removed and blank entries dropped.
    """
    if pd.isna(x):
        return []
    trimmed = (piece.strip() for piece in str(x).split(';'))
    return [label for label in trimmed if label]

# Store each game's parsed genre labels next to the raw string.
df['genre_list'] = df['genres'].map(parse_genres)

# Sanity check: compare raw and parsed values for the first ten games.
preview_cols = ['name', 'genres', 'genre_list']
df[preview_cols].head(10)


Unnamed: 0,name,genres,genre_list
0,Counter-Strike,Action,[Action]
1,Team Fortress Classic,Action,[Action]
2,Day of Defeat,Action,[Action]
3,Deathmatch Classic,Action,[Action]
4,Half-Life: Opposing Force,Action,[Action]
5,Ricochet,Action,[Action]
6,Half-Life,Action,[Action]
7,Counter-Strike: Condition Zero,Action,[Action]
8,Half-Life: Blue Shift,Action,[Action]
9,Half-Life 2,Action,[Action]


#### Build Genre Matrix

In [5]:
# Vocabulary: every distinct label found in any game's genre list, sorted.
genre_vocab = set()
for labels in df['genre_list']:
    genre_vocab.update(labels)
all_genres = sorted(genre_vocab)

print("Number of unique genres:", len(all_genres))
print(all_genres)

# One 0/1 indicator column per genre, named 'genre_<label>'.
genre_cols = []
for label in all_genres:
    indicator_col = f'genre_{label}'
    # The lambda is evaluated right away, so it always sees the current label.
    df[indicator_col] = df['genre_list'].map(lambda labels: int(label in labels))
    genre_cols.append(indicator_col)

df[genre_cols].head()


Number of unique genres: 29
['Accounting', 'Action', 'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual', 'Design & Illustration', 'Documentary', 'Early Access', 'Education', 'Free to Play', 'Game Development', 'Gore', 'Indie', 'Massively Multiplayer', 'Nudity', 'Photo Editing', 'RPG', 'Racing', 'Sexual Content', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Tutorial', 'Utilities', 'Video Production', 'Violent', 'Web Publishing']


Unnamed: 0,genre_Accounting,genre_Action,genre_Adventure,genre_Animation & Modeling,genre_Audio Production,genre_Casual,genre_Design & Illustration,genre_Documentary,genre_Early Access,genre_Education,genre_Free to Play,genre_Game Development,genre_Gore,genre_Indie,genre_Massively Multiplayer,genre_Nudity,genre_Photo Editing,genre_RPG,genre_Racing,genre_Sexual Content,genre_Simulation,genre_Software Training,genre_Sports,genre_Strategy,genre_Tutorial,genre_Utilities,genre_Video Production,genre_Violent,genre_Web Publishing
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Genre Co-Occurrence Matrix

In [6]:
# Indicator array: one row per game, one 0/1 column per genre.
genre_matrix = df[genre_cols].values

# G^T @ G: entry (i, j) is the number of games carrying both genre i and genre j.
co_matrix = genre_matrix.T @ genre_matrix

co_df = pd.DataFrame(co_matrix, index=all_genres, columns=all_genres)

# The diagonal pairs each genre with itself, i.e. its total number of games.
per_genre_totals = pd.Series(np.diag(co_matrix), index=all_genres)
genre_counts = per_genre_totals.sort_values(ascending=False)

print("Top genres by game count:")
display(genre_counts.head(10))


Top genres by game count:


Unnamed: 0,0
Indie,19421
Action,11903
Casual,10210
Adventure,10032
Strategy,5247
Simulation,5194
RPG,4311
Early Access,2954
Free to Play,1704
Sports,1322


#### Top Co-Occurring Genre Pairs

In [7]:
# Visit each unordered genre pair once (upper triangle of the co-occurrence
# matrix) and keep only the pairs that share at least one game.
pairs = [
    (all_genres[row_idx], all_genres[col_idx], co_matrix[row_idx, col_idx])
    for row_idx, col_idx in itertools.combinations(range(len(all_genres)), 2)
    if co_matrix[row_idx, col_idx] > 0
]

pairs_unsorted = pd.DataFrame(pairs, columns=['genre_1', 'genre_2', 'co_count'])
pairs_df = pairs_unsorted.sort_values('co_count', ascending=False)

print("Top co-occurring genre pairs:")
display(pairs_df.head(15))


Top co-occurring genre pairs:


Unnamed: 0,genre_1,genre_2,co_count
25,Action,Indie,9183
92,Casual,Indie,7978
43,Adventure,Indie,7623
20,Action,Adventure,4526
203,Indie,Strategy,3669
37,Adventure,Casual,3564
200,Indie,Simulation,3547
21,Action,Casual,3399
197,Indie,RPG,3215
46,Adventure,RPG,2410


# Identify core vs. peripheral genres in the Steam ecosystem

#### Dependency Patterns via Conditional Co-occurrence

In [8]:
# Conditional co-occurrence: for every ordered (base, partner) pair, estimate
# P(partner | base) = games with both genres / games with the base genre.

# Minimum number of games a base genre must have to be reported. The default
# of 1 keeps every genre that occurs at all; raise it to filter out pairs
# whose probability rests on only one or two titles.
min_base_count = 1

dep_columns = ['base_genre', 'partner_genre', 'co_count', 'P(partner|base)']
dependency_rows = []

for base in all_genres:
    base_count = co_df.loc[base, base]
    # Skip genres with no games (avoids division by zero) or too little support.
    if base_count == 0 or base_count < min_base_count:
        continue

    for partner in all_genres:
        if partner == base:
            continue

        cij = co_df.loc[base, partner]
        if cij > 0:
            p_partner_given_base = cij / base_count

            dependency_rows.append({
                'base_genre': base,
                'partner_genre': partner,
                'co_count': cij,
                # Kept at full precision here so the ranking below is exact.
                'P(partner|base)': p_partner_given_base
            })

dep_df = (
    # Explicit columns keep the sort from raising KeyError when no pair qualifies.
    pd.DataFrame(dependency_rows, columns=dep_columns)
    # Rank on the exact probability; rounding first would turn near-equal
    # probabilities into ties and let co_count decide their order.
    .sort_values(['P(partner|base)', 'co_count'], ascending=False)
    # Round only for presentation, after the order is fixed.
    .round({'P(partner|base)': 3})
)

print("Strong dependency-style pairs (top 20):")
display(dep_df.head(20))


Strong dependency-style pairs (top 20):


Unnamed: 0,base_genre,partner_genre,co_count,P(partner|base)
139,Documentary,Accounting,1,1.0
140,Documentary,Casual,1,1.0
141,Documentary,Design & Illustration,1,1.0
142,Documentary,Indie,1,1.0
143,Documentary,Software Training,1,1.0
144,Documentary,Tutorial,1,1.0
145,Documentary,Web Publishing,1,1.0
465,Tutorial,Accounting,1,1.0
466,Tutorial,Casual,1,1.0
467,Tutorial,Design & Illustration,1,1.0


#### Build Genre Co-occurrence Network Graph

In [9]:
G = nx.Graph()

# One node per genre that has at least one game; the count is stored on the node.
for genre in all_genres:
    n_games = int(co_df.loc[genre, genre])
    if n_games <= 0:
        continue
    G.add_node(genre, game_count=n_games)

# One edge per co-occurring pair whose shared-game count reaches the cut-off.
min_co = 1  # you can increase this for a larger dataset to filter noise
edge_records = zip(pairs_df['genre_1'], pairs_df['genre_2'], pairs_df['co_count'])
for left, right, shared in edge_records:
    if shared >= min_co:
        G.add_edge(left, right, weight=int(shared))

print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())


Nodes: 29
Edges: 273


#### Identify Core vs. Peripheral Genres

In [10]:
# Per-node network measures:
#   degree          - number of distinct genres this genre co-occurs with
#   weighted degree - sum of the co-occurrence weights on its edges
#   betweenness     - normalized betweenness centrality on the unweighted graph
degree_dict = dict(G.degree())
weighted_degree_dict = dict(G.degree(weight='weight'))
betweenness_dict = nx.betweenness_centrality(G, normalized=True)

# One record per genre, combining the stored game count with the measures above.
rows = [
    {
        'genre': node,
        'game_count': attrs['game_count'],
        'degree': degree_dict[node],
        'weighted_degree': weighted_degree_dict[node],
        'betweenness': betweenness_dict[node],
    }
    for node, attrs in G.nodes(data=True)
]

stats_df = pd.DataFrame(rows)

# Rescale each metric by its column maximum so all four lie in [0, 1];
# a column whose maximum is not positive becomes all zeros.
metric_cols = ['game_count', 'degree', 'weighted_degree', 'betweenness']
for col in metric_cols:
    max_val = stats_df[col].max()
    stats_df[f'{col}_norm'] = stats_df[col] / max_val if max_val > 0 else 0.0

# Combined score: unweighted mean of the four normalized metrics.
norm_cols = [f'{col}_norm' for col in metric_cols]
stats_df['core_score'] = stats_df[norm_cols].mean(axis=1)

# Split at the median core_score (tunable): at or above is 'core', below is 'peripheral'.
threshold = stats_df['core_score'].median()
at_or_above = stats_df['core_score'] >= threshold
stats_df['role'] = np.where(at_or_above, 'core', 'peripheral')

# Core genres are listed strongest first, peripheral genres weakest first.
core_genres = (
    stats_df.loc[stats_df['role'].eq('core')]
    .sort_values('core_score', ascending=False)
)
peripheral_genres = (
    stats_df.loc[stats_df['role'].eq('peripheral')]
    .sort_values('core_score')
)

summary_cols = ['genre', 'game_count', 'degree', 'weighted_degree', 'core_score']

print("Core genres:")
display(core_genres[summary_cols])

print("\nPeripheral genres:")
display(peripheral_genres[summary_cols])


Core genres:


Unnamed: 0,genre,game_count,degree,weighted_degree,core_score
13,Indie,19421,28,42076,1.0
5,Casual,10210,28,23317,0.769971
23,Strategy,5247,26,12926,0.494268
2,Adventure,10032,20,23835,0.49148
17,RPG,4311,26,12053,0.477032
20,Simulation,5194,25,13894,0.472678
1,Action,11903,16,26951,0.460873
8,Early Access,2954,25,9375,0.416993
10,Free to Play,1704,26,4898,0.40096
6,Design & Illustration,87,20,271,0.235708



Peripheral genres:


Unnamed: 0,genre,game_count,degree,weighted_degree,core_score
7,Documentary,1,7,7,0.062554
24,Tutorial,1,7,7,0.062554
16,Photo Editing,12,15,62,0.134451
19,Sexual Content,245,15,911,0.142495
15,Nudity,266,15,1036,0.143508
11,Game Development,17,17,85,0.15392
4,Audio Production,29,17,85,0.154074
18,Racing,1024,15,2858,0.164091
12,Gore,537,16,2124,0.16705
3,Animation & Modeling,79,18,269,0.170137
