In [12]:
import pandas as pd

df = pd.read_csv("steam_games_dataset_kaggle.csv")
df = df.dropna()  # This will drop rows that contain any NaN values

#variables we will use
df1= df[["name", "desc_snippet","genre"]]

#delete missing values
df2 = pd.DataFrame(df1.dropna())

df = df2.head(5000)

# Reset the index of the DataFrame
df = df.reset_index(drop=True)

df.to_csv('steam_games_dataset_kaggle_5000_clean.csv', index=False)
df.head()

Unnamed: 0,name,desc_snippet,genre
0,Devil May Cry 5,"The ultimate Devil Hunter is back in style, in...",Action
1,Life is Strange 2,"After a tragic incident, brothers Sean and Dan...",Adventure
2,Call of Duty®: WWII,Call of Duty® returns to its roots with Call o...,Action
3,Shadow Tactics: Blades of the Shogun,Shadow Tactics is a hardcore tactical stealth ...,"Indie,Strategy"
4,SCUM,SCUM aims to evolve the multiplayer open world...,"Action,Adventure,Indie,Massively Multiplayer,E..."


In [13]:
# Extract unique genres from the Kaggle dataset
unique_genres = set([genre for genres in df['genre'] for genre in genres.split(',')])

# Print the unique genres from the Kaggle dataset
print(unique_genres)

{'Casual', 'Strategy', 'Simulation', 'Early Access', 'Indie', 'RPG', 'Action', 'Massively Multiplayer', 'Adventure', 'Free to Play'}


In [14]:
# Create binary columns for each unique genre
for genre in unique_genres:
    df[genre] = df['genre'].apply(lambda x: int(genre in x))

# Print the updated dataset
df.head()

Unnamed: 0,name,desc_snippet,genre,Casual,Strategy,Simulation,Early Access,Indie,RPG,Action,Massively Multiplayer,Adventure,Free to Play
0,Devil May Cry 5,"The ultimate Devil Hunter is back in style, in...",Action,0,0,0,0,0,0,1,0,0,0
1,Life is Strange 2,"After a tragic incident, brothers Sean and Dan...",Adventure,0,0,0,0,0,0,0,0,1,0
2,Call of Duty®: WWII,Call of Duty® returns to its roots with Call o...,Action,0,0,0,0,0,0,1,0,0,0
3,Shadow Tactics: Blades of the Shogun,Shadow Tactics is a hardcore tactical stealth ...,"Indie,Strategy",0,1,0,0,1,0,0,0,0,0
4,SCUM,SCUM aims to evolve the multiplayer open world...,"Action,Adventure,Indie,Massively Multiplayer,E...",0,0,0,1,1,0,1,1,1,0


In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Download NLTK resources (required for tokenization and stopwords)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopword removal and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Remove special characters and punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Join tokens back into text
    processed_text = ' '.join(tokens)
    
    return processed_text

# Apply preprocessing to the 'description' column
df['processed_desc_snippet'] = df['desc_snippet'].apply(preprocess_text)

# Save the updated DataFrame to a new CSV file
df.to_csv('steam_games_dataset_kaggle_5000_clean_binary_genre.csv.csv', index=False)

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

df = pd.read_csv("preprocessed_kaggle_multi_labels2.csv")

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_desc_snippet'])
y = df.iloc[:, 3:-1]  # Output labels (binary genre columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [53]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Convert X_train and y_train to NumPy arrays
X_train = X_train.toarray()
y_train = y_train.values

# Create a sequential model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='sigmoid'))  # Sigmoid activation for multi-label classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_test.toarray(), y_test.values))

# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test.values)
print("Accuracy:", accuracy)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78