In [17]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer

# Load the raw Steam games table, then clean it in one pass:
#   1. dropna()      - discard every row that has a missing value in any column
#   2. .loc[...]     - discard rows whose 'Genre' cell is the literal string '[]'
#   3. reset_index() - renumber the surviving rows from 0, dropping the old index
df = (
    pd.read_csv('steam_games.csv')
    .dropna()
    .loc[lambda frame: frame['Genre'] != '[]']
    .reset_index(drop=True)
)

# Persist the cleaned table (without the index column) for later inspection.
df.to_csv('updated_steam_games.csv', index=False)

In [21]:
# Download the NLTK resources needed for tokenization, stopword removal and
# lemmatization. 'punkt_tab' is requested in addition to 'punkt' because recent
# NLTK releases load the word_tokenize models from 'punkt_tab'; with only
# 'punkt' installed, tokenization raises LookupError on those versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance and English stopword set used by preprocess_text.
# The stopword list is stored as a set so membership tests are O(1).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    """Normalise a description string for bag-of-words vectorization.

    Steps, in order: lowercase, tokenize with ``word_tokenize``, drop tokens
    found in ``stop_words``, lemmatize the remaining tokens with
    ``lemmatizer``, and finally drop tokens that consist solely of
    punctuation characters.

    Args:
        text: The raw text to process (must be a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Lowercase first so the stopword lookup below is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Stopword removal happens before lemmatization, so the stopword check is
    # made against the surface form of each token.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Drop tokens made up entirely of punctuation. Stripping punctuation from
    # both ends leaves an empty string exactly for such tokens, so multi-char
    # tokens like "...", "--", "``" and "''" are removed too. (A plain
    # `token not in string.punctuation` is a substring test and lets them
    # through.) Tokens with inner punctuation, e.g. "co-op", are kept intact.
    tokens = [token for token in tokens if token.strip(string.punctuation)]

    return ' '.join(tokens)

# Run every entry of the 'Description' column through preprocess_text and keep
# the result next to the original text in a new 'Processed_Description' column.
raw_descriptions = df['Description']
df['Processed_Description'] = raw_descriptions.apply(preprocess_text)

# Write the table, now including the processed text, to disk (index omitted).
df.to_csv(path_or_buf='preprocessed_data.csv', index=False)

In [25]:
# Load the DataFrame that carries the 'Processed_Description' column.
df = pd.read_csv('preprocessed_data.csv')

# A description that preprocessing reduced to an empty string is written to the
# CSV as an empty field, and read_csv loads empty fields back as NaN.
# CountVectorizer rejects NaN documents, so turn those cells back into ''.
df['Processed_Description'] = df['Processed_Description'].fillna('')

# Learn the vocabulary and build the sparse document-term count matrix in a
# single pass (equivalent to fit() followed by transform() on the same data).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Processed_Description'])

# Convert the sparse matrix to a dense one and wrap it in a DataFrame with one
# column per vocabulary term. The index is taken from df so the concat below
# pairs rows by position.
# NOTE(review): toarray() materialises a full rows x vocabulary array; for a
# large corpus this may not fit in memory - confirm the data size is acceptable.
df_numeric = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Append the term-count columns to the right of the original columns.
df_final = pd.concat([df, df_numeric], axis=1)

# Save the final DataFrame to a new CSV file (index omitted).
df_final.to_csv('numeric_data.csv', index=False)