<a href="https://colab.research.google.com/github/AkulaAnshul/MLP/blob/main/GA_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/GA_2_dataset.csv', na_values=['?'])

print("="*80)
print("DATASET INFO")
print("="*80)
print(f"Shape of dataset: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())

DATASET INFO
Shape of dataset: (10000, 13)

First few rows:
   PlayerID   Age  Gender Location   GameGenre  PlayTimeHours  \
0     35900  37.0    Male    Other    Strategy      23.929404   
1     27085  25.0    Male      NaN      Action      22.755168   
2     39595  24.0  Female   Europe  Simulation      19.505292   
3     37440  26.0  Female   Europe         RPG      11.009645   
4     22882  17.0  Female      USA         RPG       0.581039   

   InGamePurchases GameDifficulty  SessionsPerWeek  AvgSessionDurationMinutes  \
0              NaN           Hard                3                        124   
1              1.0           Easy               14                         84   
2              0.0           Hard                3                        172   
3              NaN            NaN                3                         83   
4              1.0         Medium                5                        163   

   PlayerLevel  AchievementsUnlocked EngagementLevel  
0      

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the dataset
df = pd.read_csv('/content/GA_2_dataset.csv', na_values=['?'])

print("="*80)
print("QUESTION 1: Columns with object datatype")
print("="*80)
print("Data types of columns:")
print(df.dtypes)
print("\nColumns with object datatype:")
object_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in object_cols:
    print(f"  - {col}")

print("\n" + "="*80)
print("QUESTION 2: Males from Europe who made InGamePurchases")
print("="*80)
males_europe_purchases = df[
    (df['Gender'] == 'Male') &
    (df['Location'] == 'Europe') &
    (df['InGamePurchases'] > 0)
].shape[0]
print(f"Answer: {males_europe_purchases}")

print("\n" + "="*80)
print("QUESTION 3: Players under 18 with PlayTimeHours > 10")
print("="*80)
players_under_18 = df[
    (df['Age'] < 18) &
    (df['PlayTimeHours'] > 10)
].shape[0]
print(f"Answer: {players_under_18}")

print("\n" + "="*80)
print("QUESTION 4: Create X and y, count total null values")
print("="*80)
# Create feature matrix X and target vector y
y = df['EngagementLevel']
X = df.drop('EngagementLevel', axis=1)

# Count total null values in the whole dataset
total_nulls = df.isnull().sum().sum()
print(f"Total null values in dataset: {total_nulls}")

print("\n" + "="*80)
print("QUESTION 5: Train-test split and category with least value counts")
print("="*80)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("\nValue counts in y_train:")
print(y_train.value_counts().sort_values())
print(f"\nCategory with least value counts in y_train: {y_train.value_counts().idxmin()}")

print("\n" + "="*80)
print("QUESTION 6: Imputation and sum of transformed Age column in test set")
print("="*80)
# Create copies to work with
X_train_imputed = X_train.copy()
X_test_imputed = X_test.copy()

# Impute Age with mean (excluding NaN values)
age_mean = X_train_imputed['Age'].mean()
print(f"Mean of Age in training set: {age_mean}")
X_train_imputed['Age'].fillna(age_mean, inplace=True)
X_test_imputed['Age'].fillna(age_mean, inplace=True)

# Impute Location with "Other"
X_train_imputed['Location'].fillna('Other', inplace=True)
X_test_imputed['Location'].fillna('Other', inplace=True)

# Impute GameDifficulty with most frequent value
game_diff_mode = X_train_imputed['GameDifficulty'].mode()[0]
print(f"Most frequent GameDifficulty in training set: {game_diff_mode}")
X_train_imputed['GameDifficulty'].fillna(game_diff_mode, inplace=True)
X_test_imputed['GameDifficulty'].fillna(game_diff_mode, inplace=True)

# Impute InGamePurchases with 0
X_train_imputed['InGamePurchases'].fillna(0, inplace=True)
X_test_imputed['InGamePurchases'].fillna(0, inplace=True)

# Sum of transformed Age column in test dataset
sum_age_test = X_test_imputed['Age'].sum()
print(f"\nSum of transformed Age column in test dataset: {sum_age_test:.2f}")

print("\n" + "="*80)
print("QUESTION 7: Preprocessing and sum of first 5 rows of transformed test set")
print("="*80)

# Drop PlayerID
X_train_preprocessed = X_train_imputed.drop('PlayerID', axis=1)
X_test_preprocessed = X_test_imputed.drop('PlayerID', axis=1)

print("Checking for missing values after imputation:")
print(f"Train set missing values: {X_train_preprocessed.isnull().sum().sum()}")
print(f"Test set missing values: {X_test_preprocessed.isnull().sum().sum()}")

# Define column groups
ordinal_feature = ['GameDifficulty']
nominal_features = ['Gender', 'Location', 'GameGenre']
numerical_features = ['Age', 'PlayTimeHours', 'InGamePurchases', 'SessionsPerWeek',
                     'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked']

# Ordinal Encoding for GameDifficulty
ordinal_encoder = OrdinalEncoder(categories=[['Easy', 'Medium', 'Hard']])
X_train_preprocessed['GameDifficulty'] = ordinal_encoder.fit_transform(
    X_train_preprocessed[['GameDifficulty']]
)
X_test_preprocessed['GameDifficulty'] = ordinal_encoder.transform(
    X_test_preprocessed[['GameDifficulty']]
)

# One-Hot Encoding for nominal features
X_train_encoded = pd.get_dummies(X_train_preprocessed, columns=nominal_features, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_preprocessed, columns=nominal_features, drop_first=True)

# Ensure both train and test have same columns
# Get all columns from train
train_cols = X_train_encoded.columns
# Add missing columns to test with 0s
for col in train_cols:
    if col not in X_test_encoded.columns:
        X_test_encoded[col] = 0
# Reorder test columns to match train
X_test_encoded = X_test_encoded[train_cols]

print(f"\nShape after encoding - Train: {X_train_encoded.shape}, Test: {X_test_encoded.shape}")

# Standard Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Convert to DataFrame for easier inspection
X_train_final = pd.DataFrame(X_train_scaled, columns=X_train_encoded.columns)
X_test_final = pd.DataFrame(X_test_scaled, columns=X_test_encoded.columns)

print("\nFirst 5 rows of transformed test feature matrix:")
print(X_test_final.head())

# Calculate sum of first 5 rows
sum_first_5_rows = X_test_final.head().values.sum()
print(f"\nSum of all values in first 5 rows of transformed test feature matrix: {sum_first_5_rows:.2f}")

QUESTION 1: Columns with object datatype
Data types of columns:
PlayerID                       int64
Age                          float64
Gender                        object
Location                      object
GameGenre                     object
PlayTimeHours                float64
InGamePurchases              float64
GameDifficulty                object
SessionsPerWeek                int64
AvgSessionDurationMinutes      int64
PlayerLevel                    int64
AchievementsUnlocked           int64
EngagementLevel               object
dtype: object

Columns with object datatype:
  - Gender
  - Location
  - GameGenre
  - GameDifficulty
  - EngagementLevel

QUESTION 2: Males from Europe who made InGamePurchases
Answer: 299

QUESTION 3: Players under 18 with PlayTimeHours > 10
Answer: 453

QUESTION 4: Create X and y, count total null values
Total null values in dataset: 3337

QUESTION 5: Train-test split and category with least value counts
Train set size: 8000
Test set size: 2000

Va

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_imputed['Age'].fillna(age_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_imputed['Age'].fillna(age_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s