<a href="https://colab.research.google.com/github/DerivativeJRM07/gaming_data_analysis/blob/main/gaming_data_cleaning_%26_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Data Cleaning***

---



In [81]:
# Attach Google Drive to the Colab runtime so files under it can be read below.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import os
import pandas as pd

In [83]:
# Load the raw dataset CSV from the mounted Drive folder into `df`.
raw_csv_path = '/content/drive/MyDrive/Gaming_data_analysis_jrm/Gaming_Hours_vs_Performance_1000_Rows.csv'
df = pd.read_csv(raw_csv_path)

In [84]:
# Show the column labels of the freshly loaded frame.
column_labels = df.columns
print(column_labels)

Index(['User_ID', 'Age', 'Gender', 'Occupation', 'Game_Type',
       'Daily_Gaming_Hours', 'Weekly_Gaming_Hours', 'Primary_Gaming_Time',
       'Sleep_Hours', 'Stress_Level', 'Focus_Level', 'Academic_or_Work_Score',
       'Productivity_Level', 'Performance_Impact'],
      dtype='object')


## ***Checking for Null values***

In [85]:
# Count missing entries in every column (isna is the same check as isnull).
null_counts = df.isna().sum()
print("Missing values per column")
print(null_counts)

Missing values per column
User_ID                   0
Age                       0
Gender                    0
Occupation                0
Game_Type                 0
Daily_Gaming_Hours        0
Weekly_Gaming_Hours       0
Primary_Gaming_Time       0
Sleep_Hours               0
Stress_Level              0
Focus_Level               0
Academic_or_Work_Score    0
Productivity_Level        0
Performance_Impact        0
dtype: int64


## ***Checking for Duplicate Rows***

In [86]:
# Count rows that are exact repeats of an earlier row.
duplicate_count = df.duplicated().sum()
print("Number of duplicate rows")
print(duplicate_count)

Number of duplicate rows
0


## ***Removing Inconsistent Data (Standardizing Column Names)***

In [87]:
# Normalize column labels to snake_case: lowercase first, then spaces -> underscores.
# Later cells refer to columns by these lowercase names.
lowercased_names = df.columns.str.lower()
df.columns = lowercased_names.str.replace(' ', '_')
print("New column names")
print(df.columns)

New column names
Index(['user_id', 'age', 'gender', 'occupation', 'game_type',
       'daily_gaming_hours', 'weekly_gaming_hours', 'primary_gaming_time',
       'sleep_hours', 'stress_level', 'focus_level', 'academic_or_work_score',
       'productivity_level', 'performance_impact'],
      dtype='object')


## ***Removing Irrelevant Columns***

In [88]:
# Columns left out of the prepared dataset.
removing_col = ['user_id', 'weekly_gaming_hours']

# drop() returns a new frame, so `df` itself keeps all of its columns.
df_trimmed = df.drop(removing_col, axis=1)

remaining_columns = list(df_trimmed.columns)
print("--- Columns Remaining ---")
print(remaining_columns)

display(df_trimmed.head(1))

--- Columns Remaining ---
['age', 'gender', 'occupation', 'game_type', 'daily_gaming_hours', 'primary_gaming_time', 'sleep_hours', 'stress_level', 'focus_level', 'academic_or_work_score', 'productivity_level', 'performance_impact']


Unnamed: 0,age,gender,occupation,game_type,daily_gaming_hours,primary_gaming_time,sleep_hours,stress_level,focus_level,academic_or_work_score,productivity_level,performance_impact
0,21,Male,Working Professional,Action,4.0,Morning,4.6,6,4,69,66,Negative


# ***Data Preparation***

---



### ***Trimming - Dropping irrelevant data***
Already done in the data-cleaning step above, where `user_id` and `weekly_gaming_hours` were dropped.

### ***Seasoning - Renaming longer column names to shorter ones***

In [89]:
# Give two long column names shorter aliases; every other column keeps its name.
short_names = {
    'academic_or_work_score': 'performance',
    'primary_gaming_time': 'peak_time',
}
df_seasoned = df_trimmed.rename(columns=short_names)
print(df_seasoned.columns)
display(df_seasoned.head(2))

Index(['age', 'gender', 'occupation', 'game_type', 'daily_gaming_hours',
       'peak_time', 'sleep_hours', 'stress_level', 'focus_level',
       'performance', 'productivity_level', 'performance_impact'],
      dtype='object')


Unnamed: 0,age,gender,occupation,game_type,daily_gaming_hours,peak_time,sleep_hours,stress_level,focus_level,performance,productivity_level,performance_impact
0,21,Male,Working Professional,Action,4.0,Morning,4.6,6,4,69,66,Negative
1,35,Female,Student,Sports,1.0,Night,5.4,2,7,67,72,Neutral


### ***Measuring - Converting categorical values to numbers***

In [90]:
# Ordinal encoding of the performance_impact labels.
# Series.map leaves NaN for any label that is not a key of impact_scale.
impact_scale = {'Negative':0, 'Neutral': 1, 'Positive': 2}
impact_labels = df_seasoned['performance_impact']
df_seasoned['impact_score'] = impact_labels.map(impact_scale)
print(df_seasoned.columns)
display(df_seasoned.head(2))

Index(['age', 'gender', 'occupation', 'game_type', 'daily_gaming_hours',
       'peak_time', 'sleep_hours', 'stress_level', 'focus_level',
       'performance', 'productivity_level', 'performance_impact',
       'impact_score'],
      dtype='object')


Unnamed: 0,age,gender,occupation,game_type,daily_gaming_hours,peak_time,sleep_hours,stress_level,focus_level,performance,productivity_level,performance_impact,impact_score
0,21,Male,Working Professional,Action,4.0,Morning,4.6,6,4,69,66,Negative,0
1,35,Female,Student,Sports,1.0,Night,5.4,2,7,67,72,Neutral,1


### ***Pre-Mixing - Combining related columns to create new columns***

In [91]:
def mix_gamer_type(hours):
  """Bucket a daily gaming-hours value into a gamer category.

  Args:
    hours: Daily gaming hours as a number; may be missing (NaN/None).

  Returns:
    'Casual' for hours <= 2, 'Regular' for 2 < hours <= 5, 'Hardcore' for
    hours > 5, or None when `hours` is missing.
  """
  # A missing value fails every comparison below and would otherwise fall
  # through to the final branch and be mislabeled 'Hardcore'.
  if pd.isna(hours):
    return None
  if hours <= 2:
    # Capitalized to match the other two labels in the exported column.
    return 'Casual'
  # hours > 2 is already guaranteed here, so only the upper bound is tested.
  if hours <= 5:
    return 'Regular'
  return 'Hardcore'

# health_index: mean of sleep hours and "inverted" stress (10 - stress_level).
# NOTE(review): presumes stress_level is scored out of 10 - confirm against the dataset.
inverted_stress = 10 - df_seasoned['stress_level']
df_seasoned['health_index'] = (df_seasoned['sleep_hours'] + inverted_stress) / 2

# gamer_type: bucket each row's daily gaming hours with mix_gamer_type.
daily_hours = df_seasoned['daily_gaming_hours']
df_seasoned['gamer_type'] = daily_hours.apply(mix_gamer_type)

print(df_seasoned.columns)
display(df_seasoned.head(2))

Index(['age', 'gender', 'occupation', 'game_type', 'daily_gaming_hours',
       'peak_time', 'sleep_hours', 'stress_level', 'focus_level',
       'performance', 'productivity_level', 'performance_impact',
       'impact_score', 'health_index', 'gamer_type'],
      dtype='object')


Unnamed: 0,age,gender,occupation,game_type,daily_gaming_hours,peak_time,sleep_hours,stress_level,focus_level,performance,productivity_level,performance_impact,impact_score,health_index,gamer_type
0,21,Male,Working Professional,Action,4.0,Morning,4.6,6,4,69,66,Negative,0,4.3,Regular
1,35,Female,Student,Sports,1.0,Night,5.4,2,7,67,72,Neutral,1,6.7,casual


In [92]:
# Summarize each stage of the pipeline: loaded frame -> trimmed -> prepared.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" after each report.
# The middle frame is df_trimmed: no cell in this notebook defines `df_clean`,
# so referencing it raises NameError on a fresh kernel.
df.info()
df_trimmed.info()
df_seasoned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 1000 non-null   object 
 1   age                     1000 non-null   int64  
 2   gender                  1000 non-null   object 
 3   occupation              1000 non-null   object 
 4   game_type               1000 non-null   object 
 5   daily_gaming_hours      1000 non-null   float64
 6   weekly_gaming_hours     1000 non-null   float64
 7   primary_gaming_time     1000 non-null   object 
 8   sleep_hours             1000 non-null   float64
 9   stress_level            1000 non-null   int64  
 10  focus_level             1000 non-null   int64  
 11  academic_or_work_score  1000 non-null   int64  
 12  productivity_level      1000 non-null   int64  
 13  performance_impact      1000 non-null   object 
dtypes: float64(3), int64(5), object(6)
memory

In [93]:
# Write the prepared frame to the working directory; index=False omits the row index.
output_csv = 'gaming_data_ready_for_eda.csv'
df_seasoned.to_csv(output_csv, index=False)