# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# 
# Read Data

In [2]:
df = pd.read_csv('games-regression-dataset.csv')

In [3]:
df.head()

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,User Rating Count,Price,In-app Purchases,Description,Developer,Age Rating,Languages,Size,Primary Genre,Genres,Original Release Date,Current Version Release Date,Average User Rating
0,https://apps.apple.com/us/app/heir-of-light/id...,1264483706,HEIR OF LIGHT,Dark Fantasy RPG,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,982,0.0,"29.99, 19.99, 9.99, 29.99, 29.99, 8.99, 4.99, ...","A Dark Fantasy, Collectible RPG\n\nDarkness ha...",GAMEVIL Inc.,12+,"EN, FR, DE, JA, KO, ZH, ES, TH, ZH, VI",894489600,Games,"Games, Role Playing, Strategy",6/3/2018,31/07/2019,4.0
1,https://apps.apple.com/us/app/endgame-eurasia/...,607705356,Endgame:Eurasia,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,19,0.0,,"""This interactive experience is an exploration...",Auroch Digital Ltd,12+,EN,116407296,Games,"Games, Simulation, Strategy, News",21/03/2013,28/06/2017,3.5
2,https://apps.apple.com/us/app/free-solitaire/i...,627491527,Free Solitaire+,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,14,0.0,,Same Solitaire game with classic Solitaire run...,Chen Zhong Yuan,4+,"EN, ZH",50647040,Games,"Games, Strategy, Entertainment, Card",4/4/2013,21/04/2015,4.5
3,https://apps.apple.com/us/app/draft-trainer/id...,430252596,Draft Trainer,,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,88,1.99,,** Discounted for a limited time **\n\nEver wo...,"GG Wizards, LLC",9+,EN,28120064,Games,"Games, Utilities, Card, Strategy",26/05/2011,23/07/2019,3.5
4,https://apps.apple.com/us/app/rogue-knight-inf...,1115082819,Rogue Knight: Infested Lands,Tactical roguelike w/ stealth,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,13,3.99,,Fight or sneak your way through hordes of mons...,Luis Regueira,12+,EN,39915520,Games,"Games, Role Playing, Strategy",19/05/2017,6/2/2019,4.5


In [4]:
df.shape

(5214, 18)

# 
# Preprocessing Pipeline
- **`1. Analysis Columns Data Type`**
- **`2. Columns Nulls`**
- **`3. Rows Nulls`**
- **`4. Duplicates`**
- **`5. Outlires Detection & Removal`**

## 
### `1. Analysis Columns Data Type`

In [5]:
df.dtypes

URL                              object
ID                                int64
Name                             object
Subtitle                         object
Icon URL                         object
User Rating Count                 int64
Price                           float64
In-app Purchases                 object
Description                      object
Developer                        object
Age Rating                       object
Languages                        object
Size                              int64
Primary Genre                    object
Genres                           object
Original Release Date            object
Current Version Release Date     object
Average User Rating             float64
dtype: object

> <br> 
> 
> **Age Rating**
> - Remove + sign 
> - Convert to int
> - Notice that the column has only 4 ages so we can categorize them

In [6]:
df['Age Rating'] = df['Age Rating'].str.replace('+', '', regex=False)

In [7]:
print(df['Age Rating'].dtype)

object


In [8]:
df['Age Rating'] = df['Age Rating'].astype(int)

In [9]:
print(df['Age Rating'].dtype)

int32


In [10]:
df['Age Rating'].unique()

array([12,  4,  9, 17])

In [11]:
# Create a dictionary to map the age ratings to integers
age_rating_map = {4: 1, 9: 2, 12: 3, 17: 4}

In [12]:
# Replace the age rating column with its corresponding integer value
df['Age Rating'] = df['Age Rating'].replace(age_rating_map)

In [13]:
df['Age Rating'].head()

0    3
1    3
2    1
3    2
4    3
Name: Age Rating, dtype: int32

In [14]:
df['Age Rating'].dtype

dtype('int32')

> <br>
> 
> **Languages**

In [15]:
df['Languages'].head()

0    EN, FR, DE, JA, KO, ZH, ES, TH, ZH, VI
1                                        EN
2                                    EN, ZH
3                                        EN
4                                        EN
Name: Languages, dtype: object

In [16]:
print(df['Languages'].dtype)

object


In [17]:
all_languages = list(set(','.join(df['Languages'].fillna('').unique()).split(',')))

In [18]:
len(all_languages)

130

> <br>
> 
> **Genre**

In [36]:
df['Genres'].unique()

array(['Games, Role Playing, Strategy',
       'Games, Simulation, Strategy, News',
       'Games, Strategy, Entertainment, Card',
       'Games, Utilities, Card, Strategy',
       'Games, Strategy, Casual, Entertainment',
       'Games, Reference, Simulation, Strategy',
       'Games, Simulation, Entertainment, Strategy',
       'Games, Strategy, Racing, Entertainment',
       'Games, Puzzle, Strategy, Entertainment',
       'Games, Action, Strategy', 'Games, Trivia, Strategy, Education',
       'Games, Role Playing, Strategy, Entertainment',
       'Games, Entertainment, Casual, Strategy',
       'Games, Strategy, Entertainment, Adventure',
       'Sports, Games, Sports, Strategy',
       'Games, Puzzle, Entertainment, Strategy',
       'Games, Strategy, Sports, Sports',
       'Games, Strategy, Action, Books', 'Games, Puzzle, Strategy',
       'Games, Entertainment, Simulation, Strategy',
       'Games, Entertainment, Strategy, Puzzle',
       'Games, Strategy, Puzzle', 'Games, Stra

> ## 
> **Dates**
> - Convert to date time data type

In [20]:
df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
print(df['Original Release Date'].dtype)

  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Original Release Date'] = pd.to_datetime(df['Original Release Date'])
  df['Origin

datetime64[ns]


In [21]:
df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
print(df['Current Version Release Date'].dtype)

datetime64[ns]


  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Current Version Release Date'] = pd.to_datetime(df['Current Version Release Date'])
  df['Curr

In [22]:
df['Original Release Date'].head()

0   2018-06-03
1   2013-03-21
2   2013-04-04
3   2011-05-26
4   2017-05-19
Name: Original Release Date, dtype: datetime64[ns]

## 
### `2. Columns Nulls`

In [23]:
# print count of nulls for each column and percentage of them
missing_data = pd.DataFrame({'total_missing': df.isnull().sum(), 'perc_missing': (df.isnull().mean())*100})
missing_data

Unnamed: 0,total_missing,perc_missing
URL,0,0.0
ID,0,0.0
Name,0,0.0
Subtitle,3749,71.90257
Icon URL,0,0.0
User Rating Count,0,0.0
Price,0,0.0
In-app Purchases,2039,39.106252
Description,0,0.0
Developer,0,0.0


In [24]:
df.drop(columns=['Subtitle'], inplace=True)

In [25]:
# print count of nulls for each column and percentage of them
missing_data = pd.DataFrame({'total_missing': df.isnull().sum(), 'perc_missing': (df.isnull().mean())*100})
missing_data

Unnamed: 0,total_missing,perc_missing
URL,0,0.0
ID,0,0.0
Name,0,0.0
Icon URL,0,0.0
User Rating Count,0,0.0
Price,0,0.0
In-app Purchases,2039,39.106252
Description,0,0.0
Developer,0,0.0
Age Rating,0,0.0



> <br> 
> 
> **In-app Purchases**
> - We can assum that any cell with null value, does not has any purshases. So replace all nulls with 0

In [26]:
df['In-app Purchases'] = df['In-app Purchases'].fillna(0)

In [27]:
# print count of nulls for each column and percentage of them
missing_data = pd.DataFrame({'total_missing': df.isnull().sum(), 'perc_missing': (df.isnull().mean())*100})
missing_data

Unnamed: 0,total_missing,perc_missing
URL,0,0.0
ID,0,0.0
Name,0,0.0
Icon URL,0,0.0
User Rating Count,0,0.0
Price,0,0.0
In-app Purchases,0,0.0
Description,0,0.0
Developer,0,0.0
Age Rating,0,0.0


<br>

### `3. Rows Nulls`

In [28]:
df.shape

(5214, 17)

In [29]:
missing_rows = df.isnull().any(axis=1).sum()
missing_rows

11

In [30]:
df.dropna(inplace=True)

In [31]:
df.shape

(5203, 17)

In [32]:
missing_rows = df.isnull().any(axis=1).sum()
missing_rows

0

<br>

## `4. Duplicates`

In [33]:
print(df.duplicated().sum())

43


In [34]:
df.drop_duplicates(inplace = True, keep="first")

In [35]:
df.shape

(5160, 17)

<br>

## `5. Outlires Detection & Removal`

In [97]:
df.describe()

Unnamed: 0,ID,User Rating Count,Price,Age Rating,Size,Average User Rating
count,5160.0,5160.0,5160.0,5160.0,5160.0,5160.0
mean,868029200.0,3686.172,0.604031,1.639147,138072800.0,4.035271
std,294196800.0,50032.15,2.562607,0.883842,253669700.0,0.749178
min,284921400.0,5.0,0.0,1.0,215840.0,1.0
25%,595288300.0,13.0,0.0,1.0,27473920.0,3.5
50%,919383000.0,49.0,0.0,1.0,67059710.0,4.0
75%,1115590000.0,332.0,0.0,2.0,158436600.0,4.5
max,1341837000.0,3032734.0,139.99,4.0,4005591000.0,5.0


In [93]:
# calculate the interquartile range (IQR)
col = 'Price'
q1 = df[col].quantile(0.25)
q3 = df[col].quantile(0.75)
iqr = q3 - q1

# identify the outliers
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

# print the outliers
print("Outliers in User Rating Count column:")
print(outliers[col].head())

Outliers in User Rating Count column:
3    1.99
4    3.99
5    0.99
7    9.99
8    0.99
Name: Price, dtype: float64
