# Import Libraries


In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import re
from urllib.parse import urlparse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Train Only preprocessing**

In [62]:
def outliers(dataset, col):
    """Cap the values of ``dataset[col]`` at the 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence. Values inside the
    fences, and NaN, are left unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose column is overwritten in place.
    col : str
        Name of a numeric column of ``dataset``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.
    """
    q1 = dataset[col].quantile(0.25)
    q3 = dataset[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Vectorised clip instead of a Python-level lambda per element.
    dataset[col] = dataset[col].clip(lower=lower_bound, upper=upper_bound)
    return dataset

def dups(dataset):
    """Drop fully duplicated rows from ``dataset`` in place.

    The first occurrence of every duplicated row is kept. The frame is
    mutated and the same object is returned; the index is not reset.
    """
    dataset.drop_duplicates(keep="first", inplace=True)
    return dataset

**Train and Test preprocessing**

In [63]:
def null_elements(data):
    """Fill missing values in the known sparse columns and drop ``Subtitle``.

    * ``Languages``        -> ``"EN"``
    * ``In-app Purchases`` -> ``"0"`` while the column still holds text,
      ``0.0`` once it is numeric
    * ``Price``            -> ``0.0``
    * ``Subtitle``         -> column removed (ignored if already absent)

    The frame is modified in place and returned.
    """
    data["Languages"] = data["Languages"].fillna("EN")

    # Match the fill value to the column's current dtype so that a numeric
    # column is never polluted with the string "0".
    purchases = data["In-app Purchases"]
    purchases_fill = 0.0 if pd.api.types.is_numeric_dtype(purchases) else "0"
    data["In-app Purchases"] = purchases.fillna(purchases_fill)

    # Price is compared numerically in price_range; filling with the string
    # "0" would create a mixed-type column and break those comparisons.
    data["Price"] = data["Price"].fillna(0.0)

    # errors="ignore" keeps the function safe to call again after Subtitle
    # has already been removed (e.g. when a notebook cell is re-run).
    data.drop(columns=["Subtitle"], inplace=True, errors="ignore")
    return data

def change_type(data):
    """Convert ``Age Rating`` to an ordinal code and the date columns to datetimes.

    ``Age Rating`` strings have every ``+`` removed, are cast to ``int`` and
    the values 4 / 9 / 12 / 17 are replaced by 1 / 2 / 3 / 4. Both release-date
    columns are passed through ``pd.to_datetime``. The frame is modified in
    place and returned.
    """
    age_rating_map = {4: 1, 9: 2, 12: 3, 17: 4}
    data['Age Rating'] = (
        data['Age Rating']
        .str.replace('+', '', regex=False)
        .astype(int)
        .replace(age_rating_map)
    )

    # NOTE(review): no explicit format/dayfirst is given, so parsing relies on
    # pandas' inference -- confirm it matches the date layout of the source file.
    for date_col in ('Original Release Date', 'Current Version Release Date'):
        data[date_col] = pd.to_datetime(data[date_col])
    return data

# Average the comma-separated prices stored in an in-app-purchases column.
def avarage_Purchases(data, col):
    """Replace each comma-separated price list in ``data[col]`` by its mean.

    Missing cells are treated as ``"0"``. Every cell is converted to text,
    split on ``","`` and the arithmetic mean of the parsed numbers is stored
    back as a float. Cells without any parsable part yield ``0.0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is overwritten in place.
    col : str
        Name of the column holding values such as ``"1.99, 0.99"``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``data[col]`` as a float column.
    """
    # Imported locally: the notebook's import cell does not provide it.
    from statistics import mean

    def _mean_price(cell):
        prices = [float(part) for part in cell.split(",") if part.strip()]
        return mean(prices) if prices else 0.0

    # A single map over the column replaces the original row loop, which used
    # chained assignment (data[col][i] = ...): that form depends on the index
    # being exactly 0..n-1 and does not write through under Copy-on-Write.
    data[col] = data[col].fillna("0").astype(str).map(_mean_price).astype(float)
    return data

In [64]:
def cleaning(df):
    """Run the cleaning steps shared by the train and test data.

    Steps, in order:
      1. ``avarage_Purchases`` on ``"In-app Purchases"``.
      2. ``change_type`` (age rating and date columns).
      3. ``null_elements`` (fill missing values, drop ``Subtitle``).
      4. Replace ``Genres`` by one indicator column per genre, splitting the
         text on ``", "``.

    Returns the resulting frame. Steps 1-3 modify the passed frame in place;
    step 4 builds a new frame.
    """
    df = avarage_Purchases(df, "In-app Purchases")
    df = change_type(df)
    df = null_elements(df)

    # One-hot encoding of "Languages" is intentionally not applied here.

    # axis / columns are passed by keyword: the positional form
    # (df.drop("Genres", 1), pd.concat([...], 1)) is deprecated and rejected
    # by pandas 2.x.
    genre_dummies = df["Genres"].str.get_dummies(sep=", ")
    df = pd.concat([df.drop(columns="Genres"), genre_dummies], axis=1)
    return df

**Feature Engineering**

In [65]:
def extract_us_word(links):
    """Return the first path segment of every URL in ``links``.

    For ``https://host/us/app/...`` the extracted value is ``"us"``.

    The result always has exactly one entry per input link, so it can be
    assigned straight back to a DataFrame column. Links that are not strings
    or that have no path segment are reported on stdout and yield ``None``.

    Parameters
    ----------
    links : iterable
        URLs (typically a pandas Series of strings).

    Returns
    -------
    list
        First path segment, or ``None`` for a malformed link.
    """
    us_words = []
    for link in links:
        # Non-string cells (e.g. NaN) cannot be parsed by urlparse.
        segments = urlparse(link).path.split('/') if isinstance(link, str) else []
        if len(segments) > 1:
            us_words.append(segments[1])
        else:
            print(f"Malformed link: {link}")
            # Keep a placeholder so the output stays aligned with the input.
            us_words.append(None)
    return us_words


def extract_is_color(links):
    """Return the first ``is<digit>`` token found in each link's host name.

    The network location of every URL is lower-cased and searched with the
    pattern ``is\\d``. The matched text (e.g. ``"is3"``) is collected, or
    ``None`` when the host contains no such token. One entry is produced per
    input link.
    """
    def _host_token(link):
        host = urlparse(link).netloc.lower()
        found = re.search(r"is\d", host)
        return found.group() if found else None

    return [_host_token(link) for link in links]


def languages_count(df):
    """Return, per row, the number of commas in ``Languages`` plus one."""
    comma_count = df['Languages'].str.count(',')
    return comma_count + 1

def frequent_words_in_name(df):
    """Replace ``Name`` by a count of frequent words it contains, plus one.

    First pass: every string in ``df['Name']`` is tokenized and POS-tagged.
    Tokens tagged as noun or verb (tag starts with ``N`` or ``V``) that are
    not English stop words are counted case-insensitively. The 50 most
    common words form the vocabulary.

    Second pass: each name (converted with ``str`` and lower-cased) is
    tokenized again and replaced by
    ``1 + number of tokens that belong to the vocabulary``.

    The frame is modified in place and returned.

    NOTE(review): ``nltk.pos_tag`` may need a tagger resource in addition to
    the ``stopwords`` and ``punkt`` packages downloaded in the import cell --
    confirm it is available in the target environment.
    """
    stop_words = set(stopwords.words('english'))
    word_freq = Counter()

    for name in df['Name']:
        # Non-string cells (e.g. NaN) contribute nothing to the vocabulary.
        if not isinstance(name, str):
            continue
        # Tag the original tokens; casing can influence the tagger.
        for word, pos in nltk.pos_tag(word_tokenize(name)):
            lowered = word.lower()
            # Compare the lower-cased token: the stop-word list is lower-case,
            # so Title-Case tokens would otherwise slip through the filter.
            if lowered in stop_words:
                continue
            if pos.startswith(('N', 'V')):
                word_freq[lowered] += 1

    # A set gives constant-time membership tests in the second pass.
    most_common_words = {word for word, _ in word_freq.most_common(50)}

    df['Name'] = df['Name'].apply(
        lambda x: sum(1 for word in word_tokenize(str(x).lower()) if word in most_common_words) + 1
    )
    return df


def count_dev_games(df):
    """Replace each ``Developer`` value by how often it occurs in ``df``.

    The frame is modified in place and returned.
    """
    games_per_developer = df['Developer'].value_counts()
    df['Developer'] = df['Developer'].map(games_per_developer)
    return df

def calc_duration(df):
    """Add ``Years Since Release``: calendar-year gap between the two release dates, plus one.

    Both date columns must already be datetime typed. The ``+ 1`` keeps the
    feature at a minimum of 1 when both dates fall in the same year, so it
    never contributes a zero factor during model training. The frame is
    modified in place and returned.
    """
    first_release_year = df['Original Release Date'].dt.year
    latest_release_year = df['Current Version Release Date'].dt.year
    df['Years Since Release'] = (latest_release_year - first_release_year) + 1
    return df

def price_range(df):
    """Replace numeric ``Price`` by a price-band label.

    Bands (contiguous, so no positive price falls between two bands):

    * ``price == 0``            -> ``"Free"``
    * ``0 < price <= 4.99``     -> ``"Low Price"``
    * ``4.99 < price <= 19.99`` -> ``"Medium Price"``
    * anything else             -> ``"High Price"``

    Missing prices stay missing. Numeric text such as ``"0"`` is converted
    first; non-numeric text raises ``ValueError`` from ``pd.to_numeric``.
    The frame is modified in place and returned.
    """
    def _band(price):
        if price == 0:
            return "Free"
        if 0 < price <= 4.99:
            return "Low Price"
        if 4.99 < price <= 19.99:
            return "Medium Price"
        return "High Price"

    # Coerce numeric strings so a stray "0" cannot break the comparisons.
    prices = pd.to_numeric(df["Price"])
    # Rows dropped here are re-aligned on assignment and remain NaN.
    df["Price"] = prices.dropna().map(_band)
    return df

In [66]:
def feature_engineering(df):
    """Derive the model features from the cleaned frame.

    Columns are transformed and then renamed in place:

    * ``URL``       -> ``Country`` (first path segment of the URL)
    * ``Icon URL``  -> ``Color`` (``is<digit>`` token of the icon host)
    * ``Price``     -> price-band label
    * ``Developer`` -> ``Other by developer`` (occurrences of the developer)
    * ``Languages`` -> ``Languages Count``
    * new column ``Years Since Release``

    Returns the transformed frame.
    """
    # URL-derived features
    df['URL'] = extract_us_word(df['URL'])
    df['Icon URL'] = extract_is_color(df['Icon URL'])
    df.rename(columns={'URL': 'Country', 'Icon URL': 'Color'}, inplace=True)

    # price band
    df = price_range(df)

    # The frequent-words-in-name feature is currently disabled:
    #df = frequent_words_in_name(df)
    #df.rename(columns = {'Name':'frequent words in Name'}, inplace = True)

    # developer frequency and language count
    df = count_dev_games(df)
    df['Languages'] = languages_count(df)
    df.rename(
        columns={'Developer': 'Other by developer', 'Languages': 'Languages Count'},
        inplace=True,
    )

    # years between the original and the current release
    df = calc_duration(df)
    return df

def scaling(df):
    """Min-max scale ``User Rating Count`` and ``Size`` to the range [0, 1].

    A single ``MinMaxScaler`` instance is re-fitted for each column in turn,
    so each column is scaled by its own minimum and maximum. The frame is
    modified in place and returned.
    """
    scaler = MinMaxScaler()
    for column in ('User Rating Count', 'Size'):
        # fit_transform re-fits the scaler on this column only
        df[column] = scaler.fit_transform(df[[column]])
    return df

In [69]:
# Run the complete preprocessing pipeline on the regression dataset.
df = cleaning(pd.read_csv('games-regression-dataset.csv'))

# Cap outliers in the numeric columns (each call returns the same frame).
outliarlist = ["User Rating Count", "In-app Purchases", "Size"]
for i in outliarlist:
    df = outliers(df, i)

# Remove duplicate rows, scale, then derive the engineered features.
df = scaling(dups(df))
df = feature_engineering(df)
df.head()

  data['Original Release Date'] = pd.to_datetime(data['Original Release Date'])
  data['Current Version Release Date'] = pd.to_datetime(data['Current Version Release Date'])
  df = pd.concat([df.drop("Genres", 1), df["Genres"].str.get_dummies(sep=", ")], 1)
  df = pd.concat([df.drop("Genres", 1), df["Genres"].str.get_dummies(sep=", ")], 1)


Unnamed: 0,Country,ID,Name,Color,User Rating Count,Price,In-app Purchases,Description,Other by developer,Age Rating,...,Simulation,Social Networking,Sports,Stickers,Strategy,Travel,Trivia,Utilities,Word,Years Since Release
0,us,1264483706,HEIR OF LIGHT,is3,1.0,Free,14.602976,"A Dark Fantasy, Collectible RPG\n\nDarkness ha...",2,3,...,0,0,0,0,1,0,0,0,0,2
1,us,607705356,Endgame:Eurasia,is4,0.017381,Free,0.0,"""This interactive experience is an exploration...",2,3,...,1,0,0,0,1,0,0,0,0,5
2,us,627491527,Free Solitaire+,is5,0.011173,Free,0.0,Same Solitaire game with classic Solitaire run...,3,1,...,0,0,0,0,1,0,0,0,0,3
3,us,430252596,Draft Trainer,is1,0.103042,Low Price,0.0,** Discounted for a limited time **\n\nEver wo...,1,2,...,0,0,0,0,1,0,0,1,0,9
4,us,1115082819,Rogue Knight: Infested Lands,is2,0.009932,Low Price,0.0,Fight or sneak your way through hordes of mons...,1,3,...,0,0,0,0,1,0,0,0,0,3
