# Preprocessing(2) & Analysis 

## Importing Necessary Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.tree import plot_tree
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sb
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Loading Data

In [2]:
df = pd.read_csv('preprocessed_game_info.csv')

In [3]:
# Attempt a numeric parse of every object-dtype column. A column that
# cannot be parsed is left as it is and its name is reported.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for column in object_columns:
    try:
        converted = pd.to_numeric(df[column])
    except ValueError:
        # Handle the error if conversion to numeric fails
        print(f"Conversion failed for column: {column}")
    else:
        df[column] = converted

Conversion failed for column: NAME
Conversion failed for column: DEVELOPERS


In [None]:
# One 'hls' palette colour per DataFrame column, plus a name -> colour lookup.
features = df.columns
feature_colors = sns.color_palette('hls', n_colors=len(features))
feature_color_map = {feature: color for feature, color in zip(features, feature_colors)}
feature_colors

## Initial Data Exploration

In [None]:
df.head()

In [None]:
df.info()

## Plot the Count of Unique Values for Each Feature 

In [None]:
# Distinct-value count of every column, drawn as one bar per column.
unique_counts = df.nunique()

unique_fig, unique_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=unique_counts.index,
    y=unique_counts.values,
    palette=feature_colors,
    ax=unique_ax,
)
unique_ax.tick_params(axis='x', labelrotation=90)
unique_ax.set_title('Unique Values Count per Feature')
unique_ax.set_ylabel('Count')
unique_ax.set_xlabel('Feature')
plt.show()

## Selecting Numerical Columns (Excluding the Platform Flag Columns)

In [None]:
# Numeric columns of df, minus the three platform flag columns.
exclude_columns = ['Windows', 'macOS', 'Linux']
numerical_columns = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in exclude_columns
]
numerical_df = df[numerical_columns]

## Outlier Detection and Handling

In [None]:
def plot_boxplots(df, exclude_columns=None):
    """Draw one box plot per numeric column of ``df`` on a 3-column grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns (``np.number`` dtypes) are plotted.
    exclude_columns : iterable of column labels, optional
        Numeric columns to leave out. ``None`` (the default) excludes nothing.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When no numeric column is
        left to plot, a message is printed and no figure is created.
    """
    excluded = tuple(exclude_columns) if exclude_columns is not None else ()
    numerical_columns = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]

    # Nothing to draw: a grid with zero rows cannot be created.
    if not numerical_columns:
        print("No numerical columns to plot.")
        return

    grid_width = 3
    num_rows = (len(numerical_columns) + grid_width - 1) // grid_width

    # squeeze=False keeps `axes` two-dimensional for every grid shape.
    fig, axes = plt.subplots(
        nrows=num_rows,
        ncols=grid_width,
        figsize=(18, 5 * num_rows),
        squeeze=False,
    )
    axes = axes.flatten()

    for ax, column in zip(axes, numerical_columns):
        df.boxplot(column=column, ax=ax)
        ax.set_title(f'Box Plot of {column}')

    # Remove the unused cells of the last grid row.
    for unused_ax in axes[len(numerical_columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
plot_boxplots(df, exclude_columns=['Windows', 'macOS', 'Linux'])


In [None]:
def suggest_outlier_handling_method(df, exclude_columns=None, show_plots=True):
    """Suggest an outlier-handling method for each numeric column of ``df``.

    For every numeric column (``np.number`` dtypes) not listed in
    ``exclude_columns`` the function optionally plots a histogram and a box
    plot, prints mean / median / standard deviation and the number of values
    outside the 1.5 * IQR fences, and then picks a method:

    * ``'cap'``       - 5% or more of the rows are outside the fences;
    * ``'transform'`` - fewer than 5% are outside and
      ``|mean - median| / std`` is greater than 0.5;
    * ``'remove'``    - fewer than 5% are outside otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    exclude_columns : iterable of column labels, optional
        Numeric columns to skip. ``None`` (the default) skips nothing.
    show_plots : bool, default True
        When False the per-column figures are not drawn; the printed report
        and the returned mapping are unaffected.

    Returns
    -------
    dict
        Mapping of column label to ``'remove'``, ``'cap'`` or ``'transform'``.
    """
    excluded = tuple(exclude_columns) if exclude_columns is not None else ()
    numerical_columns = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]

    methods = {}
    total_rows = len(df)

    for column in numerical_columns:
        values = df[column]

        if show_plots:
            # Plot histogram and box plot
            fig, ax = plt.subplots(1, 2, figsize=(12, 6))
            sns.histplot(values, bins=30, kde=True, ax=ax[0])
            ax[0].set_title(f'Histogram of {column}')
            sns.boxplot(x=values, ax=ax[1])
            ax[1].set_title(f'Box Plot of {column}')
            plt.show()
            # One figure is created per column; release it once displayed.
            plt.close(fig)

        # Calculate basic statistics
        mean = values.mean()
        median = values.median()
        std = values.std()

        print(f'{column} Statistics:')
        print(f'Mean: {mean}, Median: {median}, Standard Deviation: {std}')

        # Determine outliers with the 1.5 * IQR fences
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())

        print(f'Number of Outliers: {outlier_count}')

        # A zero-row frame has no outliers; avoid dividing by zero.
        outlier_ratio = outlier_count / total_rows if total_rows else 0.0

        if outlier_ratio < 0.05:
            # With a zero or undefined std the skew ratio is meaningless, so
            # the column is treated as not skewed (same outcome as a NaN
            # comparison, without the 0/0 division).
            is_skewed = bool(std > 0) and abs(mean - median) / std > 0.5
            methods[column] = 'transform' if is_skewed else 'remove'
        else:
            methods[column] = 'cap'

        print(f'Suggested Method for {column}: {methods[column]}')
        print('-' * 40)

    return methods

In [None]:
suggested_methods = suggest_outlier_handling_method(df, exclude_columns=['Windows', 'macOS', 'Linux'])


In [None]:
suggested_methods

In [None]:
def handle_outliers(df, methods, exclude_columns=None):
    """Return a copy of ``df`` with outliers handled column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    methods : dict
        Mapping of column label to one of:

        * ``'remove'``    - keep only rows whose value lies inside the
          1.5 * IQR fences (rows where the value is NaN fail the comparison
          and are dropped as well);
        * ``'cap'``       - clip values to the 1.5 * IQR fences;
        * ``'transform'`` - replace the column with ``np.log1p`` of itself.

        Columns are processed in the mapping's order, so the fences of a
        later column are computed on the rows left by earlier ``'remove'``
        steps.
    exclude_columns : iterable of column labels, optional
        Columns to skip even if they appear in ``methods``. ``None`` (the
        default) skips nothing.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If a method is not ``'remove'``, ``'cap'`` or ``'transform'``.
    """
    valid_methods = ('remove', 'cap', 'transform')
    excluded = tuple(exclude_columns) if exclude_columns is not None else ()

    df_cleaned = df.copy()

    for column, method in methods.items():
        if column in excluded:
            continue

        if method not in valid_methods:
            raise ValueError("Method must be either 'remove', 'cap', or 'transform'")

        if method == 'transform':
            # The log transform does not need the IQR fences.
            df_cleaned[column] = np.log1p(df_cleaned[column])
            continue

        Q1 = df_cleaned[column].quantile(0.25)
        Q3 = df_cleaned[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        if method == 'remove':
            inside_fences = df_cleaned[column].between(lower_bound, upper_bound)
            # .copy() so later column assignments do not write into a slice.
            df_cleaned = df_cleaned[inside_fences].copy()
        else:  # 'cap'
            df_cleaned[column] = df_cleaned[column].clip(lower=lower_bound, upper=upper_bound)

    return df_cleaned

In [None]:
# Apply the suggested methods. The platform flags are excluded under the same
# column names used elsewhere in this notebook ('macOS', not 'Mac'), and PRICE
# is excluded as well.
df = handle_outliers(df, suggested_methods, exclude_columns=['Windows', 'macOS', 'Linux', 'PRICE'])


In [None]:
df.info()

In [None]:
plot_boxplots(df, exclude_columns=['Windows', 'macOS', 'Linux'])


## Normalization/Standardization

## Descriptive Statistics


In [None]:
df.describe()

## Correlation Heatmap

In [None]:
# numerical_df was built before the outlier-handling step rebound df, so it is
# rebuilt here from the current df; otherwise the heatmap would show the
# correlations of the pre-cleaning data.
numerical_df = df[numerical_columns]

fig, ax = plt.subplots(figsize=(10, 8))
sb.heatmap(numerical_df.corr(), annot=True, cmap="YlGnBu", ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')

# Tilt the x tick labels so that long column names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

In [None]:
# Correlation of each numeric feature with PRICE, ordered by absolute value,
# together with the sign of the correlation.
corr_matrix = numerical_df.corr()
price_corr = corr_matrix['PRICE']
correlations = price_corr.abs().sort_values(ascending=False)

ranked_price_corr = price_corr.loc[correlations.index]
correlation_info = pd.DataFrame({
    'Correlation': ranked_price_corr,
    'Type': np.where(ranked_price_corr > 0, '+', '-'),
})

print("\nCorrelation of features with the target (PRICE):")
correlation_info

In [None]:
# Signed correlation of every numeric feature with PRICE, largest first.
target_corr = corr_matrix['PRICE'].sort_values(ascending=False)

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=target_corr.index, y=target_corr, palette='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation of Features with PRICE (Target)')
corr_ax.set_xlabel('Features')
corr_ax.set_ylabel('Correlation')
corr_ax.tick_params(axis='x', labelrotation=45)
corr_fig.tight_layout()
plt.show()


## Histograms

In [None]:
# One 30-bin histogram per column of numerical_df, on a shared figure.
histogram_axes = numerical_df.hist(figsize=(15, 10), bins=30)
plt.suptitle("Histograms of Numerical Columns")
plt.show()

##  Scatter Plots

## Pair Plots

### Top 20 DEVELOPERS

In [None]:
# Explode
# NOTE(review): DataFrame.explode only expands list-like cells. If DEVELOPERS
# holds plain strings as read from the CSV, this leaves the rows unchanged -
# confirm the stored format.
df_exploded = df.explode('DEVELOPERS')

top_20_developers = df_exploded['DEVELOPERS'].value_counts().nlargest(20).index

df_top_20 = df_exploded[df_exploded['DEVELOPERS'].isin(top_20_developers)]

# Silence warnings only while this chart is drawn; a notebook-wide
# warnings.filterwarnings("ignore") would also hide warnings from every later
# cell. `warnings` is already imported in the imports cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    plt.figure(figsize=(10, 8))
    sb.countplot(y='DEVELOPERS', data=df_top_20, order=top_20_developers)
    plt.title('Top 20 Developers by Number of Games')
    plt.show()

### Top 10 GENRES

In [None]:
# Explode
df_exploded = df.explode('STORE_GENRE')

top_10_genre = df_exploded['STORE_GENRE'].value_counts().nlargest(10).index

df_top_10 = df_exploded[df_exploded['STORE_GENRE'].isin(top_10_genre)]

plt.figure(figsize=(10, 8))

sb.countplot(y='STORE_GENRE', data=df_top_10, order=top_10_genre)
plt.show()

### Distribution of Publish Year

In [None]:
# Replace missing publish years by the median year (df itself is not changed).
publish_year = df['PUBLISH_YEAR']
median_publish_year = publish_year.median()
filled_publish_year = publish_year.fillna(value=median_publish_year)

In [None]:
# Plot the histogram
plt.hist(filled_publish_year, bins=100)
plt.xlabel('Time')
plt.ylabel('Count')
plt.title('Distribution of Publish Year')
plt.show()


### Platform Analysis

In [None]:
# Mean PRICE of the games whose platform flag equals 1, one bar per platform.
windows_avg_price = df.loc[df['Windows'].eq(1), 'PRICE'].mean()
linux_avg_price = df.loc[df['Linux'].eq(1), 'PRICE'].mean()
mac_avg_price = df.loc[df['macOS'].eq(1), 'PRICE'].mean()

platforms = ['Windows', 'Linux', 'Mac']
average_prices = [windows_avg_price, linux_avg_price, mac_avg_price]
colors = ['skyblue', 'plum', 'pink']

plt.figure(figsize=(10, 6))
bars = plt.bar(platforms, average_prices, color=colors)

# Write the rounded average just above each bar.
for bar, avg_price in zip(bars, average_prices):
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, bar.get_height(), round(avg_price, 2), va='bottom')  # va: vertical alignment

plt.title('Average Price by Platform')
plt.xlabel('Platform')
plt.ylabel('Average Price')
plt.show()

In [None]:

# Number of platform flags set per game, then the mean of four metrics for
# every distinct platform count.
platform_flag_columns = ['Windows', 'Linux', 'macOS']
df['PlatformCount'] = df[platform_flag_columns].sum(axis=1)

mean_of_metrics = {
    'RATING_SCORE': 'mean',
    'POSITIVE_REVIEWS': 'mean',
    'PRICE': 'mean',
    'NEGATIVE_REVIEWS': 'mean',
}
platform_groups = df.groupby('PlatformCount').agg(mean_of_metrics).reset_index()

platform_groups

In [None]:
# One bar chart per metric: mean value against the number of supported platforms.
colors = ['skyblue', 'plum', 'pink']
metrics = ['RATING_SCORE', 'POSITIVE_REVIEWS', 'NEGATIVE_REVIEWS', 'PRICE']

for metric in metrics:
    plt.figure(figsize=(10, 5))
    plt.bar(platform_groups['PlatformCount'], platform_groups[metric], color=colors)
    plt.xlabel('Number of Platforms Supported')
    plt.ylabel(metric)
    plt.title(f'{metric} vs. Number of Platforms Supported')
    # Label every platform count that actually occurs. range(1, 3) labelled
    # only 1 and 2, although the sum of three flags can be anything from 0 to 3.
    plt.xticks(platform_groups['PlatformCount'])
    plt.show()

### Plot the average price for each year

In [None]:
# np.sort places NaN last, whereas sorted() gives an unreliable order as soon
# as a NaN is present (every comparison against NaN is False). tolist() prints
# plain Python numbers.
print(np.sort(df["PUBLISH_YEAR"].unique()).tolist())

In [None]:
# Mean PRICE for every publish year, drawn as a line with point markers.
average_price_per_year = df.groupby('PUBLISH_YEAR')['PRICE'].mean()

# Plot the average price for each year
yearly_fig, yearly_ax = plt.subplots()
yearly_ax.plot(
    average_price_per_year.index,
    average_price_per_year.values,
    marker='o',
)
yearly_ax.set_xlabel('Year')
yearly_ax.set_ylabel('Average Price')
yearly_ax.set_title('Average Price of Games per Year')
yearly_ax.grid(True)
plt.show()

### Relationship between Price and Genre

In [None]:
# Remember which genres are missing before the string conversion:
# astype(str) turns NaN into the text 'nan', which the word regex below would
# otherwise report as a genre called "nan".
genre_is_missing = df['STORE_GENRE'].isna()
genre_text = df['STORE_GENRE'].astype(str)

# Store the text form but keep missing entries missing, so that re-running
# this cell gives the same result.
df['STORE_GENRE'] = genre_text.where(~genre_is_missing)

# PRIMARY_GENRE is the first purely alphabetic word of the genre text; rows
# with no such word, or with a missing genre, get a missing value.
df['PRIMARY_GENRE'] = (
    genre_text.str.extract(r'\b([A-Za-z]+)\b', expand=False).where(~genre_is_missing)
)


In [None]:
# Mean Price Difference by Genre
print("Mean Prices by Genre")
mean_prices_by_genre = df.groupby('PRIMARY_GENRE')['PRICE'].mean().reset_index()
plt.figure(figsize=(10, 6))
plt.bar(mean_prices_by_genre['PRIMARY_GENRE'], mean_prices_by_genre['PRICE'], color='skyblue')
plt.xlabel('Genre')
plt.ylabel('Mean Price ($)')
plt.title('Mean Price by Genre')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Comparison of Top 20 Games based on Genre

# The 20 rows with the highest RATING_SCORE, counted by PRIMARY_GENRE.
top_20_games = df.sort_values(by='RATING_SCORE', ascending=False).head(20)

top_20_genre_counts = (
    top_20_games['PRIMARY_GENRE']
    .value_counts()
    .rename_axis('PRIMARY_GENRE')
    .reset_index(name='COUNT')
)
print("Top 20 Games by Genre")

top_fig, top_ax = plt.subplots(figsize=(10, 6))
top_ax.bar(
    top_20_genre_counts['PRIMARY_GENRE'],
    top_20_genre_counts['COUNT'],
    color='lightcoral',
)
top_ax.set_xlabel('Genre')
top_ax.set_ylabel('Count of Top 20 Games')
top_ax.set_title('Distribution of Genres Among Top 20 Games')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Summary statistics for the PRICE column

In [None]:
# 20-bin histogram of PRICE.
price_hist_fig, price_hist_ax = plt.subplots()
price_hist_ax.hist(df['PRICE'], bins=20, color='skyblue', edgecolor='black')
price_hist_ax.set_xlabel('Price')
price_hist_ax.set_ylabel('Frequency')
price_hist_ax.set_title('Price Distribution')
plt.show()

In [None]:
# FREE is True for games whose PRICE is exactly 0.
df = df.assign(FREE=df['PRICE'] == 0)

# reindex guarantees that both classes are present (count 0 when absent), so
# the lookups below cannot raise KeyError when every game is free or every
# game is paid.
free_counts = df['FREE'].value_counts().reindex([True, False], fill_value=0)
free_prices_dict = dict(free_counts)

labels = "Free", "Paid"
sizes = [free_prices_dict[True], free_prices_dict[False]]
explode = (0.2, 0)  # pull the "Free" wedge out of the pie

with sb.color_palette('colorblind'):
    plt.pie(sizes, explode=explode, labels=labels,
            autopct='%1.1f%%', shadow=True, startangle=140)

plt.axis('equal')
plt.title('Share of Free vs. Paid Games')
plt.show()

In [None]:
price_summary = df['PRICE'].describe()
price_summary

### Analyze the correlation between PRICE and POSITIVE_REVIEWS

In [None]:

# Rows with both PRICE and POSITIVE_REVIEWS present, their correlation, and a
# scatter plot with a fitted regression line.
pair_columns = ['PRICE', 'POSITIVE_REVIEWS']
price_positive_corr = df[pair_columns].dropna()
correlation = price_positive_corr.corr().at['PRICE', 'POSITIVE_REVIEWS']

plt.figure(figsize=(10, 6))
sns.regplot(
    data=price_positive_corr,
    x='PRICE',
    y='POSITIVE_REVIEWS',
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'red'},
)
plt.title('Correlation between Price and Positive Reviews')
plt.xlabel('Price ($)')
plt.ylabel('Positive Reviews')
plt.grid(True)
plt.show()

correlation

### Developer Analysis

In [None]:
# Per developer: number of rows and mean PRICE, ordered by number of rows.
developer_game_count = df['DEVELOPERS'].value_counts()

developer_price_summary = (
    df.groupby('DEVELOPERS')['PRICE']
    .mean()
    .sort_values(ascending=False)
)

# The two Series are aligned on the developer label by the constructor.
developer_summary = pd.DataFrame({
    'Game Count': developer_game_count,
    'Average Price': developer_price_summary,
})
developer_summary = developer_summary.dropna()
developer_summary = developer_summary.sort_values(by='Game Count', ascending=False)

developer_summary

### Relationship between TOTAL_REVIEW and PRICE

In [None]:
# Plot the relationship between TOTAL_REVIEW and PRICE
plt.scatter(df['TOTAL_REVIEW'], df['PRICE'], alpha=0.5)
plt.xlabel('Total Review Ratio')
plt.ylabel('Price')
plt.title('Relationship between Total Review Ratio and Price')
plt.grid(True)
plt.show()


In [None]:
# jointplot builds a JointGrid with a joint axes and two marginal axes.
# plt.xlabel / plt.ylabel / plt.title act on the current axes, which is not
# the joint axes, so the labels are set through the grid instead.
review_grid = sb.jointplot(x='TOTAL_REVIEW', y='PRICE', data=df, kind='reg', scatter_kws={'alpha':0.5})
review_grid.set_axis_labels('Total Review Ratio', 'Price')
# Figure-level title, placed slightly above the top marginal plot.
review_grid.ax_joint.figure.suptitle('Relationship between Total Review Ratio and Price', y=1.02)
plt.show()


In [None]:
# Create a hexbin plot
# Create a hexbin plot
# The labels and title go through the JointGrid: plt.xlabel / plt.ylabel /
# plt.title would act on the current axes, which is not the joint axes.
hex_grid = sb.jointplot(x='TOTAL_REVIEW', y='PRICE', data=df, kind='hex', gridsize=20)
hex_grid.set_axis_labels('Total Review Ratio', 'Price')
# Figure-level title, placed slightly above the top marginal plot.
hex_grid.ax_joint.figure.suptitle('Relationship between Total Review Ratio and Price', y=1.02)
plt.show()


In [None]:
df.nlargest(20, "TOTAL_REVIEW")["NAME"]

In [None]:
df['TOTAL_REVIEW'].nlargest(n=25)

In [None]:
# Games whose TOTAL_REVIEW is at least 0.99, highest first.
high_review_columns = ["NAME", "DEVELOPERS", "STORE_GENRE", "TOTAL_REVIEW"]
(
    df.loc[df["TOTAL_REVIEW"] >= 0.99]
    .sort_values(by="TOTAL_REVIEW", ascending=False)
    [high_review_columns]
)