In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import scipy.stats as stats
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.impute import KNNImputer

url = "https://raw.githubusercontent.com/Belphegorus86/Gacha-Analyse-them-all-Mid-Project-Ironhack/main/CSV/Gacha_Data_Final.csv"
df = pd.read_csv(url)

In [2]:
df = df.drop(['Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Index              104 non-null    int64 
 1   Game               92 non-null     object
 2   Region             92 non-null     object
 3   Downloads 03/2023  90 non-null     object
 4   Revenue 03/2023    92 non-null     object
 5   Downloads 04/2023  92 non-null     object
 6   Revenue 04/2023    91 non-null     object
 7   Downloads 05/2023  88 non-null     object
 8   Revenue 05/2023    88 non-null     object
 9   Downloads 06/2023  92 non-null     object
 10  Revenue 06/2023    92 non-null     object
 11  Downloads 07/2023  92 non-null     object
 12  Revenue 07/2023    92 non-null     object
 13  Downloads 08/2023  92 non-null     object
 14  Revenue 08/2023    92 non-null     object
 15  Downloads 09/2023  92 non-null     object
 16  Revenue 09/2023    92 non-null     object
 1

In [3]:
column_data = df['Game'].tolist()
column_data

['Genshin Impact',
 'Honkai Star Rail',
 'Honkai Star Rail',
 'Genshin Impact',
 'Uma Musume',
 'FGO',
 'Heaven Burns Red',
 'Blue Archive',
 'NIKKE',
 'Diablo Immoral',
 'Memento Mori',
 'Blue Archive',
 'Project Sekai',
 'FGO',
 'Fire Emblem Heroes',
 'Arknights',
 'Epic Seven',
 'Princess Connect',
 'FF7',
 'Atelier Resleriana',
 'IM@S Deresute',
 'Azur Lane',
 'Shironeko Project',
 'Azur Lane',
 'Arknights',
 'Bang Dream',
 'IM@S Mirishita',
 'Project Sekai',
 'Punishing Gray Raven',
 'Limbus Company',
 'Atelier Resleriana',
 'BrownDust2',
 'FF7',
 'World Dai Star',
 'Idoly Pride',
 'Tower of Fantasy',
 'Dolphin Wave',
 'Honkai 3rd',
 'Utawarerumono',
 'Assault Lily',
 'Counterside',
 'Reverse 1999',
 'Aether Gazer',
 'Eversoul',
 'Honkai 3rd',
 'Reverse 1999',
 'Towatsugai',
 'Tears of Themis',
 'Snowbreak',
 'Magia Record',
 'Touhou Gensou Eclipse',
 'Punishing Gray Raven',
 'Guardian Tales',
 'D4DJ',
 'Guardian Tales',
 'IM@S Shinymas',
 'Revue Starlight',
 'Blue Reflection Sun'

In [4]:
values_to_remove = ['Princess Connect', 'Revived Witch', 'LL SIF', 'Counterside', 'IM@S Deremas', 'Warau Arsnotoria', 'IM@S SideM', 'Dragalia Lost', np.nan]
df = df[~df['Game'].isin(values_to_remove)]
df

Unnamed: 0,Index,Game,Region,Downloads 03/2023,Revenue 03/2023,Downloads 04/2023,Revenue 04/2023,Downloads 05/2023,Revenue 05/2023,Downloads 06/2023,...,Downloads 10/2023,Revenue 10/2023,Downloads 11/2023,Revenue 11/2023,Downloads 12/2023,Revenue 12/2023,Downloads 01/2024,Revenue 01/2024,Downloads 02/2024,Revenue 02/2024
0,1,Genshin Impact,US,2800000,34000000,2900000,48000000,3000000,45000000,2700000,...,2.700.000,38000000,1700000,55000000,2700000,31000000,2700000,36000000,2600000,32000000
1,2,Honkai Star Rail,US,6000000,82000000,11000000,23000000,5000000,43000000,1700000,...,1.400.000,44000000,1200000,20000000,1600000,28000000,1300000,20000000,140000,30000000
2,3,Honkai Star Rail,CN,1000000,57000000,800000,52000000,650000,46000000,500000,...,400000,37000000,200000,1100000,300000,23000000,300000,10000000,300000,22000000
3,4,Genshin Impact,CN,600000,23000000,700000,46000000,700000,44000000,700000,...,700000,33000000,600000,44000000,110000,22000000,600000,23000000,700000,21000000
4,5,Uma Musume,JP,50000,27000000,50000,23000000,80000,74000000,30000,...,30000,21000000,30000,15000000,500000,20000000,40000,21000000,600000,20000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,80,404,JP,50000,600000,170000,190000,,,10000,...,10000,160000,10000,10000,0,0,0,0,0,0
80,81,Sinoalice,JP,10000,500000,10000,800000,10000,800000,10000,...,10000,140000,10000,10000,0,0,0,0,0,0
81,82,Sinoalice,US,10000,100000,10000,160000,10000,80000,10000,...,10000,10000,0,0,0,0,0,0,0,0
82,83,IM@S saisuta,JP,10000,25000,12000,400000,10000,600000,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.columns = df.columns.str.lower().str.replace('_', ' ')
df

Unnamed: 0,index,game,region,downloads 03/2023,revenue 03/2023,downloads 04/2023,revenue 04/2023,downloads 05/2023,revenue 05/2023,downloads 06/2023,...,downloads 10/2023,revenue 10/2023,downloads 11/2023,revenue 11/2023,downloads 12/2023,revenue 12/2023,downloads 01/2024,revenue 01/2024,downloads 02/2024,revenue 02/2024
0,1,Genshin Impact,US,2800000,34000000,2900000,48000000,3000000,45000000,2700000,...,2.700.000,38000000,1700000,55000000,2700000,31000000,2700000,36000000,2600000,32000000
1,2,Honkai Star Rail,US,6000000,82000000,11000000,23000000,5000000,43000000,1700000,...,1.400.000,44000000,1200000,20000000,1600000,28000000,1300000,20000000,140000,30000000
2,3,Honkai Star Rail,CN,1000000,57000000,800000,52000000,650000,46000000,500000,...,400000,37000000,200000,1100000,300000,23000000,300000,10000000,300000,22000000
3,4,Genshin Impact,CN,600000,23000000,700000,46000000,700000,44000000,700000,...,700000,33000000,600000,44000000,110000,22000000,600000,23000000,700000,21000000
4,5,Uma Musume,JP,50000,27000000,50000,23000000,80000,74000000,30000,...,30000,21000000,30000,15000000,500000,20000000,40000,21000000,600000,20000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,80,404,JP,50000,600000,170000,190000,,,10000,...,10000,160000,10000,10000,0,0,0,0,0,0
80,81,Sinoalice,JP,10000,500000,10000,800000,10000,800000,10000,...,10000,140000,10000,10000,0,0,0,0,0,0
81,82,Sinoalice,US,10000,100000,10000,160000,10000,80000,10000,...,10000,10000,0,0,0,0,0,0,0,0
82,83,IM@S saisuta,JP,10000,25000,12000,400000,10000,600000,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df = df.apply(lambda x: x.replace({',':''}, regex=True))
df

Unnamed: 0,index,game,region,downloads 03/2023,revenue 03/2023,downloads 04/2023,revenue 04/2023,downloads 05/2023,revenue 05/2023,downloads 06/2023,...,downloads 10/2023,revenue 10/2023,downloads 11/2023,revenue 11/2023,downloads 12/2023,revenue 12/2023,downloads 01/2024,revenue 01/2024,downloads 02/2024,revenue 02/2024
0,1,Genshin Impact,US,2800000,34000000,2900000,48000000,3000000,45000000,2700000,...,2.700.000,38000000,1700000,55000000,2700000,31000000,2700000,36000000,2600000,32000000
1,2,Honkai Star Rail,US,6000000,82000000,11000000,23000000,5000000,43000000,1700000,...,1.400.000,44000000,1200000,20000000,1600000,28000000,1300000,20000000,140000,30000000
2,3,Honkai Star Rail,CN,1000000,57000000,800000,52000000,650000,46000000,500000,...,400000,37000000,200000,1100000,300000,23000000,300000,10000000,300000,22000000
3,4,Genshin Impact,CN,600000,23000000,700000,46000000,700000,44000000,700000,...,700000,33000000,600000,44000000,110000,22000000,600000,23000000,700000,21000000
4,5,Uma Musume,JP,50000,27000000,50000,23000000,80000,74000000,30000,...,30000,21000000,30000,15000000,500000,20000000,40000,21000000,600000,20000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,80,404,JP,50000,600000,170000,190000,,,10000,...,10000,160000,10000,10000,0,0,0,0,0,0
80,81,Sinoalice,JP,10000,500000,10000,800000,10000,800000,10000,...,10000,140000,10000,10000,0,0,0,0,0,0
81,82,Sinoalice,US,10000,100000,10000,160000,10000,80000,10000,...,10000,10000,0,0,0,0,0,0,0,0
82,83,IM@S saisuta,JP,10000,25000,12000,400000,10000,600000,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
 columns_to_convert = [
    'downloads 03/2023', 'revenue 03/2023',
    'downloads 04/2023', 'revenue 04/2023',
    'downloads 05/2023', 'revenue 05/2023',
    'downloads 06/2023', 'revenue 06/2023',
    'downloads 07/2023', 'revenue 07/2023',
    'downloads 08/2023', 'revenue 08/2023',
    'downloads 09/2023', 'revenue 09/2023',
    'downloads 10/2023', 'revenue 10/2023',
    'downloads 11/2023', 'revenue 11/2023',
    'downloads 12/2023', 'revenue 12/2023',
    'downloads 01/2024', 'revenue 01/2024',
    'downloads 02/2024', 'revenue 02/2024'
]

df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, 0 to 83
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              82 non-null     int64  
 1   game               82 non-null     object 
 2   region             82 non-null     object 
 3   downloads 03/2023  81 non-null     float64
 4   revenue 03/2023    82 non-null     int64  
 5   downloads 04/2023  82 non-null     int64  
 6   revenue 04/2023    81 non-null     float64
 7   downloads 05/2023  78 non-null     float64
 8   revenue 05/2023    78 non-null     float64
 9   downloads 06/2023  82 non-null     int64  
 10  revenue 06/2023    82 non-null     int64  
 11  downloads 07/2023  82 non-null     int64  
 12  revenue 07/2023    82 non-null     int64  
 13  downloads 08/2023  82 non-null     int64  
 14  revenue 08/2023    82 non-null     int64  
 15  downloads 09/2023  82 non-null     int64  
 16  revenue 09/2023    82 non-null   

In [8]:
df.isnull().sum()/len(df)

index                0.000000
game                 0.000000
region               0.000000
downloads 03/2023    0.012195
revenue 03/2023      0.000000
downloads 04/2023    0.000000
revenue 04/2023      0.012195
downloads 05/2023    0.048780
revenue 05/2023      0.048780
downloads 06/2023    0.000000
revenue 06/2023      0.000000
downloads 07/2023    0.000000
revenue 07/2023      0.000000
downloads 08/2023    0.000000
revenue 08/2023      0.000000
downloads 09/2023    0.000000
revenue 09/2023      0.000000
downloads 10/2023    0.024390
revenue 10/2023      0.000000
downloads 11/2023    0.000000
revenue 11/2023      0.000000
downloads 12/2023    0.060976
revenue 12/2023      0.060976
downloads 01/2024    0.012195
revenue 01/2024      0.012195
downloads 02/2024    0.036585
revenue 02/2024      0.036585
dtype: float64

In [9]:
df.isnull().sum()

index                0
game                 0
region               0
downloads 03/2023    1
revenue 03/2023      0
downloads 04/2023    0
revenue 04/2023      1
downloads 05/2023    4
revenue 05/2023      4
downloads 06/2023    0
revenue 06/2023      0
downloads 07/2023    0
revenue 07/2023      0
downloads 08/2023    0
revenue 08/2023      0
downloads 09/2023    0
revenue 09/2023      0
downloads 10/2023    2
revenue 10/2023      0
downloads 11/2023    0
revenue 11/2023      0
downloads 12/2023    5
revenue 12/2023      5
downloads 01/2024    1
revenue 01/2024      1
downloads 02/2024    3
revenue 02/2024      3
dtype: int64

In [10]:
# Creathing new columns for total downloads and total revenue

downloads_columns = ['downloads 03/2023', 'downloads 04/2023', 'downloads 05/2023', 'downloads 06/2023',
                     'downloads 07/2023', 'downloads 08/2023', 'downloads 09/2023', 'downloads 10/2023',
                     'downloads 11/2023', 'downloads 12/2023', 'downloads 01/2024', 'downloads 02/2024']

revenue_columns = ['revenue 03/2023', 'revenue 04/2023', 'revenue 05/2023', 'revenue 06/2023',
                   'revenue 07/2023', 'revenue 08/2023', 'revenue 09/2023', 'revenue 10/2023',
                   'revenue 11/2023', 'revenue 12/2023', 'revenue 01/2024', 'revenue 02/2024']


df['total downloads'] = df[downloads_columns].sum(axis=1)
df['total revenue'] = df[revenue_columns].sum(axis=1)

df

Unnamed: 0,index,game,region,downloads 03/2023,revenue 03/2023,downloads 04/2023,revenue 04/2023,downloads 05/2023,revenue 05/2023,downloads 06/2023,...,downloads 11/2023,revenue 11/2023,downloads 12/2023,revenue 12/2023,downloads 01/2024,revenue 01/2024,downloads 02/2024,revenue 02/2024,total downloads,total revenue
0,1,Genshin Impact,US,2800000.0,34000000,2900000,48000000.0,3000000.0,45000000.0,2700000,...,1700000,55000000,2700000.0,31000000.0,2700000.0,36000000.0,2600000.0,32000000.0,29300000.0,455000000.0
1,2,Honkai Star Rail,US,6000000.0,82000000,11000000,23000000.0,5000000.0,43000000.0,1700000,...,1200000,20000000,1600000.0,28000000.0,1300000.0,20000000.0,140000.0,30000000.0,32640000.0,479000000.0
2,3,Honkai Star Rail,CN,1000000.0,57000000,800000,52000000.0,650000.0,46000000.0,500000,...,200000,1100000,300000.0,23000000.0,300000.0,10000000.0,300000.0,22000000.0,5550000.0,375100000.0
3,4,Genshin Impact,CN,600000.0,23000000,700000,46000000.0,700000.0,44000000.0,700000,...,600000,44000000,110000.0,22000000.0,600000.0,23000000.0,700000.0,21000000.0,7510000.0,352000000.0
4,5,Uma Musume,JP,50000.0,27000000,50000,23000000.0,80000.0,74000000.0,30000,...,30000,15000000,500000.0,20000000.0,40000.0,21000000.0,600000.0,20000000.0,1540000.0,331000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,80,404,JP,50000.0,600000,170000,190000.0,,,10000,...,10000,10000,0.0,0.0,0.0,0.0,0.0,0.0,320000.0,2030000.0
80,81,Sinoalice,JP,10000.0,500000,10000,800000.0,10000.0,800000.0,10000,...,10000,10000,0.0,0.0,0.0,0.0,0.0,0.0,90000.0,5250000.0
81,82,Sinoalice,US,10000.0,100000,10000,160000.0,10000.0,80000.0,10000,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,80000.0,900000.0
82,83,IM@S saisuta,JP,10000.0,25000,12000,400000.0,10000.0,600000.0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,32000.0,1025000.0


In [11]:
# create a model where the Nans have been transformed in "0"
X = df.drop(['total revenue', 'game', 'region'], axis=1)
Y = df['total revenue']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

X_train.head()

Unnamed: 0,index,downloads 03/2023,revenue 03/2023,downloads 04/2023,revenue 04/2023,downloads 05/2023,revenue 05/2023,downloads 06/2023,revenue 06/2023,downloads 07/2023,...,revenue 10/2023,downloads 11/2023,revenue 11/2023,downloads 12/2023,revenue 12/2023,downloads 01/2024,revenue 01/2024,downloads 02/2024,revenue 02/2024,total downloads
1,2,6000000.0,82000000,11000000,23000000.0,5000000.0,43000000.0,1700000,58000000,1600000,...,44000000,1200000,20000000,1600000.0,28000000.0,1300000.0,20000000.0,140000.0,30000000.0,32640000.0
31,32,0.0,0,0,0.0,0.0,0.0,0,0,0,...,800000,50000,600000,70000.0,1100000.0,130000.0,1400000.0,140000.0,800000.0,700000.0
76,77,10000.0,200000,10000,600000.0,10000.0,500000.0,10000,400000,10000,...,90000,10000,120000,0.0,0.0,0.0,0.0,0.0,0.0,90000.0
22,23,17000.0,2700000,17000,0.0,0.0,0.0,13000,1400000,27000,...,1100000,40000,2400000,20000.0,2000000.0,10000.0,1400000.0,10000.0,1500000.0,195000.0
55,56,10000.0,180000,40000,700000.0,30000.0,300000.0,10000,170000,100000,...,290000,13000,170000,100000.0,180000.0,10000.0,290000.0,10000.0,160000.0,373000.0


In [12]:
lr = LinearRegression()

lr.fit(X_train, Y_train)

lr.score(X_test,Y_test)

1.0

In [13]:
# created a model where we use Iterative Imputer to modify Nans
X = df.drop(['total revenue', 'game', 'region'], axis=1)
Y = df['total revenue']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

imputer = IterativeImputer(max_iter=2) 
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

lr_II = LinearRegression()

lr_II.fit(X_train_imputed, Y_train)

lr_II.score(X_test_imputed,Y_test)



0.9998026028085919

In [18]:
X = df.drop(['total revenue', 'game', 'region'], axis=1)
Y = df['total revenue']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

imputer = KNNImputer(n_neighbors=2)  
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

lr_KNN = LinearRegression()
lr_KNN.fit(X_train_imputed, Y_train)

lr_KNN.score(X_test_imputed, Y_test)

0.9993722825249544

In [15]:
nan_count = X_train_imputed.isna().sum().sum()
print("Number of NaN values in X_train_imputed:", nan_count)

AttributeError: 'numpy.ndarray' object has no attribute 'isna'