### DataAnalyses

## Libraries


In [370]:
# basics
import numpy as np
import pandas as pd 

# for plots
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# ML Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder

# Metrics
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [371]:
# Dataset
df = pd.read_csv("DataSet/googleplaystore.csv")
print(df.head())

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

In [372]:
df.describe()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [373]:
sns.set_style('darkgrid')

PALETTE = "Set3"
sns.set_palette(PALETTE)
sns.color_palette(PALETTE)

In [374]:
print(df.columns)

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


In [375]:
print(df['Category'].nunique())

34


In [376]:
from sklearn.preprocessing import LabelEncoder

# Suppose you encoded 'Category'
le = LabelEncoder()
df['Category_encoded'] = le.fit_transform(df['Category'])

# To see the mapping:
print(list(le.classes_))

['1.9', 'ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY', 'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FAMILY', 'FINANCE', 'FOOD_AND_DRINK', 'GAME', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME', 'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'MAPS_AND_NAVIGATION', 'MEDICAL', 'NEWS_AND_MAGAZINES', 'PARENTING', 'PERSONALIZATION', 'PHOTOGRAPHY', 'PRODUCTIVITY', 'SHOPPING', 'SOCIAL', 'SPORTS', 'TOOLS', 'TRAVEL_AND_LOCAL', 'VIDEO_PLAYERS', 'WEATHER']


### Data preparing/cleaning 

In [377]:
# Проверим сколько значений стали NaN при попытке конвертации
price_before = df['Price'].copy()

# Попробуем привести к числам, а некорректные сделаем NaN
price_converted = pd.to_numeric(price_before, errors='coerce')

# Посчитаем сколько значений стали NaN
num_invalid = price_converted.isna().sum()
total = len(price_before)

print(f"Некорректных значений: {num_invalid} из {total} ({num_invalid / total * 100:.2f}%)")


# Удаляем строки, где пропущены важные поля
df = df.dropna(subset=['Rating', 'Installs', 'Type', 'Content Rating'])

# Остальное заполняем
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce') #ревьюс в числа
df['Reviews'] = df['Reviews'].fillna(df['Reviews'].median())# там где пропущенно ставлю медиану

df['Size'] = df['Size'].replace('Varies with device', np.nan) #эти мне не нравятся так что я убираю их чтобы потом заменить
#дальше тут работа с ценой

df['Price'] = df['Price'].fillna('0') # там где пропуски будет ноль

#функция чтобы преобразовать миллионы тысячи которы в буквах
def parse_size(size):
    if isinstance(size, str):
        size = size.strip()
        if size.endswith('M'):
            return float(size[:-1])
        elif size.endswith('k'):
            return float(size[:-1]) / 1024
    return np.nan

df['Size'] = df['Size'].apply(parse_size)
df['Size'] = df['Size'].fillna(df['Size'].median())  # заполняем медианой

df['Price'] = df['Price'].str.replace('$', '', regex=True) #нафиг знак доллара
df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0.0)#некорректные значения будут 00 и еще преобразует в числа



# Быстрый обзор
df.info()
df.describe()
df.head()

# Проверим сколько значений стали NaN при попытке конвертации
price_before = df['Price'].copy()

# Попробуем привести к числам, а некорректные сделаем NaN
price_converted = pd.to_numeric(price_before, errors='coerce')

#  сколько значений стали NaN
num_invalid = price_converted.isna().sum()
total = len(price_before)

print(f"Некорректных значений: {num_invalid} из {total} ({num_invalid / total * 100:.2f}%)")



Некорректных значений: 801 из 10841 (7.39%)
<class 'pandas.core.frame.DataFrame'>
Index: 9366 entries, 0 to 10840
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   App               9366 non-null   object 
 1   Category          9366 non-null   object 
 2   Rating            9366 non-null   float64
 3   Reviews           9366 non-null   int64  
 4   Size              9366 non-null   float64
 5   Installs          9366 non-null   object 
 6   Type              9366 non-null   object 
 7   Price             9366 non-null   float64
 8   Content Rating    9366 non-null   object 
 9   Genres            9366 non-null   object 
 10  Last Updated      9366 non-null   object 
 11  Current Ver       9362 non-null   object 
 12  Android Ver       9364 non-null   object 
 13  Category_encoded  9366 non-null   int64  
dtypes: float64(3), int64(2), object(9)
memory usage: 1.1+ MB
Некорректных значений: 0 из 9366 (0.00%

In [378]:
categoricals = ['Category', 'Type', 'Content Rating', 'Android Ver']
le = LabelEncoder()

for col in categoricals:
    df[col] = le.fit_transform(df[col].astype(str))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9366 entries, 0 to 10840
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   App               9366 non-null   object 
 1   Category          9366 non-null   int64  
 2   Rating            9366 non-null   float64
 3   Reviews           9366 non-null   int64  
 4   Size              9366 non-null   float64
 5   Installs          9366 non-null   object 
 6   Type              9366 non-null   int64  
 7   Price             9366 non-null   float64
 8   Content Rating    9366 non-null   int64  
 9   Genres            9366 non-null   object 
 10  Last Updated      9366 non-null   object 
 11  Current Ver       9362 non-null   object 
 12  Android Ver       9366 non-null   int64  
 13  Category_encoded  9366 non-null   int64  
dtypes: float64(3), int64(6), object(5)
memory usage: 1.1+ MB


In [379]:
df['Genres'] = df['Genres'].astype(str).apply(lambda x: x.split(';')[0])
df['Genres'] = le.fit_transform(df['Genres'])
print(df['Genres'])
df['Last Updated'] = pd.to_datetime(df['Last Updated'])
latest_date = df['Last Updated'].max()
df['Days Since Update'] = (latest_date - df['Last Updated']).dt.days
df.info()

0         3
1         3
2         3
3         3
4         3
         ..
10834    15
10836    15
10837    15
10839     7
10840    24
Name: Genres, Length: 9366, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 9366 entries, 0 to 10840
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   App                9366 non-null   object        
 1   Category           9366 non-null   int64         
 2   Rating             9366 non-null   float64       
 3   Reviews            9366 non-null   int64         
 4   Size               9366 non-null   float64       
 5   Installs           9366 non-null   object        
 6   Type               9366 non-null   int64         
 7   Price              9366 non-null   float64       
 8   Content Rating     9366 non-null   int64         
 9   Genres             9366 non-null   int64         
 10  Last Updated       9366 non-null   datetime64[ns]
 11  Curr

In [380]:
# Step 1: Remove '+' and ',' characters
df['Installs'] = df['Installs'].str.replace('[+,]', '', regex=True)

# Step 2: Convert to numeric (float), coercing errors to NaN
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Step 3 (Optional but recommended): Drop or handle rows where 'Installs' couldn't be converted
df = df.dropna(subset=['Installs'])

# Step 4 (Optional): Cast to int if you're sure no decimals needed
df['Installs'] = df['Installs'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9366 entries, 0 to 10840
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   App                9366 non-null   object        
 1   Category           9366 non-null   int64         
 2   Rating             9366 non-null   float64       
 3   Reviews            9366 non-null   int64         
 4   Size               9366 non-null   float64       
 5   Installs           9366 non-null   int64         
 6   Type               9366 non-null   int64         
 7   Price              9366 non-null   float64       
 8   Content Rating     9366 non-null   int64         
 9   Genres             9366 non-null   int64         
 10  Last Updated       9366 non-null   datetime64[ns]
 11  Current Ver        9362 non-null   object        
 12  Android Ver        9366 non-null   int64         
 13  Category_encoded   9366 non-null   int64         
 14  Days Since U

In [381]:
print(df['Category'].nunique())

33


In [382]:
from sklearn.preprocessing import LabelEncoder

# Suppose you encoded 'Category'
le = LabelEncoder()
df['Category_encoded'] = le.fit_transform(df['Category'])

# To see the mapping:
print(list(le.classes_))

[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32)]


In [383]:


# столбец успешности
def get_success_level(installs):
    if installs <= 10000:
        return 'Not Successful'
    elif installs <= 100000:
        return 'Average'
    elif installs <= 1000000:
        return 'Above Average'
    else:
        return 'Successful'

df['Success_Level'] = df['Installs'].apply(get_success_level)

# Кодируем метки
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Success_Level_Encoded'] = le.fit_transform(df['Success_Level'])
#ненужное
df = df.drop(columns=['App', 'Last Updated', 'Current Ver', 'Days Since Update','Genres'])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9366 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Category               9366 non-null   int64  
 1   Rating                 9366 non-null   float64
 2   Reviews                9366 non-null   int64  
 3   Size                   9366 non-null   float64
 4   Installs               9366 non-null   int64  
 5   Type                   9366 non-null   int64  
 6   Price                  9366 non-null   float64
 7   Content Rating         9366 non-null   int64  
 8   Android Ver            9366 non-null   int64  
 9   Category_encoded       9366 non-null   int64  
 10  Success_Level          9366 non-null   object 
 11  Success_Level_Encoded  9366 non-null   int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 951.2+ KB


In [384]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Android Ver,Category_encoded,Success_Level,Success_Level_Encoded
0,0,4.1,159,19.0,10000,0,0.0,1,14,0,Not Successful,2
1,0,3.9,967,14.0,500000,0,0.0,1,14,0,Above Average,0
2,0,4.7,87510,8.7,5000000,0,0.0,1,14,0,Successful,3
3,0,4.5,215644,25.0,50000000,0,0.0,4,17,0,Successful,3
4,0,4.3,967,2.8,100000,0,0.0,1,19,0,Average,1


In [385]:
print(df["Android Ver"].value_counts)

<bound method IndexOpsMixin.value_counts of 0        14
1        14
2        14
3        17
4        19
         ..
10834    16
10836    16
10837    16
10839    30
10840    30
Name: Android Ver, Length: 9366, dtype: int64>
