## Mobile App Successfullness prediction

### Usage of different tehniques for the prediction as : 
- #### linear regression
- #### Decision trees/ Random Forest 
- #### KNN


## Step one Data preparation : Clearning , One-hot conding , Normalization

### Libraries

In [104]:
# basics
import numpy as np
import pandas as pd 

# for plots
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder

# ML Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
# Metrics
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [105]:
df = pd.read_csv("DataSet/googleplaystore.csv")
print(df.head())
df.columns

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [106]:
df.drop_duplicates(inplace=True)


In [107]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

### Data cleaning 

In [108]:
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')


In [109]:
def convert_size(size):
    if size.endswith('M'):
        return float(size[:-1]) * 1_000_000
    elif size.endswith('k'):
        return float(size[:-1]) * 1_000
    elif size == 'Varies with device':
        return np.nan
    return np.nan

df['Size'] = df['Size'].astype(str).apply(convert_size)
df['Size'].fillna(df['Size'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size'].fillna(df['Size'].median(), inplace=True)


In [110]:
# Step 1: Remove '+' and ',' characters
df['Installs'] = df['Installs'].str.replace('[+,]', '', regex=True)

# Step 2: Convert to numeric (float), coercing errors to NaN
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Step 3 (Optional but recommended): Drop or handle rows where 'Installs' couldn't be converted
df = df.dropna(subset=['Installs'])

# Step 4 (Optional): Cast to int if you're sure no decimals needed
df['Installs'] = df['Installs'].astype(int)

In [111]:
# Step 1: Replace 'Free' with '$0' to make it consistent (optional)
df['Price'] = df['Price'].replace('Free', '$0')

# Step 2: Remove the dollar sign
df['Price'] = df['Price'].str.replace('$', '', regex=True)

# Step 3: Convert to float (use errors='coerce' to handle bad values gracefully)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Step 4: Drop rows where price couldn't be converted (optional)
df = df.dropna(subset=['Price'])

# Step 5: Reset index (optional cleanup)
df.reset_index(drop=True, inplace=True)


In [112]:
df = df[~df['Rating'].isnull()]


In [113]:


categoricals = ['Category', 'Type', 'Content Rating', 'Android Ver']
le = LabelEncoder()

for col in categoricals:
    df[col] = le.fit_transform(df[col].astype(str))


In [114]:
print(df[categoricals].head(100))


     Category  Type  Content Rating  Android Ver
0           0     0               1           14
1           0     0               1           14
2           0     0               1           14
3           0     0               4           17
4           0     0               1           19
..        ...   ...             ...          ...
96          1     0               1           14
97          1     0               1           12
98          2     0               1           23
99          2     0               1           16
100         2     0               1           16

[100 rows x 4 columns]


In [115]:
df['Genres'] = df['Genres'].astype(str).apply(lambda x: x.split(';')[0])
df['Genres'] = le.fit_transform(df['Genres'])
print(df['Genres'])

0        3
1        3
2        3
3        3
4        3
        ..
9585    15
9587    15
9588    15
9590     7
9591    24
Name: Genres, Length: 8279, dtype: int64


In [116]:
df['Last Updated'] = pd.to_datetime(df['Last Updated'])
latest_date = df['Last Updated'].max()
df['Days Since Update'] = (latest_date - df['Last Updated']).dt.days


In [117]:
df.drop(['App', 'Last Updated', 'Current Ver'], axis=1, inplace=True)
df.info()
print(df["Genres"])
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 8279 entries, 0 to 9591
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Category           8279 non-null   int64  
 1   Rating             8279 non-null   float64
 2   Reviews            8279 non-null   float64
 3   Size               8279 non-null   float64
 4   Installs           8279 non-null   int64  
 5   Type               8279 non-null   int64  
 6   Price              8279 non-null   float64
 7   Content Rating     8279 non-null   int64  
 8   Genres             8279 non-null   int64  
 9   Android Ver        8279 non-null   int64  
 10  Days Since Update  8279 non-null   int64  
dtypes: float64(4), int64(7)
memory usage: 776.2 KB
0        3
1        3
2        3
3        3
4        3
        ..
9585    15
9587    15
9588    15
9590     7
9591    24
Name: Genres, Length: 8279, dtype: int64


Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver,Days Since Update
0,0,4.1,159.0,19000000.0,10000,0,0.0,1,3,14,213
1,0,3.9,967.0,14000000.0,500000,0,0.0,1,3,14,205
2,0,4.7,87510.0,8700000.0,5000000,0,0.0,1,3,14,7
3,0,4.5,215644.0,25000000.0,50000000,0,0.0,4,3,17,61
4,0,4.3,967.0,2800000.0,100000,0,0.0,1,3,19,49


### Data scaling 

In [118]:
from sklearn.preprocessing import StandardScaler

num_cols = ['Reviews', 'Size', 'Installs', 'Price', 'Days Since Update']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


In [119]:
## Check
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver,Days Since Update
0,0,4.1,-0.16842,-0.103256,-0.197917,0,0.0,1,3,14,-0.071953
1,0,3.9,-0.168151,-0.333489,-0.192435,0,0.0,1,3,14,-0.092964
2,0,4.7,-0.139375,-0.577537,-0.142095,0,0.0,1,3,14,-0.612992
3,0,4.5,-0.096771,0.173024,0.361304,0,0.0,4,3,17,-0.471166
4,0,4.3,-0.168151,-0.849212,-0.19691,0,0.0,1,3,19,-0.502683


In [120]:

# for dummy variable encoding for Categories
df2 = pd.get_dummies(df, columns=['Category'])
df2.head()

Unnamed: 0,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver,Days Since Update,...,Category_23,Category_24,Category_25,Category_26,Category_27,Category_28,Category_29,Category_30,Category_31,Category_32
0,4.1,-0.16842,-0.103256,-0.197917,0,0.0,1,3,14,-0.071953,...,False,False,False,False,False,False,False,False,False,False
1,3.9,-0.168151,-0.333489,-0.192435,0,0.0,1,3,14,-0.092964,...,False,False,False,False,False,False,False,False,False,False
2,4.7,-0.139375,-0.577537,-0.142095,0,0.0,1,3,14,-0.612992,...,False,False,False,False,False,False,False,False,False,False
3,4.5,-0.096771,0.173024,0.361304,0,0.0,4,3,17,-0.471166,...,False,False,False,False,False,False,False,False,False,False
4,4.3,-0.168151,-0.849212,-0.19691,0,0.0,1,3,19,-0.502683,...,False,False,False,False,False,False,False,False,False,False


In [121]:
def classify_installs(x):
    if x <= 10000:
        return 'Not Successful'
    elif x <= 100000:
        return 'Average'
    elif x <= 1000000:
        return 'Above Average'
    else:
        return 'Successful'


In [122]:
df['SuccessLevel'] = df['Installs'].apply(classify_installs)

In [123]:
le = LabelEncoder()
df['SuccessLevel'] = le.fit_transform(df['SuccessLevel'])

In [124]:
X = df.drop(labels = ['Installs' ,'SuccessLevel'], axis=1)  # Dropping the target and unnecessary columns
y = df['SuccessLevel']  # Target variable is now SuccessLevel (encoded)

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [125]:
model = LinearRegression()
model.fit(X_train, y_train)

# Step 6: Make predictions
Results = model.predict(X_test)

# Since the predictions will be continuous, we'll round them to the nearest integer
Results = Results.round().astype(int)


In [126]:
def Evaluationmatrix_dict(y_true, y_predict, name='Linear - Integer'):
    dict_matrix = {}
    dict_matrix['Series Name'] = name
    dict_matrix['Mean Squared Error'] = metrics.mean_squared_error(y_true, y_predict)
    dict_matrix['Mean Absolute Error'] = metrics.mean_absolute_error(y_true, y_predict)
    dict_matrix['Mean Squared Log Error'] = metrics.mean_squared_log_error(y_true, y_predict)
    return dict_matrix


In [127]:
# Step 7: Initialize resultsdf if not already initialized
resultsdf = pd.DataFrame()  # Initialize if not already done
# Use pd.concat() instead of append
resultsdf = pd.concat([resultsdf, pd.DataFrame([Evaluationmatrix_dict(y_test, Results, name='Linear - Integer')])], ignore_index=True)

# Step 8: Dummy Encoding for other categorical features (if needed)
# Apply dummy encoding for 'Category' column
df2 = pd.get_dummies(df, columns=['Category'], drop_first=True)

# Create new features X_d and target y_d for the dummy encoded dataframe
X_d = df2.drop(labels=[  'SuccessLevel'], axis=1)  # Drop SuccessLevel as it's the target
y_d = df2['SuccessLevel']  # Using encoded SuccessLevel as the target

# Split the data into training and testing sets for dummy encoded data
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30, random_state=42)

# Train the Linear Regression model with dummy encoded data
model_d = LinearRegression()
model_d.fit(X_train_d, y_train_d)

# Predict and evaluate on dummy encoded data
Results_d = model_d.predict(X_test_d)
Results_d = Results_d.round().astype(int)

# Append the evaluation results for dummy encoding
resultsdf = pd.concat([resultsdf, pd.DataFrame([Evaluationmatrix_dict(y_test_d, Results_d, name='Linear - Dummy')])], ignore_index=True)

# Print the results dataframe
print(resultsdf)

        Series Name  Mean Squared Error  Mean Absolute Error  \
0  Linear - Integer                 0.0                  0.0   
1    Linear - Dummy                 0.0                  0.0   

   Mean Squared Log Error  
0                     0.0  
1                     0.0  


In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Step 7: Initialize resultsdf if not already initialized
resultsdf = pd.DataFrame()  # Initialize if not already done

# Assuming df is already loaded and preprocessed

# Function to classify SuccessLevel based on Installs
def classify_installs(x):
    if x <= 10000:
        return 'Not Successful'
    elif x <= 100000:
        return 'Average'
    elif x <= 1000000:
        return 'Above Average'
    else:
        return 'Successful'

# Apply the classify_installs function to create SuccessLevel column
df['SuccessLevel'] = df['Installs'].apply(classify_installs)

# Encode the SuccessLevel column into numeric values for classification
le = LabelEncoder()
df['SuccessLevel'] = le.fit_transform(df['SuccessLevel'])

# Step 8: Dummy Encoding for other categorical features
df2 = pd.get_dummies(df, columns=['Category'], drop_first=True)

# Create new features X_d and target y_d for the dummy encoded dataframe
X_d = df2.drop(labels=['Rating', 'Genres', 'SuccessLevel'], axis=1)  # Drop SuccessLevel as it's the target
y_d = df2['SuccessLevel']  # Using encoded SuccessLevel as the target

# Split the data into training and testing sets for dummy encoded data
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30, random_state=42)

# Train the Logistic Regression model with dummy encoded data
model_d = LogisticRegression(max_iter=1000)
model_d.fit(X_train_d, y_train_d)

# Predict and evaluate on dummy encoded data
Results_d = model_d.predict(X_test_d)

# Evaluation using classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test_d, Results_d))

# Confusion Matrix
cm = confusion_matrix(y_test_d, Results_d)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
cm_display.plot(cmap='Blues')
plt.title('Confusion Matrix for Success Level Prediction')
plt.show()

# Plotting Actual vs Predicted
plt.figure(figsize=(8, 6))
plt.scatter(y_test_d, Results_d, color='blue', alpha=0.5)
plt.plot([min(y_test_d), max(y_test_d)], [min(y_test_d), max(y_test_d)], color='red', linestyle='--')  # Ideal line
plt.xlabel('Actual Success Level')
plt.ylabel('Predicted Success Level')
plt.title('Actual vs Predicted Success Level')
plt.show()

# Plot Error Distribution (Residuals)
residuals = y_test_d - Results_d
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='purple', bins=30)
plt.xlabel('Residuals')
plt.title('Residuals Distribution (Error Distribution)')
plt.show()

# Add results to resultsdf for evaluation summary
resultsdf = pd.concat([resultsdf, pd.DataFrame([Evaluationmatrix_dict(y_test_d, Results_d, name='Logistic - Dummy')])], ignore_index=True)

# Print the results dataframe
print(resultsdf)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(0)