In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
%matplotlib inline 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')


In [2]:
download_df_new = pd.read_csv('D:\\Data Science Roadmap and Practice\\Machine Learning Projects\\Playstore Apps Download Prediciton\\notebook\\Data\\downloads_new.csv')

In [3]:
download_df_new

Unnamed: 0,playstore app ID,Category,Rating,Reviews,Price,Content_Rating,OS_Version_Required,Downloads
0,24654,Finance,4.18,1481,0.0000,Everyone,Varies with device,100000
1,35329,Music And Audio,4.81,302,0.0000,Everyone,4.1 and up,5000
2,11044,Game Casual,4.27,374,0.0000,Everyone,4.1 and up,10000
3,36068,Business,4.03,122058,0.0000,Teen,Varies with device,10000000
4,35831,Medical,4.60,358,297.5742,Everyone,Varies with device,5000
...,...,...,...,...,...,...,...,...
16351,5583,Health And Fitness,4.30,13724,0.0000,Everyone,Varies with device,1000000
16352,15485,Beauty,4.73,70,0.0000,Everyone,4.1 and up,10000
16353,36065,Health And Fitness,4.60,5420,0.0000,Everyone,4.1 and up,500000
16354,12625,Productivity,4.60,1488289,0.0000,Everyone,Varies with device,100000000


Creating dependent and independent features

In [4]:
X = download_df_new.loc[:,['playstore app ID','Category','Rating','Reviews','Size','Price','Content_Rating','Last_Updated_On','Release_Version','OS_Version_Required']]
y = download_df_new.loc[:,['Downloads']]

KeyError: "['Size', 'Last_Updated_On', 'Release_Version'] not in index"

In [None]:
X

Unnamed: 0,playstore app ID,Category,Rating,Reviews,Size,Price,Content_Rating,Last_Updated_On,Release_Version,OS_Version_Required
0,24654,Finance,4.18,1481,Varies with device,0.0000,Everyone,May 05 2020,Varies with device,Varies with device
1,35329,Music And Audio,4.81,302,10M,0.0000,Everyone,Mar 26 2020,3.9.18,4.1 and up
2,11044,Game Casual,4.27,374,27M,0.0000,Everyone,May 01 2020,1.10.1,4.1 and up
3,36068,Business,4.03,122058,Varies with device,0.0000,Teen,May 02 2020,Varies with device,Varies with device
4,35831,Medical,4.60,358,Varies with device,297.5742,Everyone,Nov 29 2018,Varies with device,Varies with device
...,...,...,...,...,...,...,...,...,...,...
16351,5583,Health And Fitness,4.30,13724,Varies with device,0.0000,Everyone,Sep 21 2018,Varies with device,Varies with device
16352,15485,Beauty,4.73,70,7.9M,0.0000,Everyone,May 07 2020,0.2.5,4.1 and up
16353,36065,Health And Fitness,4.60,5420,21M,0.0000,Everyone,Jul 10 2019,2.48.3,4.1 and up
16354,12625,Productivity,4.60,1488289,Varies with device,0.0000,Everyone,Sep 07 2019,Varies with device,Varies with device


In [None]:
y

Unnamed: 0,Downloads
0,100000
1,5000
2,10000
3,10000000
4,5000
...,...
16351,1000000
16352,10000
16353,500000
16354,100000000


Creating column transformer 

In [None]:
numerical_feature = X.select_dtypes(exclude='object').columns
categorical_feature = X.select_dtypes(include='object').columns

numerical_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder',oh_transformer,categorical_feature),
        ('StandardScaler',numerical_transformer,numerical_feature)
    ]
)

In [None]:
X = preprocessor.fit_transform(X)

In [None]:
X.shape

(16356, 6282)

Train test split 

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train

<13084x6282 sparse matrix of type '<class 'numpy.float64'>'
	with 130840 stored elements in Compressed Sparse Row format>

In [None]:
y_train

Unnamed: 0,Downloads
12002,10000
10610,10000
8732,1000000
3322,50000
2913,1000000
...,...
13418,1000000
5390,10000000
860,500
15795,10000


In [None]:
def evaluate_model(true, predict):
    mae = mean_absolute_error(true, predict)
    mse = mean_squared_error(true, predict)
    rmse = np.sqrt(mean_squared_error(true, predict))
    r2_result = r2_score(true, predict)
    return mae, rmse, r2_result

In [None]:
models = {
    'Linear Regression' : LinearRegression(),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'ElasticNet' : ElasticNet(),
    'SVR' : SVR(),
    'DecisionTreeRegression' : DecisionTreeRegressor(),
    'RandomForestRegression' : RandomForestRegressor(),
    'AdaboostRegression' : AdaBoostRegressor(),
    'GradientBoostRegression' : GradientBoostingRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train) ## Training the models
    ## making the prediction
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)

    ## Evaluating the models performance
    model_train_mae, model_train_rmse, model_train_r2_result = evaluate_model(y_train,y_train_predict)
    model_test_mae, model_test_rmse, model_test_r2_result = evaluate_model(y_test,y_test_predict)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for test set')
    print('- Root Mean Squared error: {:.4f}'.format(model_test_rmse))
    print('- Mean Absolute error: {:.4f}'.format(model_test_mae))
    print('- R2 Score: {:.4f}'.format(model_test_r2_result))

    print('Model performance for train set')
    print('- Root Mean Squared error: {:.4f}'.format(model_train_rmse))
    print('- Mean Absolute error: {:.4f}'.format(model_train_mae))
    print('- R2 Score: {:.4f}'.format(model_train_r2_result))

    print('='*35)
    print('\n')





Linear Regression
Model performance for test set
- Root Mean Squared error: 9422657.2528
- Mean Absolute error: 4235187.3112
- R2 Score: 0.4749
Model performance for train set
- Root Mean Squared error: 6882944.6378
- Mean Absolute error: 2555153.5293
- R2 Score: 0.7603


Ridge
Model performance for test set
- Root Mean Squared error: 8933838.3230
- Mean Absolute error: 4052678.0278
- R2 Score: 0.5280
Model performance for train set
- Root Mean Squared error: 7710373.4377
- Mean Absolute error: 3197865.4671
- R2 Score: 0.6992


Lasso
Model performance for test set
- Root Mean Squared error: 9473535.3901
- Mean Absolute error: 4196732.4026
- R2 Score: 0.4692
Model performance for train set
- Root Mean Squared error: 6882949.5458
- Mean Absolute error: 2556178.1402
- R2 Score: 0.7603


ElasticNet
Model performance for test set
- Root Mean Squared error: 9765387.9171
- Mean Absolute error: 3881417.5754
- R2 Score: 0.4360
Model performance for train set
- Root Mean Squared error: 10933222.

As we can see above the scores for the different models for the train and test data the score is little off and somewhere we culd see the case of overfitting as well now we are getting this type of result the reason behing it is we have not used any hyper parameter tuning once we use the hyper parameter tuning this scores would get better and we will get the best scoring model for use. The hyper parameter tuning process will be taken place in the component section of main project under data transformation.py file.