In [134]:
import numpy as np
import pandas as pd

In [135]:
from sklearn.model_selection import train_test_split

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score

from sklearn import preprocessing

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, f1_score

In [136]:
## Loading the data
data_df = pd.read_csv('C:/Users/aarya/Documents/Summer_2019/Google Playstore Project/Data/googleplaystore.csv')

In [137]:
data_df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [138]:
features_df = data_df.drop(columns = ['App','Genres','Last Updated','Android Ver','Current Ver'])

In [139]:
features_df.columns

Index(['Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price',
       'Content Rating'],
      dtype='object')

### Data Pre-processing

### Content rating (One hot encoding)

In [140]:
features_df["Content Rating"].fillna(method = 'ffill', inplace = True)

In [141]:
features_df = pd.get_dummies(features_df, columns=['Content Rating'])

In [142]:
features_df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,0,1,0,0,0,0
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,0,1,0,0,0,0
2,ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,0,1,0,0,0,0
3,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,0,0,0,0,1,0
4,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,0,1,0,0,0,0


### Type (converting into binary)

In [143]:
def type_cat(types):
    if types == 'Free':
        return 0
    else:
        return 1

features_df['Type'] = features_df['Type'].map(type_cat)
features_df.Type.fillna(method = 'ffill', inplace = True)

### Installs 



In [144]:
#features_df['Installs'] = [int(i[:-1].replace(',','')) for i in features_df['Installs']]

features_df.Installs=features_df.Installs.apply(lambda x: x.strip('+'))
features_df.Installs=features_df.Installs.apply(lambda x: x.replace(',',''))
features_df.Installs=features_df.Installs.replace('Free',np.nan)
#features_df.Installs.value_counts()
features_df.Installs.fillna(method = 'ffill', inplace = True)

### Size

In [145]:
def size(size):
    if 'M' in size:
        x = size[:-1]
        x = float(x)*1000000
        return(x)
    elif 'k' == size[-1:]:
        x = size[:-1]
        x = float(x)*1000
        return(x)
    else:
        return None

features_df["Size"] = features_df["Size"].map(size)

#filling Size which had NA
features_df.Size.fillna(method = 'ffill', inplace = True)

In [162]:
features_df.Size

0        19000000.0
1        14000000.0
2         8700000.0
3        25000000.0
4         2800000.0
5         5600000.0
6        19000000.0
7        29000000.0
8        33000000.0
9         3100000.0
10       28000000.0
11       12000000.0
12       20000000.0
13       21000000.0
14       37000000.0
15        2700000.0
16        5500000.0
17       17000000.0
18       39000000.0
19       31000000.0
20       14000000.0
21       12000000.0
22        4200000.0
23        7000000.0
24       23000000.0
25        6000000.0
26       25000000.0
27        6100000.0
28        4600000.0
29        4200000.0
            ...    
10811     3900000.0
10812    13000000.0
10813     2700000.0
10814    31000000.0
10815     4900000.0
10816     6800000.0
10817     8000000.0
10818     1500000.0
10819     3600000.0
10820     8600000.0
10821     2500000.0
10822     3100000.0
10823     2900000.0
10824    82000000.0
10825     7700000.0
10826     7700000.0
10827    13000000.0
10828    13000000.0
10829     7400000.0


### Price

In [146]:
features_df.Price=features_df.Price.apply(lambda x: x.strip('$'))

In [147]:
def price(types):
    if price == 'Everyone':
        return 0

features_df['Price'] = features_df['Price'].map(type_cat)

### Category (One hot coding)

In [148]:
features_df = pd.get_dummies(features_df, columns=['Category'])

In [149]:
rating_df = features_df['Rating']

In [151]:
features_df = features_df.drop(columns = ['Rating'])

In [152]:
features_df.columns

Index(['Reviews', 'Size', 'Installs', 'Type', 'Price',
       'Content Rating_Adults only 18+', 'Content Rating_Everyone',
       'Content Rating_Everyone 10+', 'Content Rating_Mature 17+',
       'Content Rating_Teen', 'Content Rating_Unrated', 'Category_1.9',
       'Category_ART_AND_DESIGN', 'Category_AUTO_AND_VEHICLES',
       'Category_BEAUTY', 'Category_BOOKS_AND_REFERENCE', 'Category_BUSINESS',
       'Category_COMICS', 'Category_COMMUNICATION', 'Category_DATING',
       'Category_EDUCATION', 'Category_ENTERTAINMENT', 'Category_EVENTS',
       'Category_FAMILY', 'Category_FINANCE', 'Category_FOOD_AND_DRINK',
       'Category_GAME', 'Category_HEALTH_AND_FITNESS',
       'Category_HOUSE_AND_HOME', 'Category_LIBRARIES_AND_DEMO',
       'Category_LIFESTYLE', 'Category_MAPS_AND_NAVIGATION',
       'Category_MEDICAL', 'Category_NEWS_AND_MAGAZINES', 'Category_PARENTING',
       'Category_PERSONALIZATION', 'Category_PHOTOGRAPHY',
       'Category_PRODUCTIVITY', 'Category_SHOPPING', 'Cat

## Train Test Split

In [153]:
from sklearn.model_selection import train_test_split

y = rating_df
X_train, X_test, y_train, y_test = train_test_split(features_df, y , test_size=0.30, random_state=42)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(7588, 45) (7588,)
(3253, 45) (3253,)


### Handling Missing Values

In [154]:
missing_value_indices = y_train[y_train.isnull()].index.values
len(missing_value_indices)

1017

In [155]:
y_train = y_train.fillna(y_train.mean())

In [156]:
missing_value_indices = y_train[y_train.isnull()].index.values
len(missing_value_indices)

0

In [157]:
features_df.columns[features_df.isna().any()].tolist()

[]

In [179]:
y_train

10367    4.195252
6889     4.100000
524      4.400000
862      3.100000
2094     4.500000
947      4.100000
4649     4.200000
1444     4.600000
1280     4.700000
10195    4.100000
122      4.500000
4989     4.600000
6615     3.800000
8866     4.000000
8963     3.500000
7948     4.500000
2302     4.600000
8161     4.195252
2004     4.600000
5786     3.800000
9849     4.300000
1005     4.000000
6322     4.200000
9598     4.100000
5740     4.200000
3055     4.400000
2885     4.400000
3393     4.800000
2132     4.300000
6480     4.195252
           ...   
9998     4.300000
9167     4.500000
2747     4.500000
2047     4.500000
7849     4.400000
2558     4.600000
9274     3.600000
8666     4.300000
6396     5.000000
3385     4.500000
4555     3.700000
1184     4.700000
6420     4.400000
5051     4.195252
5311     3.600000
2433     4.195252
6949     4.500000
10583    3.800000
769      4.200000
1685     4.400000
8322     4.195252
5578     4.500000
4426     4.400000
466      4.200000
6265     4

### Normalization

In [180]:
new = X_train["Price"].isin(["3.0M"]) 
  
# displaying data with gender = male only 
X_train[new] 

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,...,Category_PERSONALIZATION,Category_PHOTOGRAPHY,Category_PRODUCTIVITY,Category_SHOPPING,Category_SOCIAL,Category_SPORTS,Category_TOOLS,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_WEATHER


In [169]:
#min_max_scaler = preprocessing.MinMaxScaler()
#X_train_minmax = min_max_scaler.fit_transform(X_train)
#X_train_minmax

In [174]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('R2 score: %.7f' % r2_score(y_test, y_pred))

# Plot outputs
plt.scatter(y_test,y_pred)

plt.show()

ValueError: could not convert string to float: '3.0M'