# CapstoneOne Preprocessing

## Problem Summary:
With millions of apps available today, mobile app analytics is a great way to understand existing strategies for driving growth and retention of future users. This data set contains details of more than 7000 Apple iOS mobile applications, e.g. size, price, genre, rating_count, description, etc. The data was extracted from the iTunes Search API at the Apple Inc. website. The goal is to predict whether the overall rating for an app is more than 4 stars (1 = yes, 0 = no), which we consider to be a very good app.

### Required Steps

* Create dummy or indicator features for categorical variables
* Standardize the magnitude of numeric features using a scaler
* Split your data into testing and training datasets

Load Packages

In [1]:
#load python packages
import os
import pandas as pd
import datetime 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
%matplotlib inline

# Capture the kernel's working directory at start-up and print it, so the
# location the notebook was launched from is visible in the output.
currentdirectory = os.getcwd()
print(currentdirectory)

/Users/oluwafemibabatunde


In [2]:
# Directory that holds the project's CSV inputs and outputs.
# The optional CAPSTONE_DATA_DIR environment variable overrides the original
# author's machine-specific location, so the notebook can run elsewhere without
# editing code; when it is unset the path is exactly the one used originally.
# abspath() keeps `path` usable after the chdir below even if a relative
# override was supplied.
path = os.path.abspath(
    os.environ.get(
        'CAPSTONE_DATA_DIR',
        '/Users/oluwafemibabatunde/Desktop/Springboard/capstone_one/apple-app/data',
    )
)

# Switch the working directory so the relative file name below resolves in `path`.
os.chdir(path)

# Loaded csv file with dropped price outlier from folder; index_col=0 uses the
# file's first column as the DataFrame index.
df = pd.read_csv('step2_output2.csv', index_col=0)

In [3]:
# Peek at the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,id,size_bytes,price,rating_count_tot,rating_count_ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,...,请联系我们,購読,贴心,通勤,遊んだことがあるかもしれないような無料ゲームや売り切りアプリが盛りだくさん,酒店,重要,随时随地,音乐,音量調整
5192,1105304995,14638080,0.0,0,0,4,Games,38,5,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5193,1070689426,129368064,0.99,166,154,9,Games,40,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5194,1052726188,1003165696,0.99,93,2,4,Games,40,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5195,404095058,9458688,1.99,28672,11,4,Games,37,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5196,984594553,135886848,0.99,15,0,4,Games,38,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Change prime_genre column to dummy variables

In [4]:
# One-hot encode the primary genre into `genre_*` indicator columns ...
genre_dummies = pd.get_dummies(df['prime_genre'], prefix='genre')
# ... and append them to the right of the existing columns.
df = pd.concat([df, genre_dummies], axis=1)
df.head()

Unnamed: 0,id,size_bytes,price,rating_count_tot,rating_count_ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,...,genre_News,genre_Photo & Video,genre_Productivity,genre_Reference,genre_Shopping,genre_Social Networking,genre_Sports,genre_Travel,genre_Utilities,genre_Weather
1,965748314,38805504,0.0,199,199,9,Games,43,3,16,...,0,0,0,0,0,0,0,0,0,0
2,307764057,25808896,0.0,182,0,17,Entertainment,37,5,3,...,0,0,0,0,0,0,0,0,0,0
3,1005783927,296790016,0.0,4104,143,4,Games,37,5,1,...,0,0,0,0,0,0,0,0,0,0
4,350642635,105379840,0.99,426463,680,9,Games,38,0,5,...,0,0,0,0,0,0,0,0,0,0
5,1025628019,113420288,0.0,14,1,12,Finance,37,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Remove the text genre column from the frame.
df = df.drop(columns=['prime_genre'])
df.head()

Unnamed: 0,id,size_bytes,price,rating_count_tot,rating_count_ver,cont_rating,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,...,genre_News,genre_Photo & Video,genre_Productivity,genre_Reference,genre_Shopping,genre_Social Networking,genre_Sports,genre_Travel,genre_Utilities,genre_Weather
1,965748314,38805504,0.0,199,199,9,43,3,16,0,...,0,0,0,0,0,0,0,0,0,0
2,307764057,25808896,0.0,182,0,17,37,5,3,1,...,0,0,0,0,0,0,0,0,0,0
3,1005783927,296790016,0.0,4104,143,4,37,5,1,1,...,0,0,0,0,0,0,0,0,0,0
4,350642635,105379840,0.99,426463,680,9,38,0,5,1,...,0,0,0,0,0,0,0,0,0,0
5,1025628019,113420288,0.0,14,1,12,37,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Show the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(4605, 4918)

In [7]:
# Discard the current index labels and renumber the rows from 0.
df = df.reset_index(drop=True)

Standardizing magnitude of numerical features using scaler

In [8]:
from sklearn.preprocessing import StandardScaler

SS_scaler = StandardScaler()

# Raw numeric columns to standardize, paired positionally with the names their
# standardized versions receive.
col_names = ['size_bytes', 'rating_count_tot', 'rating_count_ver', 'sup_devices.num']
scaledCols = ['scaledSizeBytes', 'scaledRatingCountTot', 'scaledRatingCountVer', 'scaledSupDevicesNum']

# fit_transform returns a plain array without labels; keep it under its own name
# rather than reusing `dfScale` for two different kinds of object.
scaled_values = SS_scaler.fit_transform(df[col_names])

# Re-attach df's index explicitly so a later column-wise concat with `df` lines
# up row for row, instead of relying on both frames happening to share a
# default 0..n-1 index.
dfScale = pd.DataFrame(scaled_values, columns=scaledCols, index=df.index)

# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in this notebook, so test rows contribute
# to the fitted mean/variance. Consider fitting on the training rows only.
dfScale.head()



Unnamed: 0,scaledSizeBytes,scaledRatingCountTot,scaledRatingCountVer,scaledSupDevicesNum
0,-0.497771,-0.186051,-0.068901,1.68806
1,-0.553536,-0.186278,-0.115406,-0.213563
2,0.609156,-0.133745,-0.081988,-0.213563
3,-0.212123,5.523515,0.043507,0.103374
4,-0.177624,-0.188529,-0.115173,-0.213563


In [9]:
# Renumber the scaled frame's rows from 0, dropping the previous index labels.
dfScale = dfScale.reset_index(drop=True)

In [10]:
# Select the scaled columns in their declared order ...
scaled_features = dfScale.reindex(columns=scaledCols)
# ... and place them to the right of the existing columns, aligned on the index.
dfScaled = pd.concat([df, scaled_features], axis=1)
dfScaled.head()

Unnamed: 0,id,size_bytes,price,rating_count_tot,rating_count_ver,cont_rating,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic,...,genre_Shopping,genre_Social Networking,genre_Sports,genre_Travel,genre_Utilities,genre_Weather,scaledSizeBytes,scaledRatingCountTot,scaledRatingCountVer,scaledSupDevicesNum
0,965748314,38805504,0.0,199,199,9,43,3,16,0,...,0,0,0,0,0,0,-0.497771,-0.186051,-0.068901,1.68806
1,307764057,25808896,0.0,182,0,17,37,5,3,1,...,0,0,0,0,0,0,-0.553536,-0.186278,-0.115406,-0.213563
2,1005783927,296790016,0.0,4104,143,4,37,5,1,1,...,0,0,0,0,0,0,0.609156,-0.133745,-0.081988,-0.213563
3,350642635,105379840,0.99,426463,680,9,38,0,5,1,...,0,0,0,0,0,0,-0.212123,5.523515,0.043507,0.103374
4,1025628019,113420288,0.0,14,1,12,37,0,1,1,...,0,0,0,0,0,0,-0.177624,-0.188529,-0.115173,-0.213563


In [11]:
# Remove the four unscaled numeric columns from the combined frame.
raw_numeric_cols = ['size_bytes', 'rating_count_tot', 'rating_count_ver', 'sup_devices.num']
dfScaled = dfScaled.drop(columns=raw_numeric_cols)
dfScaled.head()

Unnamed: 0,id,price,cont_rating,ipadSc_urls.num,lang.num,vpp_lic,rating,100,16,17,...,genre_Shopping,genre_Social Networking,genre_Sports,genre_Travel,genre_Utilities,genre_Weather,scaledSizeBytes,scaledRatingCountTot,scaledRatingCountVer,scaledSupDevicesNum
0,965748314,0.0,9,3,16,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,-0.497771,-0.186051,-0.068901,1.68806
1,307764057,0.0,17,5,3,1,0,0.0,0.0,0.0,...,0,0,0,0,0,0,-0.553536,-0.186278,-0.115406,-0.213563
2,1005783927,0.0,4,5,1,1,1,0.0,0.0,0.0,...,0,0,0,0,0,0,0.609156,-0.133745,-0.081988,-0.213563
3,350642635,0.99,9,0,5,1,1,0.0,0.0,0.0,...,0,0,0,0,0,0,-0.212123,5.523515,0.043507,0.103374
4,1025628019,0.0,12,0,1,1,0,0.0,0.0,0.0,...,0,0,0,0,0,0,-0.177624,-0.188529,-0.115173,-0.213563


Splitting dataframe into dependent and independent variables

In [12]:
# Dependent variable: the `rating` column.
y = dfScaled['rating']

# Independent variables: every column except `id` and the target itself.
X = dfScaled.drop(columns=['id', 'rating'])

Splitting the dependent and independent variables into train and test data sets

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state makes the
# shuffle repeatable. No `stratify` argument is passed.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# Write the training features into the data directory. The location is built
# from `path` (the data directory set when the input CSV was loaded) instead of
# repeating the absolute path as a literal.
X_train.to_csv(os.path.join(path, 'X_train.csv'))

In [15]:
# Write the test features into the data directory. The location is built from
# `path` (the data directory set when the input CSV was loaded) instead of
# repeating the absolute path as a literal.
X_test.to_csv(os.path.join(path, 'X_test.csv'))

In [16]:
# Wrap each target split in a DataFrame.
y_train, y_test = (pd.DataFrame(target) for target in (y_train, y_test))

In [17]:
# Write the training target into the data directory. The location is built from
# `path` (the data directory set when the input CSV was loaded) instead of
# repeating the absolute path as a literal.
y_train.to_csv(os.path.join(path, 'y_train.csv'))

In [18]:
# Write the test target into the data directory. The location is built from
# `path` (the data directory set when the input CSV was loaded) instead of
# repeating the absolute path as a literal.
y_test.to_csv(os.path.join(path, 'y_test.csv'))