# Pierce's Workspace

Use data science skills to investigate and answer a research question of interest

__Suggested approach__

Articulate a specific data science question

Identify, collect, manage, and wrangle data

Perform exploratory data analysis to identify trends and patterns

Measure the strength and magnitude of relationships (statistical approach)

Attempt to predict your outcome of interest (machine learning approach)


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline


from scipy.stats import ttest_ind
import statsmodels.formula.api as smf
from sklearn import metrics

# Load OLD prepped data

In [6]:
play_features = pd.read_csv('data/prepped/features_final.csv')
play_reviews = pd.read_csv('data/prepped/reviews_final.csv')

In [8]:
play_features.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [9]:
play_reviews.head(2)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462


## Predicting Category based on Title!

In [25]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(play_features.App)
X_train_counts.shape

(9360, 7604)

In [26]:
print(count_vect.vocabulary_.get(u'algorithm'))

None


In [28]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(9360, 7604)

In [29]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(9360, 7604)

In [32]:
clf = MultinomialNB().fit(X_train_tfidf, play_features.Category)

In [34]:
accuracy_score(clf.predict(X_train_counts), play_features.Category)

0.6476495726495727

## More Compact Assessment of Relationship between title and category

In [45]:
x1, x2, y1, y2 = train_test_split(play_features.App, play_features.Category, train_size=0.9, test_size=0.1)

tc_pipeline_1 = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())

tc_pipeline_1.fit(x1, y1)

accuracy_score(tc_pipeline_1.predict(x2), y2)

0.46153846153846156

# Load NEW Prepped Data

In [2]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

## Linear Modeling, Yo

In [3]:
play_features_installs = play_features.copy()
play_features_installs['installs'] = play_outcomes['installs']
play_features_installs.columns

Index(['app', 'category', 'size', 'type', 'price', 'content_rating', 'genres',
       'last_updated', 'current_ver', 'android_min_ver', 'installs'],
      dtype='object')

In [28]:
play_features_installs['installs'] = [s.replace("+", "") for s in play_features_installs['installs']]
play_features_installs['installs'] = [s.replace(",", "") for s in play_features_installs['installs']]
play_features_installs['installs'] = [float(s) for s in play_features_installs['installs']]

play_features_installs.head(1)

AttributeError: 'int' object has no attribute 'replace'

In [29]:
play_features_installs.head()

Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_min_ver,installs
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4_0_3,10000
1,Coloring book moana,ART_AND_DESIGN,14M,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4_0_3,500000
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,8.7M,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4_0_3,5000000
3,Sketch - Draw & Paint,ART_AND_DESIGN,25M,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4_2,50000000
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,2.8M,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4_4,100000


In [30]:
temp_data = play_features_installs

installs_model_1 = smf.ols(formula='installs ~ type + category + content_rating + genres + price - 1', data=temp_data).fit()

temp_data.head()
installs_model_1.summary()

0,1,2,3
Dep. Variable:,installs,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,3.459
Date:,"Sun, 10 Mar 2019",Prob (F-statistic):,5.73e-52
Time:,16:39:54,Log-Likelihood:,-184510.0
No. Observations:,9360,AIC:,369400.0
Df Residuals:,9162,BIC:,370800.0
Df Model:,197,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
type[Free],-4.918e+18,2.4e+20,-0.020,0.984,-4.76e+20,4.66e+20
type[Paid],-5.402e+07,8.41e+07,-0.642,0.521,-2.19e+08,1.11e+08
category[T.AUTO_AND_VEHICLES],-1.973e+18,3.56e+19,-0.055,0.956,-7.17e+19,6.78e+19
category[T.BEAUTY],3.171e+18,5.72e+19,0.055,0.956,-1.09e+20,1.15e+20
category[T.BOOKS_AND_REFERENCE],-1.94e+18,3.5e+19,-0.055,0.956,-7.05e+19,6.66e+19
category[T.BUSINESS],-3.2e+18,5.77e+19,-0.055,0.956,-1.16e+20,1.1e+20
category[T.COMICS],-2.202e+18,3.97e+19,-0.055,0.956,-8e+19,7.56e+19
category[T.COMMUNICATION],4.155e+18,7.49e+19,0.055,0.956,-1.43e+20,1.51e+20
category[T.DATING],1.828e+19,3.3e+20,0.055,0.956,-6.28e+20,6.64e+20

0,1,2,3
Omnibus:,13091.658,Durbin-Watson:,1.726
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2717580.19
Skew:,8.423,Prob(JB):,0.0
Kurtosis:,84.758,Cond. No.,8.42e+16
