# Pierce's Workspace

Use data science skills to investigate and answer a research question of interest

__Suggested approach__

Articulate a specific data science question

Identify, collect, manage, and wrangle data

Perform exploratory data analysis to identify trends and patterns

Measure the strength and magnitude of relationships (statistical approach)

Attempt to predict your outcome of interest (machine learning approach)


In [46]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline


from scipy.stats import ttest_ind
import statsmodels.formula.api as smf
from sklearn import metrics

In [6]:
play_features = pd.read_csv('data/prepped/features_final.csv')
play_reviews = pd.read_csv('data/prepped/reviews_final.csv')

# Load OLD prepped data

In [8]:
play_features.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [9]:
play_reviews.head(2)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462


## Predicting Category based on Title!

In [25]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(play_features.App)
X_train_counts.shape

(9360, 7604)

In [26]:
print(count_vect.vocabulary_.get(u'algorithm'))

None


In [28]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(9360, 7604)

In [29]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(9360, 7604)

In [32]:
clf = MultinomialNB().fit(X_train_tfidf, play_features.Category)

In [34]:
accuracy_score(clf.predict(X_train_counts), play_features.Category)

0.6476495726495727

## More Compact Assessment of Relationship between title and category

In [45]:
x1, x2, y1, y2 = train_test_split(play_features.App, play_features.Category, train_size=0.9, test_size=0.1)

tc_pipeline_1 = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())

tc_pipeline_1.fit(x1, y1)

accuracy_score(tc_pipeline_1.predict(x2), y2)

0.46153846153846156

# Load Prepped Data

In [51]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

## Linear Modeling, Yo

In [54]:
play_features_installs = play_features.copy()
play_features_installs['installs'] = play_outcomes['installs']
play_features_installs.columns

Index(['app', 'category', 'size', 'type', 'price', 'content_rating', 'genres',
       'last_updated', 'current_ver', 'android_ver', 'installs'],
      dtype='object')

In [53]:
play_features_installs.head(1)

Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_ver,installs
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,"10,000+"


In [None]:
installs_model_1 = smf.ols(formula='installs ~ app + category + size + type + price + content_rating + genres + \
                                last_updated + current_ver + android_ver', data=play_features_installs).fit()

In [None]:
installs_model_1.summary()