In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

## Day 30 Lecture 2 Assignment

In this assignment, we will learn about random forests. We will use the google play store dataset loaded below.

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report

# !pip install category_encoders
from category_encoders import LeaveOneOutEncoder

<IPython.core.display.Javascript object>

In [3]:
reviews = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/googleplaystore.csv"
)

reviews.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


<IPython.core.display.Javascript object>

In this assignment, you will work more independently. Perform the following steps:
    
1. Select which columns are best suited to predict whether the rating is above 4.5
2. Process the data (including transforming to the correct column type, removing missing values, creating dummy variables, and removing irrelevant variables)
3. Create a random forest model and evaluate
4. Using grid search cross validation, tweak the parameters to produce a better performing model
5. Show and discuss your results

Good luck!

In [4]:
drop_cols = [
    "App",
    "Size",
    "Price",
    "Genres",
    "Last Updated",
    "Current Ver",
    "Android Ver",
]
reviews = reviews.drop(columns=drop_cols)

<IPython.core.display.Javascript object>

In [5]:
reviews["Installs"] = reviews["Installs"].str.replace("+", "")
reviews["Installs"] = reviews["Installs"].str.replace(",", "")
reviews["Installs"] = pd.to_numeric(reviews["Installs"], errors="coerce")

<IPython.core.display.Javascript object>

In [6]:
reviews["Reviews"] = pd.to_numeric(reviews["Reviews"], errors="coerce")

<IPython.core.display.Javascript object>

In [7]:
reviews = reviews.dropna()

<IPython.core.display.Javascript object>

In [8]:
reviews["abv_4.5"] = reviews["Rating"] > 4.5
reviews["abv_4.5"] = reviews["abv_4.5"].astype(int)

<IPython.core.display.Javascript object>

In [9]:
X = reviews.drop(columns = ['abv_4.5', 'Rating'])
y = reviews['abv_4.5']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


<IPython.core.display.Javascript object>

In [42]:
cat_cols = ["Category", "Type", "Content Rating"]
num_cols = ["Reviews"]

<IPython.core.display.Javascript object>

In [43]:
encoder = LeaveOneOutEncoder(cols=cat_cols)
encoder.fit(X_train, y_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

<IPython.core.display.Javascript object>

In [44]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

<IPython.core.display.Javascript object>

In [45]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 0.9891884676988788
test_score: 0.791889007470651


<IPython.core.display.Javascript object>

In [46]:
%%time
grid = {
    'n_estimators': [150, 200, 250],
    'max_depth':[ 20,25,30],
    'min_samples_leaf':[3, 5,7],
    'criterion':[ 'entropy']
}
model = GridSearchCV(RandomForestClassifier(), param_grid = grid, verbose=1, n_jobs=-1, cv=5)
model.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   28.5s finished


Wall time: 30.3 s


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

<IPython.core.display.Javascript object>

In [47]:
model.best_params_

{'criterion': 'entropy',
 'max_depth': 25,
 'min_samples_leaf': 3,
 'n_estimators': 250}

<IPython.core.display.Javascript object>

In [48]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"train_score: {train_score}")
print(f"test_score: {test_score}")

train_score: 0.8829418045915643
test_score: 0.8009605122732124


<IPython.core.display.Javascript object>