In [17]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Set random seed
np.random.seed(0)

# Create an object called iris with the iris data
iris = load_iris()

# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)

#########################################################################
#create dataframe from yelp csv 
input_file = "yelp_reviews_min_test.csv"
df2 = pd.read_csv(input_file)
# put the original column names in a python list
headers = list(df2.columns.values)
print(headers)
##########################################################################

# View the top 5 rows
df.head()

# Add a new column with the species names, this is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# View the top 5 rows
df.head()

# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .5, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .5
df2['is_train'] = np.random.uniform(0, 1, len(df2)) <= .5

# View the top 5 rows
df.head()

# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]
train2, test2 = df2[df2['is_train']==True], df2[df2['is_train']==False]

# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train2))
print('Number of observations in the test data:',len(test2))

# Create a list of the feature column's names
features = df.columns[:4]
features2 = df2.columns[:13]

# View features
features
features2

# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case there
# are three species, which have been coded as 0, 1, or 2.
y = pd.factorize(train['species'])[0]
y2 = np.array(train2['review_stars'])
#y3 = pd.factorize(train2['business_category'])[0]
print(y)
print(y2)
#print(y3)

# View target
y
y2

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf2 = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[features], y)
clf2.fit(train2[features2], y2)

# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])
clf2.predict(test2[features2])

# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]
clf2.predict_proba(test2[features2])[0:10]

# Create actual english names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]

# View the PREDICTED species for the first five observations
preds[0:5]

# View the ACTUAL species for the first five observations
test['species'].head()
test2['review_stars'].head()

# Create confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])
pd.crosstab(test2['review_stars'], preds, rownames=['Actual Review'], colnames=['Predicted Review'])

# View a list of the features and their importance scores
list(zip(train[features], clf.feature_importances_))
list(zip(train2[features2], clf.feature_importances_))

#Test Comment

['user_review_count', 'user_useful', 'user_funny', 'user_cool', 'user_fans', 'user_avg_stars', 'business_stars', 'business_review_count', 'business_category', 'review_useful', 'review_funny', 'review_cool', 'review_stars']
('Number of observations in the training data:', 258)
('Number of observations in the test data:', 242)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 1 1 1 2 2 5 5 5 1 5 5 5 5 5 2 2 2 2 4
 5 5 5 1 1 1 1 3 3 3 3 4 4 5 5 1 1 1 2 4 4 4 4 4 4 4 4 4 5 5 3 3 4 5 5 5 5
 5 5 5 5 2 2 2 2 2 4 4 4 3 3 5 5 5 5 5 5 4 4 4 4 3 3 1 1 5 5 5 5 5 2 1 1 1
 1 1 1 1 4 4 2 2 2 5 5 5 5 5 5 3 5 5 1 1 5 5 5 5 1 1 1 1 5 5 5 5 5 5 5 4 4
 4 4 5 5 5 5 5 5 5 5 5 5 5 3 3 5 4 4 4 4 4 3 3 3 4 5 5 5 5 4 4 4 4 4 1 1 1
 5 5 5 1 2 2 2 2 2 5 5 5 5 3 3 5 3 3 3 5 5 5 5 4 2 5 5 5 5 5 4 5 5 5 5 5 3
 3 4 2 2 2 5 5 5 5 5 5 5 1 4 4 4 4 2 2 5 5 5 5 4 4 4 4 5 5 5 5 5 5 5 5 5]
[  0   1  

ValueError: could not convert string to float: Pizza