# In [45]:
# Steps:
# 1. Import Data
# 2. Clean Data
# 3. Split Data into Training/Test Sets
# 4. Create Model
# 5. Train Model
# 6. Make Predictions
# 7. Evaluate and Improve

# Kaggle.com is a great place to get data sets from

# 1. Import Data
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the data set into a DataFrame; evaluate `music_data` on its own in a
# cell to inspect it.
# Training a model requires two sets: an input set and an output set.
# Here the input set is the first two columns (age and gender) and the
# output set is the third column (genre).
music_data = pd.read_csv(filepath_or_buffer='music.csv')

# -----------------------------------------------------------------------------------------

# 2. Clean Data - since all fields have corresponding values, no cleaning is needed in this example
# 3. Split Data into Training/Test Sets
# Output data set (lowercase y by convention): only the 'genre' column.
y = music_data['genre']
# Input data set (uppercase X by convention): drop() builds a new data set
# without the selected column, so X holds everything except 'genre'.
X = music_data.drop('genre', axis=1)
# Evaluate X or y on its own in a cell to inspect either set.

# Best practice is to give 70-80% of the data to training and 20-30% to testing.
# test_size=0.2 holds out 20% of the rows for testing and leaves 80% for
# training; the rows that land in each set are chosen at random.
# Raising test_size to 0.8 would shrink the training share to 20%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# -----------------------------------------------------------------------------------------

# 4. Create Model / 5. Train Model
# A Decision Tree Classifier is used for this project - every algorithm has
# its own pros and cons.
# sklearn is the package name of scikit-learn, one of the most popular
# machine learning libraries.
# fit() hands back the estimator it was called on, so creating and training
# the model can be chained into a single statement.
# Train on the _train data sets only (instead of model.fit(X, y) on all of
# the data) so the test rows stay unseen by the model.
model = DecisionTreeClassifier().fit(X_train, y_train)

# 6. Make Predictions
# predict() takes a two dimensional array: each element is itself an array
# that acts like one new record of the input set. For example
#     model.predict([[21, 1], [22, 0]])
# asks about a 21 year old male and a 22 year old female; for that
# hand-written example the output meant the 21 year old male likes HipHop
# and the 22 year old female likes Dance.
# Here the X_test input data set is passed instead, so the predictions can be
# compared against the known answers in y_test.
predictions = model.predict(X_test)
# Evaluate `predictions` on its own in a cell to inspect the result.

# -----------------------------------------------------------------------------------------

# 7. Evaluate - Calculate Accuracy
# Compare the known genres of the test rows with what the model predicted.
score = accuracy_score(y_true=y_test, y_pred=predictions)
# The score runs from 0 to 1.0, i.e. 0% to 100%.
# Leaving it as the last expression of the cell makes the notebook display it.
score

# Accuracy is (on average) higher when 80% of data is being used for training
# Accuracy is a lot less (on average) when only 20% of data is being used for training
# This happens because very little data is used to train the model
# The more data the model is given, and the cleaner that data is, the better the results will be

# To run current cell without adding another cell press ctrl+enter

# Out: 1.0