# Practical Machine Learning 
### Project: Model Selection

# Abstract

# Part 1: Business and Data Understanding

### Q. Define the problem domain

In [None]:
# Outcome: a measure of individuals' reading habits
# Useful for understanding the factors that determine reading habits
# Could yield an index that identifies the most influential factor

### Q. Identify candidate questions for your machine learning project

In [None]:
# Which columns should be evaluated?
# Can I use the dataset as-is, or does it need cleaning?
# Are the values discrete or continuous numbers?
# Features, dimensions, variables
# Value counts (value_counts)

### Q. Identify a suitable dataset for your candidate question(s)

Go have a look at any of these websites:

* https://www.kaggle.com/datasets 
* https://datasetsearch.research.google.com/
* https://data.gov.uk/

Find an interesting looking data set related to your problem domain and get a copy of it


In [None]:
# import all needed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the reading-habits CSV (relative path, so it must sit in the notebook's
# working directory) and preview the first rows.
DATA_FILE = 'readinghabits.csv'
data = pd.read_csv(DATA_FILE)
data.head()

In [None]:
# Preview the last five rows (five is the pandas default for tail()).
data.tail(n=5)

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
data.info()

In [None]:
# Report the frame's dimensions as a sentence.
n_rows, n_cols = data.shape
print("dataset contain", n_rows, 'rows and', n_cols, 'columns')

In [None]:
# describe() summary, transposed so that each described column of `data`
# becomes one row of the displayed table.
summary = data.describe()
out = summary.transpose()
out

In [None]:
# Display the column labels of the loaded frame.
data.columns

In [None]:
# Cardinality: number of distinct non-missing values in each column.
for column_name in data.columns:
  distinct_values = data[column_name].nunique()
  print(column_name + " : " + str(distinct_values))

# Part 2: Data Preparation

### Q. Discuss the following types of missing data and how they would be handled in reference to your dataset where applicable.
*	Missing completely at random (MCAR)
*	Missing at random (MAR)
*	Missing Not at Random (MNAR)

### Q. Is there any correlation in the data? How would you decide which columns to keep?

In [None]:
# Keep the raw frame untouched: `df` is a working copy of `data`, and `new_df`
# holds only the rows that have no missing value in any column. The explicit
# copy makes `new_df` an independent frame, so later column assignments do not
# raise SettingWithCopyWarning.
df = data.copy()
new_df = df.dropna().copy()

# Summarise the effect and preview a few rows instead of printing every row.
print("rows before:", len(df), "| rows after dropping missing values:", len(new_df))
new_df.head()

In [None]:
# Drop incomplete rows without mutating in place. This is idempotent: a frame
# that already has no missing values comes back unchanged, so re-running the
# cell is safe.
new_df = new_df.dropna().copy()

# Confirm the result: every column should report zero missing values.
new_df.isna().sum()

In [None]:
# Encode Sex as 0 = female, 1 = any other value.
# The guard keeps the cell idempotent: once the column is numeric, comparing it
# with "female" again would turn every value into 1.
# Case and surrounding whitespace are ignored when matching the label.
if not pd.api.types.is_numeric_dtype(new_df["Sex"]):
    sex_normalised = new_df["Sex"].astype(str).str.strip().str.lower()
    new_df["Sex"] = np.where(sex_normalised == "female", 0, 1)

# Show how many rows ended up in each code to verify the encoding.
new_df["Sex"].value_counts()

In [None]:
# Integer code for each education label. The keys are the labels exactly as
# originally written; surrounding whitespace is ignored when matching.
EDUCATION_CODES = {
    " College graduate": 1,
    " High school incomplete": 2,
    " Post-graduate training/professional school after college ": 3,
    " Some college, no 4-year degree": 4,
    " Technical, trade or vocational school AFTER high school": 5,
    " High school graduate": 6,
}
education_lookup = {label.strip(): code for label, code in EDUCATION_CODES.items()}
already_encoded = set(education_lookup.values())

# Encode the cleaned frame (`new_df`), which is the one every later cell reads;
# writing the codes into `df` would leave `new_df` unchanged.
education_clean = new_df["Education"].map(
    lambda value: value.strip() if isinstance(value, str) else value
)
unmapped = [
    value
    for value in education_clean.unique()
    if value not in education_lookup and value not in already_encoded
]

if unmapped:
    # Encoding only some labels would leave a mixed text/integer column, which
    # later encoders cannot sort; report the gap and leave the column as it is.
    print("Education left unchanged; labels without a code:", unmapped)
else:
    # Values that are already codes pass through, so re-running is harmless.
    new_df["Education"] = education_clean.map(
        lambda value: education_lookup.get(value, value)
    )

new_df["Education"].value_counts()

In [None]:
# Preview the first rows of the cleaned frame. head must be called: without
# the parentheses the cell only shows the bound method's repr.
new_df.head()

In [None]:
# Pairwise correlation between the numeric columns of the cleaned frame.
# corr must be called, and text columns are excluded first because corr()
# cannot compute a correlation for them.
new_df.select_dtypes(include="number").corr()

In [None]:
# A bare expression in the middle of a cell is not displayed, so print the
# available column labels explicitly.
print(new_df.columns.tolist())

# Columns considered for modelling (labels copied verbatim from the frame).
features = ['Age', 'Sex', 'Race', 'Marital status?', 'Education', 'Employement',
       'Incomes', 'How many books did you read during last 12months?',
       'Read any printed books during last 12months?',
       'Read any audiobooks during last 12months?',
       'Read any e-books during last 12months?', 'Last book you read, you…',
       'Do you happen to read any daily news or newspapers?',
       'Do you happen to read any magazines or journals?']

# Fail here, with a readable message, if a hand-copied label does not match.
missing_features = [name for name in features if name not in new_df.columns]
assert not missing_features, f"columns not found in new_df: {missing_features}"

In [None]:
# The label must not also be a feature: leaving 'Education' inside `x` lets
# every model read the answer directly (target leakage).
TARGET = 'Education'
feature_cols = [col for col in features if col != TARGET]

# Drop incomplete rows from the features, then take the labels for exactly the
# rows that remain so that `x` and `y` always stay aligned.
x = new_df.loc[:, feature_cols].dropna().copy()
y = new_df.loc[x.index, TARGET]
assert len(x) == len(y), "features and labels must have the same number of rows"

print(x.shape)
print(y.shape)
x.head()

In [None]:
x.tail()

In [None]:
# At this point `x` still contains text columns, which corr() cannot handle,
# so restrict the correlation matrix to the numeric features.
corr = x.select_dtypes(include="number").corr()

fig, ax = plt.subplots()
sns.heatmap(corr, ax=ax)
ax.set_title("Correlation between numeric features (before encoding)")
plt.show()

In [None]:
# Binarise the target: the two most frequent labels form class 0 and every
# other label forms class 1.
label_counts = y.value_counts()            # ordered from most to least frequent
majority_labels = label_counts.index[:2]   # slicing tolerates fewer than two labels
y2 = (~y.isin(majority_labels)).astype("int64")
y2[:10]

In [None]:
# Turn every text feature into integer codes. Columns that are already numeric
# keep their real values: running them through LabelEncoder would replace each
# value by its rank and distort the distances that KNN and the SVM rely on.
categorical_cols = x.select_dtypes(exclude="number").columns
for col in categorical_cols:
    x[col] = LabelEncoder().fit_transform(x[col])

x.head()


# Part 3: Model Selection


In [None]:
# train and test data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y2,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# The features were encoded before train_test_split produced xtrain / xtest,
# and the models are fitted on those splits, so encoding `x` again here would
# not change anything the models see. Verify the expected state instead.
leftover_text_cols = x.select_dtypes(exclude="number").columns.tolist()
assert not leftover_text_cols, f"columns still not numeric: {leftover_text_cols}"

x.head()

In [None]:
# Correlation between the features after encoding. matplotlib and seaborn are
# already imported in the notebook's import cell, so they are not re-imported.
corr = x.corr()

fig, ax = plt.subplots()
sns.heatmap(corr, ax=ax)
ax.set_title("Correlation between encoded features")
plt.show()

### Q. Use the cheat sheet below to choose the algorithm/estimator suitable for building a model to address your candidate question(s)

* https://scikit-learn.org/stable/tutorial/machine_learning_map/

In [None]:
# Fit each candidate classifier on the training split and report its accuracy
# on the held-out split.
ntree1, ntree2 = 50,100

# Seed every estimator that has a random component so the scores are
# reproducible across runs (GaussianNB and KNN take no random_state).
MODEL_SEED = 42
candidate_models = {
    f"RandomForest ({ntree1} trees)": RandomForestClassifier(n_estimators=ntree1, random_state=MODEL_SEED),
    f"RandomForest ({ntree2} trees)": RandomForestClassifier(n_estimators=ntree2, random_state=MODEL_SEED),
    "GaussianNB": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "KNeighbors": KNeighborsClassifier(),
    "LinearSVC": LinearSVC(random_state=MODEL_SEED),
}

for model_name, model in candidate_models.items():
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    # accuracy_score takes (y_true, y_pred); label each score with its model.
    print(f"{model_name}: {accuracy_score(ytest, predictions):.3f}")

# Part 4: Model Evaluation

### Q. Identify which of the statistical measures below are suitable for the evaluation of your model.

Classification Metrics:
* Accuracy
* Precision
* Recall
* F1 Score

Regression Metrics:
    
* Mean absolute error (MAE)
* Root mean squared error (RMSE)
* Relative absolute error (RAE)
* Relative squared error (RSE)
* Mean Zero One Error (MZOE)
* Coefficient of determination

 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit the chosen classifier (k-nearest neighbours with default settings) on the
# training split and score it on the held-out split.
final_model = KNeighborsClassifier().fit(xtrain, ytrain)
p = final_model.predict(xtest)
final_accuracy = accuracy_score(y_true=ytest, y_pred=p)
print(final_accuracy)

In [None]:
# Persist the fitted model to the file 'reading_habits' in the working
# directory. dump() returns the list of written file names, which the cell
# displays as its output, so no pasted copy of that output is needed.
dump(final_model, 'reading_habits')

In [None]:
# Show the installed scikit-learn release.
# NOTE(review): presumably recorded because the saved model should be reloaded
# with the same scikit-learn version; confirm.
import sklearn

sklearn_version = sklearn.__version__
sklearn_version

# Part 5: Stretch - Model Deployment

### Q. Evaluate the open-source app framework for Machine Learning model deployment below in your own time.

* https://streamlit.io/

In [None]:
# I ran out of my subscription, so I could not do the ML model and endpoints