### Task 1.

In [None]:
# Importing the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend.preprocessing
import mlxtend.frequent_patterns

In [None]:
# Read the basket data from CSV into a DataFrame
data_ass = pd.read_csv(filepath_or_buffer='assignment_basket.csv')

# Preview the first five rows (the default of head)
data_ass.head(n=5)

In [None]:
# Ten most frequent items over all rows (value_counts sorts by count, descending)
data_ass['Item'].value_counts().head(10)

In [None]:
# Build one encoded transaction table per period of the day.
# Each table has one row per Transaction and one column per item known to its encoder.

# --- Afternoon ---

# Keep only the rows recorded in the afternoon
data = data_ass.loc[data_ass['period_day'] == 'afternoon']

# Gather the items of every transaction into a list -> list of baskets
grocery_list = data.groupby(['Transaction'])['Item'].apply(list).tolist()

# Learn the item vocabulary, then encode the baskets with it
encoder = mlxtend.preprocessing.TransactionEncoder()
encoder.fit(grocery_list)
encoded_data = encoder.transform(grocery_list)

grocery_trans = pd.DataFrame(data = encoded_data, columns = encoder.columns_)

# --- Morning ---

# Same steps, restricted to the morning rows
data_1 = data_ass.loc[data_ass['period_day'] == 'morning']

grocery_list_1 = data_1.groupby(['Transaction'])['Item'].apply(list).tolist()

encoder_1 = mlxtend.preprocessing.TransactionEncoder()
encoder_1.fit(grocery_list_1)
encoded_data_1 = encoder_1.transform(grocery_list_1)

grocery_trans_1 = pd.DataFrame(data = encoded_data_1, columns = encoder_1.columns_)


In [None]:
# Ten most frequent products per period: the column sums of each encoded table
# count the transactions containing each item
for period_label, period_table in (('Afternoon', grocery_trans), ('Morning', grocery_trans_1)):
    print(period_label)
    print(period_table.sum().sort_values(ascending = False).head(10))

In [None]:
# Frequent itemsets for the afternoon: support of at least 0.001, at most 4 items per set,
# reported with item names instead of column indices
frequent_itemsets = mlxtend.frequent_patterns.apriori(
    grocery_trans,
    min_support = 0.001,
    use_colnames = True,
    max_len = 4,
)
# Number of itemsets found
len(frequent_itemsets)

In [None]:
# Frequent itemsets for the morning, using the same thresholds as for the afternoon
frequent_itemsets_1 = mlxtend.frequent_patterns.apriori(
    grocery_trans_1,
    min_support = 0.001,
    use_colnames = True,
    max_len = 4,
)
# Number of itemsets found
len(frequent_itemsets_1)

In [None]:
# Association rules for the afternoon: keep rules with confidence of at least 0.6
rules = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets,
    min_threshold = 0.6,
    metric = "confidence",
)
# Preview the first five rules
rules.head(n=5)

In [None]:
# Association rules for the morning: keep rules with confidence of at least 0.6
rules_1 = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets_1,
    min_threshold = 0.6,
    metric = "confidence",
)
# Preview the first five rules
rules_1.head(n=5)

In [None]:
# Morning rules whose antecedent contains 'Eggs'
# Author's recorded answer for this question: Bread
selection_1 = rules_1['antecedents'].apply(lambda itemset: {'Eggs'}.issubset(itemset))
print(rules_1[selection_1])

In [None]:
# Afternoon rules whose antecedent contains both 'Coke' and 'Juice'
# Author's recorded answer for this question: Sandwich
selection_2 = rules['antecedents'].apply(lambda itemset: {'Coke', 'Juice'}.issubset(itemset))
print(rules[selection_2])

In [None]:
# Rules whose antecedent contains 'Toast', in the afternoon and in the morning
# Author's recorded answer for this question: Coffee

# Afternoon
selection = rules['antecedents'].apply(lambda itemset: {'Toast'}.issubset(itemset))
# Morning (note: this rebinds selection_1)
selection_1 = rules_1['antecedents'].apply(lambda itemset: {'Toast'}.issubset(itemset))

for rule_table, mask in ((rules, selection), (rules_1, selection_1)):
    print(rule_table[mask])

### Task 2.


In [None]:
# Libraries to be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier


# Grid search
from sklearn.model_selection import GridSearchCV

# Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Read the patient data from CSV into a DataFrame
heart = pd.read_csv(filepath_or_buffer='patients.csv')
# Preview the first five rows (the default of head)
heart.head(n=5)

In [None]:
# Check the data info: column names, non-null counts and dtypes
# Author's note: there seem to be no missing values and only numerical variables
# (although some of them take only two possible values)
heart.info()

In [None]:
# Basic summary statistics of the columns
heart.describe()

In [None]:
# Number of patients per outcome class
# Author's note: the two classes are of similar size
heart['outcome'].value_counts()

In [None]:
# Pairwise correlation between the columns (Pearson is the pandas default)
heart.corr(method='pearson')

In [None]:
# Boxplot of heart_rate for each outcome class
# Author's observation: heart rate is clearly higher for class 1
sns.boxplot(data = heart, x = 'outcome', y = 'heart_rate')

In [None]:
# Count plot for a 0-1 variable: bars per outcome class, split by exercise
# Author's observation: there is not much exercise in class 1
sns.countplot(data = heart, x = "outcome", hue = "exercise")

In [None]:
# Select the columns of interest
# Predictors: every column except the target. Dropping 'outcome' by name (instead of
# slicing off the last column) keeps the target out of X wherever it sits in the file
X = heart.drop(columns = ['outcome'])
# Outcome
y = heart['outcome']

# Create the training and test sets, with 25% of the data in the test set
# random_state is fixed so the split is the same on every run

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Baseline model: a logistic regression without any tuning

# max_iter is raised to 1000 to give the solver enough iterations
cancer_lr = LogisticRegression(max_iter = 1000)

# Train on the training split; fit returns the fitted estimator itself,
# so lr_model and cancer_lr refer to the same object
cancer_lr.fit(X_train, y_train)
lr_model = cancer_lr

# Evaluate performance on the held-out test split
# Author's note: the result is already quite good

pred_lr = lr_model.predict(X_test)
for summary in (confusion_matrix(y_test, pred_lr), classification_report(y_test, pred_lr)):
    print(summary)

In [None]:
# Tune a logistic regression with an exhaustive grid search
model = LogisticRegression()

# Candidate values for the maximum number of solver iterations
iterations = [500, 600, 700, 800]

# Candidate values for C
c_values = [0.01, 0.1, 1, 10, 100]

# Candidate class weights
weights = ['balanced', {0:0.3, 1:0.7}]

# The grid is a dictionary keyed by the parameter names of LogisticRegression
# 4 x 5 x 2 = 40 parameter combinations, i.e. 40 different models will be tested

grid = dict(max_iter = iterations, C = c_values, class_weight = weights)

# estimator is the model to tune, param_grid is the dictionary specified above,
# scoring is the performance measure to optimise: accuracy here
# (another measure, e.g. 'recall', can be tried instead)
grid_search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy')

# Fit on the training split only, so that the held-out test set plays no part
# in choosing the hyper-parameters

grid_result = grid_search.fit(X_train, y_train)

# Print the best cross-validated accuracy and the parameters that achieved it

print("Best result is", grid_result.best_score_, 'using', grid_result.best_params_)

In [None]:
# Tune a decision tree with an exhaustive grid search

# random_state is fixed so the fitted trees are reproducible
heart_tree = DecisionTreeClassifier(random_state = 42)

# Candidate split-quality criteria
criterion = ['gini', 'entropy']

# Candidate maximum depths of the tree
max_depth = [2,4,6,8,10,12]

# Candidate class weights
weights = ['balanced', {0:0.1, 1:0.9}]

# The grid: 2 x 6 x 2 = 24 possible models
grid = dict(criterion = criterion, max_depth = max_depth, class_weight = weights)

# Grid search optimising accuracy

grid_search = GridSearchCV(estimator=heart_tree, param_grid=grid, scoring='accuracy')

# Fit on the training split only, like the other models in this task, so that the
# held-out test set plays no part in choosing the hyper-parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best cross-validated accuracy and the parameters that achieved it
print("Best result is", grid_result.best_score_, 'using', grid_result.best_params_)

In [None]:
# Bagging: an ensemble of decision trees, each trained on a different resample
# of the training data

# Decision tree used as the base model of the ensemble
cancer_bag = DecisionTreeClassifier(random_state = 42)

# Bagging classifier with 300 trees.
# The base model is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, while the first positional slot is the base model in both.
# random_state is fixed so the resampling, and hence the report below, is reproducible.

bag_cancer = BaggingClassifier(cancer_bag, n_estimators = 300, random_state = 42)

# Fit on the training data
bag_cancer.fit(X_train, y_train)

# Evaluate on the held-out test split
pred_bag = bag_cancer.predict(X_test)
print(confusion_matrix(y_test, pred_bag))
print(classification_report(y_test, pred_bag))

In [None]:
# Random forest with 400 trees, trained on the training split
# By default, the number of features considered at each split is the square root
# of the total number of columns
# fit returns the fitted estimator, so construction and training can be chained

forest_cancer = RandomForestClassifier(random_state = 0, n_estimators = 400).fit(X_train, y_train)

# Evaluate on the held-out test split

pred_forest = forest_cancer.predict(X_test)
print(confusion_matrix(y_test, pred_forest))

# Author's note (cannot be confirmed from the code alone): the results improved further,
# two previously misclassified cases are now correct
print(classification_report(y_test, pred_forest))

In [None]:
# Feature importances of the random forest, labelled by predictor name and sorted
# in ascending order (the most important predictors are at the bottom)
pd.Series(forest_cancer.feature_importances_, index = X_train.columns).sort_values(ascending = True)

In [None]:
# Decision-tree settings written down as a dict so that the cell is valid Python
# (the bare key/value list was a syntax error).
# NOTE(review): presumably copied from the decision-tree grid-search output;
# re-run the search to confirm these are still the best parameters.
best_tree_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 2}
best_tree_params

### Task 3.

In [None]:
# Read the house data from CSV into a DataFrame
house_data = pd.read_csv(filepath_or_buffer='House_assignment.csv')
# Preview the first five rows (the default of head)
house_data.head(n=5)

In [None]:
# Author's notes on feature selection:
# An exploratory analysis like the one done earlier can be repeated with a focus on the price column
# (the original note refers to Task 1; the correlation and plot checks in this file are under Task 2).
# Almost all variables behave similarly: they either have 0 correlation with the price,
# or, when they are categorical, the mean price is the same across categories.
# The only variable that is somewhat reasonable to include is the area.
# Including others is fine too, but it does not really give better results.

# Single predictor (kept as a one-column DataFrame) and the target
X = house_data.loc[:, ['Area']]
y = house_data.loc[:, 'Price']

# Hold out 25% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)

In [None]:
# Baseline regression tree with default settings, trained on the training split
# fit returns the fitted estimator, so construction and training can be chained

house_tree = DecisionTreeRegressor(random_state = 42).fit(X_train, y_train)

# Predict the prices of the held-out test split
house_pred = house_tree.predict(X_test)

# Mean squared error on the test split
# Author's note: the MSE is around 22 million
mse_house = MSE(y_true = y_test, y_pred = house_pred)
print('MSE:', mse_house)

In [None]:
# Root mean squared error expressed as a percentage of the mean price
# Author's notes: the square root of the MSE is less than 5000, which is a small error,
# roughly 0.1% of the mean price, even though only one variable is used
100 * (mse_house ** 0.5) / house_data['Price'].mean()