In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import neighbors
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# CS3033/CS6405 - Data Mining - Second Assignment

### Submission

This assignment is **due on 06/04/22 at 23:59**. You should submit a single .ipynb file with your Python code and analysis electronically via Canvas.
Please note that this assignment will account for 25 Marks of your module grade.

### Declaration

By submitting this assignment, I agree to the following:

<font color="red">“I have read and understand the UCC academic policy on plagiarism, and agree to the requirements set out thereby in relation to plagiarism and referencing. I confirm that I have referenced and acknowledged properly all sources used in the preparation of this assignment.
I declare that this assignment is entirely my own work based on my personal study. I further declare that I have not engaged the services of another to either assist me in, or complete this assignment”</font>

### Objective

The Boolean satisfiability (SAT) problem consists in determining whether a Boolean formula F is satisfiable or not. F is represented by a pair (X, C), where X is a set of Boolean variables and C is a set of clauses in Conjunctive Normal Form (CNF). Each clause is a disjunction of literals (a variable or its negation). This problem is one of the most widely studied combinatorial problems in computer science. It is the classic NP-complete problem. Over the past number of decades, a significant amount of research work has focused on solving SAT problems with both complete and incomplete solvers.

Recent advances in supervised learning have provided powerful techniques for classifying problems. In this project, we see the SAT problem as a classification problem. Given a Boolean formula (represented by a vector of features), we are asked to predict if it is satisfiable or not.

In this project, we represent SAT problems with a vector of 327 features with general information about the problem, e.g., number of variables, number of clauses, fraction of horn clauses in the problem, etc. There is no need to understand the features to be able to complete the assignment.

The dataset is available at:
https://github.com/andvise/DataAnalyticsDatasets/blob/main/dm_assignment2/sat_dataset_train.csv

This is original unpublished data.

## Data Preparation

In [None]:
import pandas as pd

# Training set for the SAT classification task. The URL names a specific commit
# of the course repository, so the download is the same on every run.
TRAIN_CSV_URL = "https://github.com/andvise/DataAnalyticsDatasets/blob/6d5738101d173b97c565f143f945dedb9c42a400/dm_assignment2/sat_dataset_train.csv?raw=true"
df = pd.read_csv(TRAIN_CSV_URL)

# Preview the first rows.
df.head()

Unnamed: 0,c,v,clauses_vars_ratio,vars_clauses_ratio,vcg_var_mean,vcg_var_coeff,vcg_var_min,vcg_var_max,vcg_var_entropy,vcg_clause_mean,...,rwh_0_max,rwh_1_mean,rwh_1_coeff,rwh_1_min,rwh_1_max,rwh_2_mean,rwh_2_coeff,rwh_2_min,rwh_2_max,target
0,420,10,42.0,0.02381,0.6,0.0,0.6,0.6,0.0,0.6,...,78750.0,8e-06,0.0,7.875e-06,8e-06,2.385082e-21,0.0,2.385082e-21,2.385082e-21,1
1,230,20,11.5,0.086957,0.137826,0.089281,0.117391,0.16087,2.180946,0.137826,...,6646875.0,17433.722184,1.0,2.981244e-12,34867.444369,17277.21,1.0,1.358551e-53,34554.42,0
2,240,16,15.0,0.066667,0.3,0.0,0.3,0.3,0.0,0.3,...,500000.0,1525.878932,0.0,1525.879,1525.878932,1525.879,0.0,1525.879,1525.879,1
3,424,30,14.133333,0.070755,0.226415,0.485913,0.056604,0.45283,2.220088,0.226415,...,87500.0,0.000122,1.0,6.535723e-14,0.000245,8.218628e-07,1.0,1.499676e-61,1.643726e-06,0
4,162,19,8.526316,0.117284,0.139701,0.121821,0.111111,0.185185,1.940843,0.139701,...,5859400.0,16591.49431,1.0,6.912725999999999e-42,33182.988621,16659.03,1.0,0.0,33318.07,1


In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

c                       int64
v                       int64
clauses_vars_ratio    float64
vars_clauses_ratio    float64
vcg_var_mean          float64
                       ...   
rwh_2_mean            float64
rwh_2_coeff           float64
rwh_2_min             float64
rwh_2_max             float64
target                  int64
Length: 328, dtype: object

In [None]:
# Count the rows carrying each label to see how balanced the classes are.
df['target'].value_counts()

1    976
0    953
Name: target, dtype: int64

In [None]:
# Count the +inf / -inf cells over the whole frame; a non-zero total means
# they have to be handled before modelling.
inf_values = np.isinf(df.to_numpy()).sum()
inf_values

649

In [None]:
# Count the NaN cells over the whole frame; a non-zero total means
# the data needs imputation.
miss_values = np.isnan(df.to_numpy()).sum()
miss_values

15808

In [None]:
# Infinite entries cannot be averaged, so recode them as missing first.
df = df.replace([np.inf, -np.inf], np.nan)

# The label is the last column. It is kept out of the imputer so that a missing
# label can never be replaced by a fractional "mean" class; rows without a
# label are dropped instead.
df = df.dropna(subset=[df.columns[-1]])
predictors = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# NOTE: the column means are learned from every row, i.e. before the
# train/test split performed later in the notebook.
Imp_Mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = Imp_Mean.fit_transform(predictors)

# SimpleImputer drops columns that hold no observed value at all (their learned
# statistic is NaN). Keep the names of the surviving columns so the rebuilt
# frame stays labelled instead of falling back to 0..n-1.
surviving = predictors.columns[~np.isnan(Imp_Mean.statistics_)]
df = pd.DataFrame(imputed_values, columns=surviving, index=predictors.index)

# Re-attach the untouched label as the last column, where later cells expect it.
df[labels.name] = labels

In [None]:
# Column-wise split: every column but the last is a predictor, and the last
# column (kept as a one-column frame) is the label.
feature, target = df.iloc[:, :-1], df.iloc[:, -1:]

# Tasks

## Basic models and evaluation (5 Marks)

Using Scikit-learn, train and evaluate K-NN and decision tree classifiers using 70% of the dataset for training and 30% for testing. For this part of the project, we are not interested in optimising the parameters; we just want to get an idea of the dataset. Compare the results of both classifiers.

In [None]:
# 70/30 hold-out split; the fixed seed makes the partition repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    feature,
    target,
    train_size=0.7,
    random_state=121100898,
)
# Flatten the one-column training label frame into a 1-D array for fitting.
ytrain = ytrain.to_numpy().ravel()

In [None]:
# Baseline k-NN with default settings, fitted on the unscaled training features.
knn = neighbors.KNeighborsClassifier().fit(xtrain, ytrain)

# Predict the hold-out rows and report the fraction classified correctly.
predicted_values = knn.predict(xtest)
knn_accuracy = accuracy_score(y_true=ytest, y_pred=predicted_values)
print("Accuracy:", knn_accuracy)


Accuracy: 0.8756476683937824


In [None]:
# Baseline decision tree with default settings.
# The tree permutes the candidate features at every split, so without a seed
# two runs can grow different trees and report different accuracies. The seed
# used for the train/test split is reused to make this cell repeatable.
clf = DecisionTreeClassifier(random_state=121100898)
clf = clf.fit(xtrain, ytrain)

# Score the hold-out predictions.
predicted_values = clf.predict(xtest)
print("Accuracy:", accuracy_score(ytest, predicted_values))

Accuracy: 0.9827288428324698


The decision tree performs better than k-NN on the hold-out set: it reaches an accuracy of 98.3%, whereas k-NN reaches 87.6%. Applying weights to the data or scaling it might improve k-NN, whereas the decision tree does not need scaling.

## Robust evaluation (10 Marks)

In this section, we are interested in more rigorous techniques by implementing more sophisticated methods, for instance:
* Hold-out and cross-validation.
* Hyper-parameter tuning.
* Feature reduction.
* Feature normalisation.

Your report should provide concrete information of your reasoning; everything should be well-explained.

Do not get stressed if the things you try do not improve the accuracy. The key to getting good marks is to show that you evaluated different methods and that you correctly selected the configuration.

In [None]:
# Learn each feature's min/max on the training rows only, then apply the same
# rescaling to both partitions so the test rows never influence the scaler.
scaler = MinMaxScaler().fit(xtrain)
x_train = scaler.transform(xtrain)
x_test = scaler.transform(xtest)

KNN Classifier

In [None]:
# Tune k-NN on the scaled training data with cross-validated grid search.
knn = neighbors.KNeighborsClassifier()
# Only `n_neighbors` is searched. `algorithm` and `leaf_size` choose how the
# neighbours are looked up (speed and memory), not which neighbours are found,
# so searching them refits equivalent models (apart from tie-breaking) and
# multiplies the run time without changing the selected k.
parameters = {'n_neighbors': [3, 5, 7, 11, 13, 15]}

gridSearch = model_selection.GridSearchCV(knn, parameters)
gridSearch.fit(x_train, ytrain)

# Best configuration and its mean cross-validated accuracy.
print("Classifier : ", gridSearch.best_estimator_)
print("Parameters : ",gridSearch.best_params_)
print("Accuracy  : ",gridSearch.best_score_)

# Accuracy of the refitted best model on the held-out 30%.
print(" Test set accuracy :",gridSearch.best_estimator_.score(x_test, ytest))

Classifier :  KNeighborsClassifier(algorithm='ball_tree', n_neighbors=3)
Parameters :  {'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 3}
Accuracy  :  0.8962962962962964
 Test set accuracy : 0.8911917098445595


Decision Tree

In [None]:
# Tune the decision tree on the scaled training data with cross-validated grid search.
# The tree is seeded (same seed as the train/test split): an unseeded tree
# permutes features at each split, so the chosen parameters and scores could
# change from one run to the next.
clf = DecisionTreeClassifier(random_state=121100898)
parameters = {'criterion':['gini', 'entropy'] , 'max_depth':[10,15,20,25,30,35,40], 'max_leaf_nodes' : [5,10,15,20,25,30,35,40,45,50]}

grid_search = model_selection.GridSearchCV(clf, parameters)
grid_search.fit(x_train, ytrain)

# Best configuration and its mean cross-validated accuracy.
print("Parameters chosen : ",grid_search.best_params_)
print("Accuracy : ",grid_search.best_score_)

# Accuracy of the refitted best model on the held-out 30%.
print("test set accuracy : ",grid_search.best_estimator_.score(x_test, ytest))

Parameters chosen :  {'criterion': 'gini', 'max_depth': 15, 'max_leaf_nodes': 25}
Accuracy :  0.9844444444444445
test set accuracy :  0.9775474956822107


Principal Component Analysis

In [None]:
# Project the scaled features onto 4 principal components. The projection is
# learned on the training rows and then applied to the test rows.
# With the default 'auto' solver PCA may use a randomized SVD for data of this
# size, so it is seeded to make the components repeatable.
pca = PCA(n_components=4, random_state=121100898)
x__train = pca.fit_transform(x_train)
x__test = pca.transform(x_test)

knn = neighbors.KNeighborsClassifier()
# `leaf_size` only tunes the neighbour-lookup structure (speed and memory), not
# the predictions, so only `n_neighbors` is searched.
parameters = {'n_neighbors': [3, 5, 7, 11, 13, 15]}

gridSearch_knn = model_selection.GridSearchCV(knn, parameters)
gridSearch_knn.fit( x__train, ytrain)

# Best configuration and its mean cross-validated accuracy.
print("Classifier :", gridSearch_knn.best_estimator_)
print("Accuracy :",gridSearch_knn.best_score_)
print("Parameters :",gridSearch_knn.best_params_)

# Accuracy of the refitted best model on the projected held-out rows.
print("Test set accuracy : ",gridSearch_knn.best_estimator_.score(x__test, ytest))


Classifier : KNeighborsClassifier(n_neighbors=3)
Accuracy : 0.8770370370370368
Parameters : {'leaf_size': 30, 'n_neighbors': 3}
Test set accuracy :  0.8773747841105354


In [None]:
# Decision tree tuned on the PCA projection of the scaled data.
# The tree is seeded (same seed as the train/test split) so that the selected
# parameters and the reported scores are repeatable.
clf = DecisionTreeClassifier(random_state=121100898)
parameters = {'criterion':['gini', 'entropy'] , 'max_depth':[10,15,20,25,30,35,40], 'max_leaf_nodes' : [5,10,15,20,25,30,35,40,45,50]}

gridSearch_dt = model_selection.GridSearchCV(clf,parameters)
gridSearch_dt.fit(x__train, ytrain)

# Best configuration and its mean cross-validated accuracy.
print("Classifier :", gridSearch_dt.best_estimator_)
print("Accuracy :",gridSearch_dt.best_score_)
print("Parameters :",gridSearch_dt.best_params_)

# Accuracy of the refitted best model on the projected held-out rows.
print("Accuracy for test set : ",gridSearch_dt.best_estimator_.score(x__test,ytest))

Classifier : DecisionTreeClassifier(max_depth=15, max_leaf_nodes=50)
Accuracy : 0.8570370370370372
Parameters : {'criterion': 'gini', 'max_depth': 15, 'max_leaf_nodes': 50}
Accuracy for test set :  0.8255613126079447


The decision tree is more accurate than k-NN: its best cross-validated accuracy is 98.4%, whereas k-NN reaches 89.1% on the test set. Principal Component Analysis on the scaled data has not helped: it reduced the accuracy of both k-NN and the decision tree by a fairly big amount. We could try other techniques, such as recursive feature elimination, to deal with the high dimensionality of the data.
NOTE: The decision tree was fitted on the scaled data because it takes less time to run. Ideally, a decision tree does not need the data to be scaled.

## New classifier (10 Marks)

Replicate the previous task for a classifier that we did not cover in class. So different than K-NN and decision trees. Briefly describe your choice.
Try to create the best model for the given dataset.
Save your best model into your github. And create a single code cell that loads it and evaluate it on the following test dataset:
https://github.com/andvise/DataAnalyticsDatasets/blob/main/dm_assignment2/sat_dataset_test.csv

This link currently contains a sample of the training set. The real test set will be released after the submission. I should be able to run the code cell independently, load all the libraries you need as well.

A gradient boosting model has been fitted to the data because the decision tree performs well and gradient boosting uses an ensemble of decision trees for classification. It gives an accuracy of 99.3%, which is the best of all the models tried. It takes a while to run. The data has been scaled using a min-max scaler.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

Gradient boosting

In [None]:
# Rebuild the min-max scaling for this section: the ranges are learned from the
# training rows only and then applied unchanged to both partitions.
scaler = MinMaxScaler()
scaler.fit(xtrain)
x_train, x_test = scaler.transform(xtrain), scaler.transform(xtest)

In [None]:
#Parameters for the grid search for gradient boosting model
parameters = {
    'loss' : ['deviance', 'exponential'],
    'criterion' : [ 'squared_error', 'mae' ]
}
#Defining the classifer
gbm = GradientBoostingClassifier()
#Performing grid search
gridSearch = GridSearchCV(estimator = gbm, param_grid = parameters, cv = 3, n_jobs = -1)
gridSearch.fit(x_train, ytrain)

predicted = gridSearch.predict(x_test)

print("Classifier :", gridSearch.best_estimator_)
print("Accuracy :",gridSearch.best_score_)
print("Parameters chosen :",gridSearch.best_params_)

print("Accuracy for test data : ",gridSearch.best_estimator_.score(x_test, ytest))

Classifier : GradientBoostingClassifier(criterion='squared_error')
Accuracy : 0.994074074074074
Parameters chosen : {'criterion': 'squared_error', 'loss': 'deviance'}
Accuracy for test data :  0.9930915371329879


In [None]:
# Same model family on the unscaled features, to check whether scaling matters.
# `accuracy_score` is already imported in the first cell of the notebook.
# Seeded (same seed as the train/test split) so the comparison is repeatable.
gbm1 = GradientBoostingClassifier(criterion = 'squared_error', loss = 'exponential', random_state = 121100898)
gbm1.fit(xtrain, ytrain)
predictions = gbm1.predict(xtest)
accuracy_score(ytest, predictions)

0.9896373056994818

In [None]:
# Final model: gradient boosting with default settings on the unscaled
# features. This is the estimator that is exported further down, so it is
# seeded (same seed as the train/test split) to keep the saved artefact
# reproducible from the notebook.
gbm_final = GradientBoostingClassifier(random_state = 121100898)
# Sanity check on the training matrix: (rows, features).
print(xtrain.shape)

gbm_final.fit(xtrain, ytrain)
predictions = gbm_final.predict(xtest)
gbm_accuracy = accuracy_score(ytest,predictions)
print(gbm_accuracy)

(1350, 323)
0.9913644214162349


# <font color="blue">FOR GRADING ONLY</font>

Save your best model into your github. And create a single code cell that loads it and evaluate it on the following test dataset: 
https://github.com/andvise/DataAnalyticsDatasets/blob/main/dm_assignment2/sat_dataset_test.csv

In [None]:
import os
from getpass import getpass
import urllib

In [None]:
# Clone the GitHub repository that the exported model file is committed to below.
!git clone https://github.com/Aparna-K28/DM_Final.git

Cloning into 'DM_Final'...


In [None]:
# Change the kernel's working directory to the clone, so the git commands and
# the model dump below operate inside the repository.
%cd DM_Final

/content/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/DM_Final


In [None]:
# NOTE(review): the directory was created by `git clone`, so it is already a
# repository; the recorded output ("Reinitialized existing Git repository")
# shows this command only re-initialises it.
!git init

Reinitialized existing Git repository in /content/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/Data_Mining/DM_Final/.git/


In [None]:
# Set the author identity (global git configuration) used by the commit below.
!git config --global user.email "aparnak2800@gmail.com"
!git config --global user.name "Aparna-K28"

In [None]:
from joblib import dump, load

# Serialise the final gradient boosting model into the current working
# directory (the cloned repository) so it can be committed.
dump(value=gbm_final, filename='f_model.joblib')

['f_model.joblib']

In [None]:
# Stage the serialised model file.
!git add f_model.joblib

In [None]:
# Record the staged model file in the local repository.
!git commit -m "First Commit"

[master (root-commit) bc0087c] First Commit
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 f_model.joblib


In [None]:
# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded, so import it explicitly.
import urllib.parse

# Ask for the GitHub account, the secret and the repository interactively so
# that none of them is stored in the notebook.
user = input('Username : ')
password = getpass('Password : ')
# The secret is embedded in the user-info part of an HTTPS URL in the next
# cell, so every reserved character must be percent-encoded. By default
# `quote` leaves '/' untouched, which would corrupt that URL; safe='' escapes
# it as well.
password = urllib.parse.quote(password, safe='')
repo_name = input('Repository : ')

Username : Aparna-K28
Password : ··········
Repository : DM_Final


In [None]:
import subprocess

# Remote URL carrying the credentials entered above.
# NOTE(security): git keeps the remote URL, including the embedded secret, in
# the repository configuration; prefer a short-lived token and reset the
# remote after pushing.
remote_url = 'https://{0}:{1}@github.com/{0}/{2}.git'.format(user, password, repo_name)

# The arguments are passed as a list, so no shell parses the user-supplied
# text, and check=True raises if git fails instead of silently returning a
# non-zero status. The cell still displays the exit status (0 on success).
subprocess.run(['git', 'remote', 'set-url', 'origin', remote_url], check=True).returncode

0

In [None]:
# Upload the commit containing the model file to the `master` branch on GitHub.
!git push origin master

Counting objects: 3, done.
Delta compression using up to 2 threads.
Compressing objects:  50% (1/2)   Compressing objects: 100% (2/2)   Compressing objects: 100% (2/2), done.
Writing objects:  33% (1/3)   Writing objects:  66% (2/3)   Writing objects: 100% (3/3)   Writing objects: 100% (3/3), 52.51 KiB | 4.38 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0)
To https://github.com/Aparna-K28/DM_Final.git
 * [new branch]      master -> master


In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import neighbors
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from io import BytesIO
import requests
from joblib import dump, load

# Course data sets. The training file is pinned to the commit that was used
# when the model was fitted; the test file is the one named in the assignment.
TRAIN_URL = "https://github.com/andvise/DataAnalyticsDatasets/blob/6d5738101d173b97c565f143f945dedb9c42a400/dm_assignment2/sat_dataset_train.csv?raw=true"
TEST_URL = "https://github.com/andvise/DataAnalyticsDatasets/blob/main/dm_assignment2/sat_dataset_test.csv?raw=true"


def _read_features_and_label(url):
    """Read one SAT csv and return ``(features, label)``.

    +/-inf values are recoded as NaN so they can be imputed. The label is
    taken to be the last column, as in the rest of the notebook.
    """
    frame = pd.read_csv(url).replace([np.inf, -np.inf], np.nan)
    return frame.iloc[:, :-1], frame.iloc[:, -1]


train_features = _read_features_and_label(TRAIN_URL)[0]
test_features, target = _read_features_and_label(TEST_URL)

# The model was trained on features imputed with the *training* column means,
# and the imputer drops columns that are entirely missing. Fitting a fresh
# imputer on the test file would use different means and could drop a
# different set of columns, leaving a feature matrix the model cannot accept.
# Refit on the training features so the test data gets the same preprocessing;
# the label is never passed through the imputer.
imput = SimpleImputer(missing_values=np.nan, strategy='mean').fit(train_features)

# Align the test columns with the training columns by name before transforming.
test_features = test_features.reindex(columns=train_features.columns)
features = pd.DataFrame(imput.transform(test_features))
print(features.shape)


mLink = 'https://github.com/Aparna-K28/DM_Final/blob/bc0087c40a010185362774807b041397748b6f6b/f_model.joblib?raw=true'
response = requests.get(mLink, timeout=60)
# Fail with a clear HTTP error instead of trying to unpickle an error page.
response.raise_for_status()
# NOTE(security): joblib.load unpickles the payload, which can execute
# arbitrary code; only load model files from a repository you trust.
mfile = BytesIO(response.content)
model = load(mfile)

predicted = model.predict(features)
print("Accuracy: ", accuracy_score(target, predicted))

(483, 323)
Accuracy:  0.9917184265010351
