# Import Libraries

In [75]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import pyreadr
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import seaborn as sns
from keras import regularizers
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from numpy import concatenate
from sklearn.neighbors import LocalOutlierFactor
from keras.models import Model, load_model, Sequential
from keras.layers import Input, Dense
import collections
from sklearn.linear_model import LogisticRegression
from semisupervised import S3VM
from sklearn.cluster import KMeans
from sklearn.semi_supervised import LabelPropagation

In [45]:
# Load the credit-approval data set and preview its first five rows
dataset = pd.read_csv('credit_approval.csv')
dataset.head()

Unnamed: 0,C1,N2,N3,C4_enc,C5_enc,C6_enc,N7,C8,C9,N10,C11,C12_enc,N13,N14,Target
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
3,0,21.67,11.5,1,5,3,0.0,1,1,11,1,2,0,1,1
4,1,20.17,8.17,2,6,4,1.96,1,1,14,0,2,60,159,1


In [46]:
# 'New Label' starts as a copy of the ground truth; every row from index 100
# onwards is then overwritten with -1 to create the unlabeled portion.
dataset['New Label'] = dataset['Target']
dataset.loc[100:, 'New Label'] = -1

# X keeps the 'Target' column as its last column (only 'New Label' is removed);
# the later cells read the ground truth from that column after splitting.
X = dataset.drop(columns='New Label').values
# y is the partially hidden label vector.
y = dataset['New Label'].values

In [47]:
# How many rows are unlabeled (-1) versus labelled with class 0 / 1
dataset.loc[:, "New Label"].value_counts()

-1    590
 0     56
 1     44
Name: New Label, dtype: int64

# Semi Supervised Learning
Semi-supervised learning is the branch of machine learning concerned with using labelled
as well as unlabelled data to perform certain learning tasks. Generally, the main interest of research on semi-supervised learning is focused on classification. Semi-supervised classification methods are
particularly relevant to scenarios where labelled data is scarce. In those cases, it may be
difficult to construct a reliable supervised classifier. This situation occurs in application
domains where labelled data is expensive or difficult to obtain, like computer-aided diagnosis,
drug discovery and part-of-speech tagging. If sufficient unlabelled data is available and under
certain assumptions about the distribution of the data, the unlabelled data can help in the
construction of a better classifier.

## Inductive
**Definition:** Induction is reasoning from observed training cases to general rules, which are then applied to the test cases.

Inductive learning is the same as what we commonly know as traditional supervised learning. We build and train a machine learning model based on a labelled training dataset we already have. Then we use this trained model to predict the labels of a testing dataset which we have never encountered before.

### Wrapper Methods
A simple approach to extending existing, supervised algorithms to the semi-supervised setting
is to first train classifiers on labelled data, and to then use the predictions of the resulting
classifiers to generate additional labelled data. The classifiers can then be re-trained on this
pseudo-labelled data in addition to the existing labelled data. Such methods are known as
wrapper methods: the unlabelled data is pseudo-labelled by a wrapper procedure, and a
purely supervised learning algorithm, unaware of the distinction between originally labelled
and pseudo-labelled data, constructs the final inductive classifier.

#### Self Training
Self-training methods consist of a single supervised
classifier that is iteratively trained on both labelled data and data that has been pseudo-labelled
in previous iterations of the algorithm. At the beginning of the self-training procedure, a supervised classifier is trained on only the labelled data. The resulting classifier is used to obtain predictions for the unlabelled data points. Then, the most confident of these predictions are added to the labelled data set, and the supervised classifier is re-trained on both the original labelled data and the newly obtained pseudo-labelled data. This procedure is typically iterated until no more unlabelled data remain.

In [48]:
# 70/30 split with a fixed seed; y holds the partially hidden labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [49]:
# Gaussian Naive Bayes is the supervised base learner ...
gnb = GaussianNB()
# ... which SelfTrainingClassifier wraps for the self-training procedure.
self_training_model = SelfTrainingClassifier(gnb)

In [87]:
# Re-create the split inside this cell so that the cell is idempotent: the
# slicing below removes the last column of X_train / X_test in place, so
# re-running it on already-sliced arrays would strip a real feature and read a
# feature column as the ground truth. Same arguments and seed as the split
# above, hence the resulting partition is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

y_true = X_test[:, -1]      # ground-truth target, carried as the last column of X
X_test = X_test[:, :-1]     # drop target column
X_train = X_train[:, :-1]   # drop target column

# Before we fit any models, we need to scale our features: this ensures all
# features are on the same numerical scale. The scaler is fitted on the
# training portion only and then applied to both portions.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
# Fit on the partially labelled training split, then predict the held-out rows
y_pred = self_training_model.fit(X_train, y_train).predict(X_test)

In [52]:
# Per-class precision / recall / F1 against the ground-truth target
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.89      0.82       115
         1.0       0.82      0.64      0.72        92

    accuracy                           0.78       207
   macro avg       0.79      0.76      0.77       207
weighted avg       0.78      0.78      0.77       207



### Unsupervised Preprocessing 


Unsupervised preprocessing methods use the
unlabelled data and labelled data in two separate stages. Typically, the unsupervised stage comprises either the automated extraction or transformation of sample features from the unlabelled data (feature extraction), the unsupervised clustering of the data (cluster-then-label), or the initialization of the parameters of the learning procedure (pre-training).

#### Cluster then Label

Many semi-supervised learning algorithms use principles from clustering to
guide the classification process. Cluster-then-label approaches usually apply an unsupervised or
semi-supervised clustering algorithm to all available data, and use the resulting clusters to
guide the classification process. In our case, we cluster both the labelled and unlabelled instances of the training set, and then use the majority class among the labelled instances in each cluster to assign a label to the whole cluster. That way, the training set, which previously contained both labelled and unlabelled data, now contains only labelled data. We then fit a supervised learner (Logistic Regression) on this data set and use it to predict the test data from the initial split.

In [53]:
# Fresh 70/30 split for the cluster-then-label experiment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# The last column of X is the ground-truth target: keep the test part for
# evaluation and remove the column from both feature matrices.
y_true = X_test[:, -1]
X_train, X_test = X_train[:, :-1], X_test[:, :-1]

# Standardise the features so they share one numerical scale; the scaler
# learns its statistics from the training portion only.
train_scaler = preprocessing.StandardScaler()
X_train = train_scaler.fit_transform(X_train)
X_test = train_scaler.transform(X_test)
X_train.shape

(483, 14)

In [55]:
# UNSUPERVISED PREPROCESSING
# Cluster the whole training split (labelled and unlabelled rows together).
# `n_jobs` is intentionally not passed: KMeans deprecated it in scikit-learn
# 0.23 and removed it in 1.0, where passing it raises a TypeError.
# `n_init` is pinned to 10 (the long-standing default) so the clustering does
# not change on releases whose default differs.
clustering = KMeans(n_clusters=2, n_init=10, random_state=42)
clustering.fit(X_train)

# Cluster id assigned to each training row
train_labels = clustering.labels_

# Feature names: every dataset column except the two trailing ones
# ('Target' and 'New Label').
column_names = list(dataset.columns[:-2])

df = pd.DataFrame(X_train, columns=column_names)
df['clusters'] = train_labels
# Partially hidden labels of the training rows (-1 = unlabeled)
df["true"] = y_train

df.head(5)

Unnamed: 0,C1,N2,N3,C4_enc,C5_enc,C6_enc,N7,C8,C9,N10,C11,C12_enc,N13,N14,clusters,true
0,-1.394676,1.160665,-0.736898,0.594187,0.991905,-0.343704,-0.669543,-1.053172,-0.844255,-0.477545,1.093368,0.25333,0.473758,-0.239587,1,-1
1,-1.394676,2.042573,0.497563,0.594187,-0.932152,1.656403,1.532486,0.949513,1.184477,0.123117,1.093368,0.25333,-0.450387,-0.227492,0,0
2,-1.394676,-0.637853,-0.838416,0.594187,0.991905,1.656403,-0.295613,0.949513,1.184477,-0.277325,-0.914605,0.25333,-1.017347,-0.075573,0,-1
3,0.717012,-0.72926,-0.787657,0.594187,-0.107556,-0.343704,-0.5449,0.949513,-0.844255,-0.477545,1.093368,-3.053649,0.796925,-0.239587,1,-1
4,-1.394676,-1.342872,-0.863796,0.594187,0.167309,-0.343704,-0.378709,-1.053172,-0.844255,-0.477545,-0.914605,0.25333,-0.336995,-0.235233,1,-1


In [56]:
# One frame per KMeans cluster
df0 = df.loc[df['clusters'] == 0]  # cluster 0
df1 = df.loc[df['clusters'] == 1]  # cluster 1

In [57]:
# Cluster 0: among the labelled rows, class 1 is the most frequent
df0.loc[:, "true"].value_counts()

-1    158
 1     19
 0      6
Name: true, dtype: int64

In [58]:
# Cluster 1: among the labelled rows, class 0 is the most frequent
df1.loc[:, "true"].value_counts()

-1    263
 0     32
 1      5
Name: true, dtype: int64

In [59]:
# Give every training row the majority class of the labelled rows in its
# cluster. The cluster -> class mapping is computed from the data instead of
# being typed in by hand, because KMeans cluster ids are arbitrary and can swap
# when the data, the seed or the library version changes.
labelled_rows = df[df['true'] != -1]
cluster_to_class = (
    labelled_rows.groupby('clusters')['true']
    .agg(lambda labels: labels.value_counts().idxmax())
)

predicted = df['clusters'].map(cluster_to_class)
if predicted.isna().any():
    # A cluster without a single labelled member has no majority class to inherit.
    raise ValueError("at least one cluster contains no labelled instances")

# Stored as float (0.0 / 1.0)
df['predicted'] = predicted.astype(float)
df = df.drop(['clusters', 'true'], axis=1)

In [63]:
# Preview the pseudo-labelled training frame (first five rows)
df.head()

Unnamed: 0,C1,N2,N3,C4_enc,C5_enc,C6_enc,N7,C8,C9,N10,C11,C12_enc,N13,N14,predicted
0,-1.394676,1.160665,-0.736898,0.594187,0.991905,-0.343704,-0.669543,-1.053172,-0.844255,-0.477545,1.093368,0.25333,0.473758,-0.239587,0.0
1,-1.394676,2.042573,0.497563,0.594187,-0.932152,1.656403,1.532486,0.949513,1.184477,0.123117,1.093368,0.25333,-0.450387,-0.227492,1.0
2,-1.394676,-0.637853,-0.838416,0.594187,0.991905,1.656403,-0.295613,0.949513,1.184477,-0.277325,-0.914605,0.25333,-1.017347,-0.075573,1.0
3,0.717012,-0.72926,-0.787657,0.594187,-0.107556,-0.343704,-0.5449,0.949513,-0.844255,-0.477545,1.093368,-3.053649,0.796925,-0.239587,0.0
4,-1.394676,-1.342872,-0.863796,0.594187,0.167309,-0.343704,-0.378709,-1.053172,-0.844255,-0.477545,-0.914605,0.25333,-0.336995,-0.235233,0.0


In [61]:
# Use a supervised technique on the data derived from the unsupervised preprocessing.
# NOTE(review): this rebinds y_train to the pseudo-labels, while the clustering
# cell reads y_train as the partially hidden labels - re-run the split cell
# before re-running the clustering cell.
y_train = df['predicted']                # pseudo-labels act as the target
x_train = df.drop(columns='predicted')   # remaining columns are the features
# LogisticRegression is left at its default hyper-parameters
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
y_pred = logisticRegr.predict(X_test)

In [62]:
# Evaluate the cluster-then-label pipeline against the ground-truth target
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.91      0.85       115
         1.0       0.87      0.72      0.79        92

    accuracy                           0.83       207
   macro avg       0.83      0.82      0.82       207
weighted avg       0.83      0.83      0.82       207



### Intrinsically semi-supervised method

#### Support Vector Machines

The objective of an SVM is to find a decision boundary that maximizes the margin, which
is defined as the distance between the decision boundary and the data points closest to it.

The concept of semi-supervised SVMs, or S3VMs, is similar: we want to maximize the
margin, and we want to correctly classify the labelled data. However, in the semi-supervised
setting, an additional objective becomes relevant: we also want to minimize the number of
unlabelled data points that violate the margin. Since the labels of the unlabelled data points are
unknown, those that violate (i.e. lie within) the margin are penalized based on their distance
to the closest margin boundary. 

S3VMs were proposed by Vapnik (1998), who motivated the problem from a more transductive viewpoint: instead of optimizing only over the weight vector, bias and slack variables, he proposed to also optimize over the label predictions $y^{U}$ for the unlabelled data. This formulation is equivalent to the optimization problem described above,
since any labelling $y^{U}$ can only be optimal if, for each $y_{i} \in y^{U}$, $x_{i}$ is on the correct side of the decision boundary. Otherwise, a better solution could be obtained by simply inverting the labelling of $x_{i}$.

In [73]:
# Separate the rows whose label is still visible from the hidden (-1) ones
is_unlabeled = dataset['New Label'] == -1
labeled = dataset.loc[~is_unlabeled]
unlabeled = dataset.loc[is_unlabeled]

# Labelled part: everything but 'New Label' (so 'Target' is still the last column)
X_labeled = labeled.iloc[:, :-1].values
y_labeled = labeled.iloc[:, -1].values

# Unlabelled part: the two trailing columns ('Target', 'New Label') are removed
X_unlabeled = unlabeled.iloc[:, :-2].values
y_unlabeled = unlabeled.iloc[:, -1].values

# Only the labelled rows are split into train / test
label_X_train, label_X_test, label_y_train, label_y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.3, random_state=1
)
# Drop the trailing target column from both labelled feature matrices
label_X_train, label_X_test = label_X_train[:, :-1], label_X_test[:, :-1]

# Standardise the features: the scaler is fitted on all rows the model will be
# trained on (labelled training rows followed by the unlabelled rows).
train = np.concatenate((label_X_train, X_unlabeled), axis=0)
train_scaler = preprocessing.StandardScaler().fit(train)
label_X_train, X_unlabeled, label_X_test = (
    train_scaler.transform(part)
    for part in (label_X_train, X_unlabeled, label_X_test)
)
label_X_train.shape

(70, 14)

In [74]:
model = S3VM()

# Labelled training rows first, unlabelled rows after; the label vector is
# assembled in the same order.
X_semi = np.vstack((label_X_train, X_unlabeled))
y_semi = np.concatenate((label_y_train, y_unlabeled))
model.fit(X_semi, y_semi)

# Evaluate on the held-out labelled rows
predict = model.predict(label_X_test)
print(classification_report(label_y_test, predict))

              precision    recall  f1-score   support

           0       0.54      1.00      0.70        15
           1       1.00      0.13      0.24        15

    accuracy                           0.57        30
   macro avg       0.77      0.57      0.47        30
weighted avg       0.77      0.57      0.47        30



## Transductive

In [83]:
# 70/30 split of the mixed (labelled + unlabelled) data for the transductive experiment
X_train_mixed, X_test_mixed, y_train_mixed, y_test_mixed = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Last column of X is the ground-truth target: keep the test part for
# evaluation, then remove the column from both feature matrices.
y_true = X_test_mixed[:, -1]
X_train_mixed, X_test_mixed = X_train_mixed[:, :-1], X_test_mixed[:, :-1]

# Standardise the features with statistics learned on the training portion only
train_scaler = preprocessing.StandardScaler()
X_train_mixed = train_scaler.fit_transform(X_train_mixed)
X_test_mixed = train_scaler.transform(X_test_mixed)

In [82]:
# Propagate the known labels through the training split
lp = LabelPropagation(gamma=0.25).fit(X_train_mixed, y_train_mixed)
# Labels inferred for every training row (labelled and unlabelled alike)
tran_labels = lp.transduction_

In [80]:
# One transduced label per training row
np.shape(tran_labels)

(483,)

In [85]:
# Train a default-parameter logistic regression on the transduced labels,
# then predict the held-out rows.
logisticRegr = LogisticRegression().fit(X_train_mixed, tran_labels)
y_pred = logisticRegr.predict(X_test_mixed)

In [86]:
# Evaluate the label-propagation pipeline against the ground-truth target
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.64      0.98      0.77       115
         1.0       0.93      0.30      0.46        92

    accuracy                           0.68       207
   macro avg       0.79      0.64      0.62       207
weighted avg       0.77      0.68      0.63       207

