In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
import math
from sklearn.svm import SVC

# Class TSVM

In [None]:
class TSVM:
	'''
	Transductive SVM for binary problems, built on top of sklearn's SVC.

	Labels are expected to be -1 / +1: the label-switching step in train()
	separates the unlabeled samples with `> 0` / `< 0` and flips a label by
	multiplying it by -1.
	'''

	def __init__(self,X_l, y, X_u,C_l=1.0,C_u=0.001,kernel='rbf',C=1.0,gamma=1.0):
		'''
		Store the data and build the underlying SVC.

		Parameters
		----------
		X_l : array-like, labeled samples.
		y : array-like, labels of X_l (expected to be -1 / +1).
		X_u : array-like, unlabeled samples.
		C_l : upper bound up to which the weight of the unlabeled samples is raised.
		C_u : initial sample weight of the unlabeled samples.
		kernel : 'rbf' or 'linear'.
		C : regularization parameter handed to SVC.
		gamma : kernel coefficient handed to SVC (only used by the 'rbf' kernel).

		Raises
		------
		ValueError : if kernel is neither 'rbf' nor 'linear'.
		'''
		# Keep plain NumPy arrays: train() stacks labeled and unlabeled data with
		# NumPy, so fitting on arrays everywhere keeps fit/predict inputs consistent.
		self._X_l=np.asarray(X_l)
		self._Y_l=np.asarray(y).reshape(-1)
		self._X_u=np.asarray(X_u)
		self._C_l=C_l
		self._C_u=C_u

		self._kernel=kernel
		self._C=C
		self._gamma=gamma

		if self._kernel=='rbf':
			self._clf=SVC(C=self._C,kernel='rbf',gamma=self._gamma)
		elif self._kernel=='linear':
			self._clf=SVC(C=self._C,kernel='linear')
		else:
			# Fail here instead of with an AttributeError on self._clf later on.
			raise ValueError("kernel must be 'rbf' or 'linear', got %r" % (kernel,))

	def train(self):
		'''
		Train a TSVM.

		Steps: fit on the labeled data, pseudo-label the unlabeled data, then
		repeatedly refit on everything while switching the labels of badly
		placed unlabeled pairs and doubling the unlabeled sample weight until
		it reaches C_l.

		The stored C_u is left untouched (a local copy is annealed), so train()
		can be called again on the same instance.

		Raises
		------
		ValueError : if C_u is not positive while smaller than C_l (doubling it
			would never reach C_l).
		'''
		n_labeled = len(self._X_l)
		n_unlabeled = len(self._X_u)

		c_u = self._C_u
		if c_u < self._C_l and c_u <= 0:
			raise ValueError("C_u must be positive to be annealed up to C_l")

		# Train an SVM on the labeled data only
		self._clf.fit(self._X_l,self._Y_l)

		if n_unlabeled == 0:
			# Nothing to pseudo-label: the supervised fit above is the final model.
			self._Y_u = np.empty(0, dtype=self._Y_l.dtype)
			self._X = self._X_l
			self._Y = self._Y_l
			return

		# Pseudo-labels of the unlabeled samples (copied so they can be edited in place)
		self._Y_u = np.array(self._clf.predict(self._X_u)).reshape(-1)

		X_u_id = np.arange(n_unlabeled)
		# All features: labeled rows first, unlabeled rows after
		self._X = np.vstack([self._X_l, self._X_u])
		# All targets, in the same order
		self._Y = np.concatenate((self._Y_l, self._Y_u))

		# Labeled samples weigh 1, unlabeled samples start at C_u
		sample_weight = np.ones(n_labeled + n_unlabeled)
		sample_weight[n_labeled:] = c_u

		while c_u < self._C_l:
			# Train a new SVM on labeled and pseudo-labeled data
			self._clf.fit(self._X, self._Y, sample_weight=sample_weight)
			while True:
				# Signed decision values of the unlabeled samples for the current model
				distance_Y_u = self._clf.decision_function(self._X_u)
				# 1 - y * f(x) for every unlabeled sample
				epsilon = 1 - self._Y_u * distance_Y_u
				positive_mask = self._Y_u > 0
				negative_mask = self._Y_u < 0
				if not positive_mask.any() or not negative_mask.any():
					# No pair to switch when every pseudo-label has the same sign
					# (argmax on an empty selection would raise).
					break
				# Worst-placed sample among the positive / negative pseudo-labels
				positive_max_id = X_u_id[positive_mask][np.argmax(epsilon[positive_mask])]
				negative_max_id = X_u_id[negative_mask][np.argmax(epsilon[negative_mask])]
				a, b = epsilon[positive_max_id], epsilon[negative_max_id]
				if a > 0 and b > 0 and a + b > 2.0:
					# Switch the labels of this pair of unlabeled samples
					self._Y_u[positive_max_id] = self._Y_u[positive_max_id] * -1
					self._Y_u[negative_max_id] = self._Y_u[negative_max_id] * -1
					# Rebuild the full target vector so the refit sees the switched labels
					self._Y = np.concatenate((self._Y_l, self._Y_u))
					self._clf.fit(self._X, self._Y, sample_weight=sample_weight)
				else:
					break
			# Renew weights of unlabeled samples (they start right after the labeled ones)
			c_u = min(2*c_u, self._C_l)
			sample_weight[n_labeled:] = c_u

	def predict(self,X):
		'''
		Return the labels predicted by the trained classifier for X.
		'''
		# Convert to an array to match the array inputs used for fitting
		Y = self._clf.predict(np.asarray(X))
		return Y

	def score(self,X,y):
		'''
		Return the classifier's score (as computed by its score method) on (X, y).
		'''
		score = self._clf.score(np.asarray(X),y)
		return score
  
 


In [None]:
# Example dataset (CSV) hosted on GitHub
url = 'https://raw.githubusercontent.com/taojintao/TSVM-and-quasi-Newton-S3VM/master/Example%20data.csv'
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='skip'` is its replacement and likewise drops malformed lines.
data_all=pd.read_csv(url,on_bad_lines='skip')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first rows of the dataset
data_all.head()

Unnamed: 0,X,Y,Z,tuff,fractured zone,phyllic alteration,lava,andesite,dacite,faults,gravity inversion,magnetic inversion,sample
0,412500,4719625,250,0.880797,0.942676,0.942676,0.880797,0.880797,0.519989,0.880797,0.519989,0.382252,1
1,412900,4719875,50,0.880797,0.942676,0.942676,0.363547,0.880797,0.768525,0.672607,0.167982,0.318646,1
2,412850,4719750,100,0.880797,0.942676,0.942676,0.363547,0.880797,0.768525,0.79576,0.167982,0.519989,1
3,412600,4719825,200,0.880797,0.942676,0.942676,0.880797,0.880797,0.768525,0.880797,0.1034,0.519989,1
4,412800,4719750,75,0.880797,0.942676,0.942676,0.519989,0.880797,0.768525,0.79576,0.132389,0.58904,1


In [None]:
# Preview the last rows of the dataset
data_all.tail()

Unnamed: 0,X,Y,Z,tuff,fractured zone,phyllic alteration,lava,andesite,dacite,faults,gravity inversion,magnetic inversion,sample
595,414125,4720525,425,0.01292,0.013022,0.013022,0.01292,0.022754,0.26115,0.01292,0.768525,0.012818,0
596,413000,4718650,-100,0.363547,0.888944,0.179462,0.880797,0.880797,0.654753,0.672607,0.061803,0.061803,0
597,411400,4719175,500,0.01292,0.013022,0.013022,0.01292,0.01292,0.012818,0.01292,0.1034,0.167982,0
598,412900,4720625,250,0.01292,0.013022,0.013022,0.01292,0.04229,0.382252,0.363547,0.047426,0.58904,0
599,412525,4719000,-75,0.231475,0.79576,0.310026,0.077272,0.880797,0.519989,0.880797,0.167982,0.450166,0


In [None]:
# Count how many rows carry each value of the 'sample' column
data_all['sample'].value_counts()

 0    450
-1    100
 1     50
Name: sample, dtype: int64

In [None]:
# Total number of missing values over the whole DataFrame
data_all.isnull().sum().sum()

0

In [None]:
# Number of columns
print('Nombre de colonne : ', len(data_all.columns))
# Number of rows
print('Nombre de ligne : ', len(data_all.index))

Nombre de colonne :  13
Nombre de ligne :  600


In [None]:
# Summary statistics of the dataset, transposed so that each row is one column of the data
data_all.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X,600.0,412625.5,830.899607,411050.0,412050.0,412625.0,413225.0,414250.0
Y,600.0,4719673.0,664.87437,4718450.0,4719125.0,4719675.0,4720225.0,4720950.0
Z,600.0,295.0417,244.841177,-150.0,93.75,287.5,525.0,650.0
tuff,600.0,0.3719119,0.387612,0.01291968,0.01291968,0.1370513,0.8807971,0.8807971
fractured zone,600.0,0.3826048,0.383774,0.0130221,0.0130221,0.1794615,0.7957597,0.9426758
phyllic alteration,600.0,0.2760899,0.346051,0.0130221,0.0130221,0.04926601,0.4800107,0.9426758
lava,600.0,0.2804511,0.362312,0.01291968,0.01291968,0.02275394,0.672607,0.8807971
andesite,600.0,0.370123,0.353151,0.01291968,0.01291968,0.2314752,0.8807971,0.8807971
dacite,600.0,0.4080979,0.304312,0.01281805,0.03626372,0.5199893,0.6547535,0.7685248
faults,600.0,0.4476276,0.353703,0.01291968,0.04228977,0.5199893,0.7957597,0.8807971


In [None]:
# Mean of the columns within each group of rows sharing the same 'sample' value
data_all.groupby('sample').mean()

Unnamed: 0_level_0,X,Y,Z,tuff,fractured zone,phyllic alteration,lava,andesite,dacite,faults,gravity inversion,magnetic inversion
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-1,412673.0,4719639.0,602.0,0.612577,0.433059,0.250941,0.462394,0.359199,0.531193,0.660634,0.139496,0.224856
0,412612.5,4719669.0,240.611111,0.261888,0.309163,0.20858,0.191977,0.316734,0.349319,0.356986,0.202547,0.258176
1,412647.5,4719774.0,171.0,0.880797,0.942676,0.933979,0.712831,0.872469,0.690914,0.837391,0.18682,0.387905


In [None]:
# Pairwise correlation matrix of the columns
data_all.corr()

Unnamed: 0,X,Y,Z,tuff,fractured zone,phyllic alteration,lava,andesite,dacite,faults,gravity inversion,magnetic inversion,sample
X,1.0,0.042354,0.023441,0.289049,0.257835,0.185631,0.081211,0.250622,0.583434,0.141888,0.111806,0.091957,-0.014863
Y,0.042354,1.0,0.074937,-0.189053,-0.232987,-0.219728,-0.223286,-0.133019,-0.208553,-0.185048,0.137053,0.106275,0.043075
Z,0.023441,0.074937,1.0,0.200541,-0.038381,-0.098655,0.052015,-0.20093,0.003963,0.047029,-0.146828,-0.128454,-0.509889
tuff,0.289049,-0.189053,0.200541,1.0,0.848864,0.781307,0.752129,0.667701,0.701844,0.604809,0.026828,0.073961,0.012026
fractured zone,0.257835,-0.232987,-0.038381,0.848864,1.0,0.845783,0.756338,0.808419,0.712194,0.641319,0.062684,0.120155,0.202404
phyllic alteration,0.185631,-0.219728,-0.098655,0.781307,0.845783,1.0,0.704868,0.665347,0.624742,0.631082,0.101685,0.116744,0.346207
lava,0.081211,-0.223286,0.052015,0.752129,0.756338,0.704868,1.0,0.710331,0.559501,0.753717,0.038304,0.007925,0.031981
andesite,0.250622,-0.133019,-0.20093,0.667701,0.808419,0.665347,0.710331,1.0,0.707873,0.628152,-0.031931,0.072251,0.251108
dacite,0.583434,-0.208553,0.003963,0.701844,0.712194,0.624742,0.559501,0.707873,1.0,0.511314,-0.032373,0.051521,0.020361
faults,0.141888,-0.185048,0.047029,0.604809,0.641319,0.631082,0.753717,0.628152,0.511314,1.0,-0.02171,0.064352,-0.017337


In [None]:
# Labeled rows: 'sample' equals 1 or -1
data_label = data_all[data_all['sample'].isin([1, -1])]
print("Nombre de données etiquetées du dataset：", data_label.shape)

# Unlabeled rows: 'sample' equals 0
data_unlabel = data_all[data_all['sample'] == 0]
print("Nombre de données non etiquetées du dataset：", data_unlabel.shape)

Nombre de données etiquetées du dataset： (150, 13)
Nombre de données non etiquetées du dataset： (450, 13)


# Features & Target

In [None]:
# Split the labeled rows into features (column positions 3 to 11) and target (column position 12)
X_label, y_label = data_label.iloc[:, 3:12], data_label.iloc[:, 12]

# Split into a training set and a test set; the test set receives 80% of the labeled rows
X_train, X_test, y_train, y_test = train_test_split(
    X_label,
    y_label,
    test_size=0.8,
    random_state=1,
)

# Treat the test features as the unlabeled data
X_unlabel = X_test
print("Nombre de données non etiquetées utilisées:", X_unlabel.shape)

Nombre de données non etiquetées utilisées: (120, 9)


# Model

In [None]:
# Build the TSVM from the labeled training split and the test features used as unlabeled data
tsvm=TSVM(X_l=X_train, y=y_train, X_u=X_unlabel,C_l=1.0,C_u=0.001,kernel='rbf',C=0.3,gamma=0.7)

In [None]:
# Fit the TSVM on the data given to its constructor
tsvm.train()

  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [None]:
# Score of the trained model on the labeled training split
score = tsvm.score(X=X_train, y=y_train)
print('Score sur le train :', score)

Score sur le train : 0.9666666666666667


  f"X has feature names, but {self.__class__.__name__} was fitted without"


# Prédiction sur le Test

In [None]:
# Predict the labels of the test set
y_pred = tsvm.predict(X=X_test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [None]:
# Display the predicted labels
y_pred

array([ 1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
       -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,
       -1,  1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
        1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
       -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
        1, -1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1, -1,
       -1])

In [None]:
# Precision of the test-set predictions against the true test labels
precision = precision_score(y_true=y_test, y_pred=y_pred)
print('Precision :', precision)

Precision : 0.9090909090909091


# Prédiction sur Data_all

In [None]:
# Feature columns (positions 3 to 11) of every row, labeled or not
new_data_all = data_all.iloc[:, 3:12]
# Labels predicted by the trained model for the whole dataset
pred_data_all = tsvm.predict(X=new_data_all)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [None]:
# Per-class precision of the predictions over the whole dataset (average=None returns one value per class)
# NOTE(review): column 12 also holds the value 0 for the rows treated as unlabeled, which the model
# never predicts, so these figures mix labeled and unlabeled rows - confirm this is intended.
precision_all = precision_score(data_all.iloc[:,12], pred_data_all,average=None)
# Print the value computed above; the original printed `precision` from the test-set cell instead
print('Precision :', precision_all)

Precision : 0.9090909090909091


  _warn_prf(average, modifier, msg_start, len(result))
