In [9]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# import train test split
from sklearn.model_selection import train_test_split
# Evaluate the model using various metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [10]:
df = sns.load_dataset('titanic')
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
# impute null values
df['age'] = df['age'].fillna(df['age'].mean())
X = df[['age', 'sex']]
y = df['survived']
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# evaluating the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('Evaluation Metrics:')
print('-------------------')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('ROC AUC Score:', roc_auc)
print('Confusion Matrix:', cm)

Evaluation Metrics:
-------------------
Accuracy: 0.7821229050279329
Precision: 0.7536231884057971
Recall: 0.7027027027027027
F1 Score: 0.7272727272727273
ROC AUC Score: 0.7703989703989704
Confusion Matrix: [[88 17]
 [22 52]]


In [12]:
# SVC
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# evaluating the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('Evaluation Metrics:')
print('-------------------')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('ROC AUC Score:', roc_auc)
print('Confusion Matrix:', cm)

Evaluation Metrics:
-------------------
Accuracy: 0.6089385474860335
Precision: 0.8333333333333334
Recall: 0.06756756756756757
F1 Score: 0.12500000000000003
ROC AUC Score: 0.529021879021879
Confusion Matrix: [[104   1]
 [ 69   5]]


Evaluation Metrics (of LR):
-------------------
Accuracy: 0.7821229050279329
Precision: 0.7536231884057971
Recall: 0.7027027027027027
F1 Score: 0.7272727272727273
ROC AUC Score: 0.7703989703989704


Evaluation Metrics (SVC):
-------------------
Accuracy: 0.6089385474860335
Precision: 0.8333333333333334
Recall: 0.06756756756756757
F1 Score: 0.12500000000000003
ROC AUC Score: 0.529021879021879

In [14]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
svc.n_support_

array([268, 264], dtype=int32)

----
# SVM Details

In [18]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = datasets.load_iris() #from scikit learn directly
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

In [19]:
# Standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [20]:
# Create a Support Vector Classifier with the Radial basis function (RBF) kernel
svm = SVC(kernel='rbf', C=1.0, random_state=42)

# Train the model using the training data
svm.fit(X_train_std, y_train)

In [21]:
# Make predictions using the test data
y_pred = svm.predict(X_test_std)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7821229050279329


# Detecting Outliers using SVM

In [37]:
import pandas as pd
import numpy as np
from sklearn import svm

# load Titanic dataset
df = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

# drop irrelevant columns and missing values
df.drop(['Name'], axis=1, inplace=True)
df.dropna(inplace=True)

# convert categorical variables to numerical
df['Sex'] = pd.factorize(df['Sex'])[0]

# split features and target variable
X = df.drop('Survived', axis=1).values
y = df['Survived'].values

# train an SVM model
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X)

# predict class labels for all data points
y_pred = clf.predict(X)

# calculate the distance of each data point from the decision boundary
distances = clf.decision_function(X)

# sort the distances in descending order
sorted_idx = np.argsort(distances)[::-1]

# set a threshold for the distance
threshold = np.percentile(distances, 1)

# identify the data points that have a distance above the threshold as outliers
outliers = df.iloc[distances < threshold]

print("Number of outliers:", len(outliers))
print("Outliers:", outliers)


Number of outliers: 9
Outliers:      Survived  Pclass  Sex   Age  Siblings/Spouses Aboard  \
225         1       2    0  19.0                        0   
235         0       2    0  44.0                        1   
237         0       2    0  19.0                        0   
269         1       3    0  25.0                        0   
489         0       1    0  55.0                        0   
659         0       1    0  47.0                        0   
818         0       1    0  38.0                        0   
851         1       3    1  18.0                        0   
881         0       3    1  39.0                        0   

     Parents/Children Aboard     Fare  
225                        0  10.5000  
235                        0  26.0000  
237                        0  10.5000  
269                        0   0.0000  
489                        0  30.5000  
659                        0  25.5875  
818                        0   0.0000  
851                        1   9.3500

## Details:
* The `OneClassSVM` is an `unsupervised learning algorithm` that is commonly used for outlier detection. It learns a boundary that separates the majority of the data from the rest, and any points that fall outside of this boundary are considered outliers.

* The `nu parameter` specifies the fraction of training data that is allowed to be classified as outliers. A lower value of nu allows more data points to be classified as outliers, while a higher value allows fewer data points to be classified as outliers.

* The `kernel parameter` specifies the type of kernel to use. The "rbf" kernel is a popular choice that uses a radial basis function to map the input features to a higher-dimensional space.

* The `gamma parameter` controls the shape of the kernel function. A higher value of gamma results in a narrower peak of the radial basis function, which can lead to overfitting if the value is too high.

# Outlier detection in tips dataset

In [29]:
import pandas as pd
import numpy as np
from sklearn import svm

# load Tips dataset
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv')

# drop irrelevant columns
df.drop(['day', 'time'], axis=1, inplace=True)

# convert categorical variables to numerical
df['sex'] = pd.factorize(df['sex'])[0]
df['smoker'] = pd.factorize(df['smoker'])[0]

# split features and target variable
X = df.drop('tip', axis=1).values
y = df['tip'].values

# train an SVM model
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1) 
clf.fit(X)

# predict class labels for all data points
y_pred = clf.predict(X)

# calculate the distance of each data point from the decision boundary
distances = clf.decision_function(X)

# sort the distances in descending order
sorted_idx = np.argsort(distances)[::-1]

# set a threshold for the distance
threshold = np.percentile(distances, 5)

# identify the data points that have a distance above the threshold as outliers
outliers = df.iloc[distances < threshold]

print("Number of outliers:", len(outliers))
print("Outliers:", outliers)


Number of outliers: 13
Outliers:      total_bill    tip  sex  smoker  size
67         3.07   1.00    0       1     1
88        24.71   5.85    1       0     2
93        16.32   4.30    0       1     2
141       34.30   6.70    1       0     6
142       41.19   5.00    1       0     5
143       27.05   5.00    0       0     6
156       48.17   5.00    1       0     6
159       16.49   2.00    1       0     4
170       50.81  10.00    1       1     3
184       40.55   3.00    1       1     2
185       20.69   5.00    1       0     5
203       16.40   2.50    0       1     2
240       27.18   2.00    0       1     2


# SVM Regressor

In [38]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# load Tips dataset
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv')

# drop irrelevant columns
df.drop(['day', 'time'], axis=1, inplace=True)

# convert categorical variables to numerical
df['sex'] = pd.factorize(df['sex'])[0]
df['smoker'] = pd.factorize(df['smoker'])[0]

# split features and target variable
X = df.drop('tip', axis=1).values
y = df['tip'].values

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train an SVM regressor
reg = svm.SVR(kernel='rbf', gamma='scale')
reg.fit(X_train, y_train)

# predict the test set and calculate mean squared error
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)


Mean Squared Error: 0.5308412202890427


In [31]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 0.7285885123230003


In [32]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)


Mean Absolute Error: 0.5624715249431942


In [33]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)


R-squared: 0.5753172884349698


In [39]:
reg.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}