In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('pulsar_stars.csv')
df.head(3)

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0


In [3]:
# EDA
df.shape

(17898, 9)

In [4]:
df.columns

Index([' Mean of the integrated profile',
       ' Standard deviation of the integrated profile',
       ' Excess kurtosis of the integrated profile',
       ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
       ' Standard deviation of the DM-SNR curve',
       ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [5]:
# Remove leadinig spaces from column names
df.columns = df.columns.str.strip()

In [6]:
df.columns

Index(['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [7]:
df.columns = ['IP Mean', 'IP Sd', 'IP Kurtosis', 'IP Skewness', 'DM-SNR Mean', 'DM-SNR Sd', 'DM-SNR Kurtosis', 'DM-SNR SKewness', 'target_class']

In [8]:
# view the renamed column names
df.columns

Index(['IP Mean', 'IP Sd', 'IP Kurtosis', 'IP Skewness', 'DM-SNR Mean',
       'DM-SNR Sd', 'DM-SNR Kurtosis', 'DM-SNR SKewness', 'target_class'],
      dtype='object')

In [9]:
df['target_class'].value_counts() # It is highly imbalance.

target_class
0    16259
1     1639
Name: count, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   IP Mean          17898 non-null  float64
 1   IP Sd            17898 non-null  float64
 2   IP Kurtosis      17898 non-null  float64
 3   IP Skewness      17898 non-null  float64
 4   DM-SNR Mean      17898 non-null  float64
 5   DM-SNR Sd        17898 non-null  float64
 6   DM-SNR Kurtosis  17898 non-null  float64
 7   DM-SNR SKewness  17898 non-null  float64
 8   target_class     17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [11]:
df.isnull().sum()

IP Mean            0
IP Sd              0
IP Kurtosis        0
IP Skewness        0
DM-SNR Mean        0
DM-SNR Sd          0
DM-SNR Kurtosis    0
DM-SNR SKewness    0
target_class       0
dtype: int64

In [12]:
# Split
X = df.drop(['target_class'], axis = 1)
y = df['target_class']

In [13]:
# Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [14]:
# Balancing
from imblearn.over_sampling import SMOTE
smt = SMOTE()
X_res, y_res = smt.fit_resample(X_sc, y)

In [15]:
# Model Selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state = 42)

In [16]:
# check the shape of X_train and y_test
X_train.shape, X_test.shape

((26014, 8), (6504, 8))

In [17]:
y_train.shape, y_test.shape

((26014,), (6504,))

In [18]:
# use algo
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc = SVC()

svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

In [19]:
# By default accuracy
print(accuracy_score(y_test, y_pred))

0.9449569495694957


In [20]:
# Now increasing value of C
svc = SVC(C=100.0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9464944649446494


In [21]:
# We can see that we obtain a higher accuracy with C = 100.0 as higher C means less Outliers
# Now, I will further increase the value of C = 1000.0 and check accuracy.

In [22]:
# now C = 1000.0
svc = SVC(C=1000.0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9484932349323493


In [23]:
# Run SVM with linear Kernel.

linear_svc = SVC(kernel = 'linear', C=1.0)
linear_svc.fit(X_train, y_train)
y_pred_test = linear_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.9404981549815498


In [24]:
# C = 100.0

linear_svc = SVC(kernel = 'linear', C=100.0)
linear_svc.fit(X_train, y_train)
y_pred_test = linear_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.9403444034440345


In [25]:
# C = 1000.0

linear_svc = SVC(kernel = 'linear', C=1000.0)
linear_svc.fit(X_train, y_train)
y_pred_test = linear_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.9403444034440345


In [26]:
# Run SVM with Polynomial Kernel.

poly_svc = SVC(kernel = 'poly', C=1.0)
poly_svc.fit(X_train, y_train)
y_pred_test = poly_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.9183579335793358


In [27]:
# C = 100.0
poly_svc = SVC(kernel = 'poly', C=1.0)
poly_svc.fit(X_train, y_train)
y_pred_test = poly_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.9183579335793358


In [28]:
# C = 1000.0

poly_svc = SVC(kernel = 'poly', C=1000.0)
poly_svc.fit(X_train, y_train)
y_pred_test = poly_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.942650676506765


In [29]:
# Run SVM with Sigmoid Kernel.

sig_svc = SVC(kernel = 'sigmoid', C=1.0)
sig_svc.fit(X_train, y_train)
y_pred_test = sig_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.8570110701107011


In [30]:
# C = 100.0

sig_svc = SVC(kernel = 'sigmoid', C=100.0)
sig_svc.fit(X_train, y_train)
y_pred_test = sig_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.8568573185731857


In [31]:
# C = 1000.0

sig_svc = SVC(kernel = 'sigmoid', C=1000.0)
sig_svc.fit(X_train, y_train)
y_pred_test = sig_svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

0.8568573185731857


In [32]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svc = SVC()

parameters = [{'C':[1,10,100,1000], 'kernel':['linear']},
             {'C':[1,10,100,1000], 'kernel': ['rbf']}, 
             {'C': [1,10,100,1000], 'kernel': ['poly']}
              ]
grid_search = GridSearchCV(estimator = svc,
                          param_grid = parameters,
                          scoring = 'accuracy',
                          cv = 5,
                          verbose = 0)

grid_search.fit(X_train, y_train)

In [33]:
# print parameters that give the best results
print('Parameters that gives the best results :', '\n\n', (grid_search.best_params_))

Parameters that gives the best results : 

 {'C': 1000, 'kernel': 'rbf'}


In [34]:
# Calculate Grid Search CV score on test set
print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(X_test, y_test)))

GridSearch CV score on test set: 0.9485


In [35]:
# It is completed ------------------------------------------------

In [36]:
########################################################

In [37]:
# Working on next dataset


In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, f1_score

In [39]:
df = pd.read_csv('winequality-red.csv')
df.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [40]:
df.shape

(1599, 12)

In [41]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [42]:
df['quality'].value_counts()

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [43]:
# These quality convert into only two quality.
df[df['quality'] <= 5] = 0
df[df['quality'] > 5] = 1

In [44]:
df['quality'].value_counts()

quality
1    855
0    744
Name: count, dtype: int64

In [45]:
from scipy.stats import zscore
z = np.abs(zscore(df))
df1 = df[(z<3).all(axis = 1)]

print('Shape of the dataframe before removing outliers: ', df.shape)
print('Shape of the dataframe after removing outliers: ', df1.shape)
print('Percentage of data loss post outlier removal: ', (df.shape[0] - df1.shape[0]) / df.shape[0] * 100)

Shape of the dataframe before removing outliers:  (1599, 12)
Shape of the dataframe after removing outliers:  (1599, 12)
Percentage of data loss post outlier removal:  0.0


In [46]:
print(df.shape)
print(df1.shape)

(1599, 12)
(1599, 12)


In [47]:
df = df1.copy()

In [48]:
# Split X and Y
X = df.drop('quality', axis = 1)
y = df['quality']

In [49]:
# Scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [50]:
# Model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size = 0.2, random_state = 42)

In [51]:
X_train.shape

(1279, 11)

In [52]:
X_test.shape

(320, 11)

In [53]:
# use algo
from sklearn.svm import SVC
svc_clf = SVC(C = 1.0, 
              kernel = 'rbf', 
              degree = 3 )

svc_clf.fit(X_train, y_train)

In [54]:
svc_clf_predictions = svc_clf.predict(X_test)

In [55]:
from sklearn.metrics import classification_report

print(classification_report(y_test, svc_clf_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       141
           1       1.00      1.00      1.00       179

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [56]:
print(confusion_matrix(y_test, svc_clf_predictions))

[[141   0]
 [  0 179]]
