In [32]:
# import the libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

sns.set(style="white", color_codes=True)
warnings.filterwarnings("ignore")

Question-1: Principal Component Analysis


In [5]:

# Read the CC GENERAL credit-card file into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/content/CC GENERAL.csv")
df.head(n=5)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [6]:
# Report, per column, whether it contains at least one missing value.
df.isna().any()

CUST_ID                             False
BALANCE                             False
BALANCE_FREQUENCY                   False
PURCHASES                           False
ONEOFF_PURCHASES                    False
INSTALLMENTS_PURCHASES              False
CASH_ADVANCE                        False
PURCHASES_FREQUENCY                 False
ONEOFF_PURCHASES_FREQUENCY          False
PURCHASES_INSTALLMENTS_FREQUENCY    False
CASH_ADVANCE_FREQUENCY              False
CASH_ADVANCE_TRX                    False
PURCHASES_TRX                       False
CREDIT_LIMIT                         True
PAYMENTS                            False
MINIMUM_PAYMENTS                     True
PRC_FULL_PAYMENT                    False
TENURE                              False
dtype: bool

In [8]:
# Replace missing values with the mean of their own column.
# numeric_only=True keeps the string column CUST_ID out of the mean computation;
# without it pandas >= 2.0 raises a TypeError when it tries to average strings.
# Assigning the result (instead of inplace=True) keeps the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))

# Confirm that no column still contains nulls.
df.isnull().any()

CUST_ID                             False
BALANCE                             False
BALANCE_FREQUENCY                   False
PURCHASES                           False
ONEOFF_PURCHASES                    False
INSTALLMENTS_PURCHASES              False
CASH_ADVANCE                        False
PURCHASES_FREQUENCY                 False
ONEOFF_PURCHASES_FREQUENCY          False
PURCHASES_INSTALLMENTS_FREQUENCY    False
CASH_ADVANCE_FREQUENCY              False
CASH_ADVANCE_TRX                    False
PURCHASES_TRX                       False
CREDIT_LIMIT                        False
PAYMENTS                            False
MINIMUM_PAYMENTS                    False
PRC_FULL_PAYMENT                    False
TENURE                              False
dtype: bool

In [9]:
# x: every column except the first (CUST_ID) and the last; y: the last column (TENURE).
x, y = df.iloc[:, 1:-1], df.iloc[:, -1]
print(x.shape, y.shape)



(8950, 16) (8950,)


a. Apply PCA on CC dataset

In [10]:
# a. Apply PCA on CC dataset.
# PCA lets us analyse the dataset so redundant features can be dropped without
# losing too much information.

# Keep the first three principal components.
pca = PCA(n_components=3)

# Fit on x and project every observation onto those three components.
x_pca = pca.fit_transform(x)

# principalDf: the projected data, one column per principal component.
principalDf = pd.DataFrame(
    x_pca,
    columns=[f'principal component {i}' for i in (1, 2, 3)],
)

# finalDf: the components next to the last column of df (TENURE), which the
# following cells use as the label column.
finalDf = pd.concat([principalDf, df.iloc[:, -1]], axis=1)
finalDf.head()


Unnamed: 0,principal component 1,principal component 2,principal component 3,TENURE
0,-4326.383979,921.566882,183.708383,12
1,4118.916665,-2432.846346,2369.969289,12
2,1497.907641,-1997.578694,-2125.631328,12
3,1394.548536,-1488.743453,-2431.799649,12
4,-3743.351896,757.342657,512.476492,12


In [11]:
# b. Apply k-means algorithm on the PCA result and report your observation if the silhouette score has improved or not?

# X: all columns of finalDf except the last one, i.e. the principal components.
# y: the last column of finalDf (TENURE).
X, y = finalDf.iloc[:, :-1], finalDf.iloc[:, -1]
print(X.shape, y.shape)


(8950, 3) (8950,)


In [12]:
nclusters = 3  # this is the k in k-means

# Seed the centroid initialisation so the cluster assignment, and every score
# printed below, is the same on each Restart & Run All.
km = KMeans(n_clusters=nclusters, random_state=0)
km.fit(X)

# predict the cluster for each data point
y_cluster_kmeans = km.predict(X)

# NOTE: k-means cluster ids (0..nclusters-1) are arbitrary and y holds TENURE
# values (6-12 in this dataset), so the two label sets never coincide. The
# classification report, confusion matrix and accuracy are kept for the
# assignment but are not a meaningful quality measure for the clustering;
# the silhouette score below is the metric to compare across experiments.
print(classification_report(y, y_cluster_kmeans, zero_division=1))
print(confusion_matrix(y, y_cluster_kmeans))

# finding the accuracy
train_accuracy = accuracy_score(y, y_cluster_kmeans)
print("\nAccuracy for our Training dataset with PCA:", train_accuracy)

# Calculating silhouette score: ranges from -1 to +1, a higher value means the
# points sit closer to their own cluster than to neighbouring clusters.
score = metrics.silhouette_score(X, y_cluster_kmeans)
print("Sihouette Score: ",score)



              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00     204.0
           7       1.00      0.00      0.00     190.0
           8       1.00      0.00      0.00     196.0
           9       1.00      0.00      0.00     175.0
          10       1.00      0.00      0.00     236.0
          11       1.00      0.00      0.00     365.0
          12       1.00      0.00      0.00    7584.0

    accuracy                           0.00    8950.0
   macro avg       0.70      0.30      0.00    8950.0
weighted avg       1.00      0.00      0.00    8950.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [  28    1  175    0    0    0    0    0    0    0]
 [  15    2  173    0    0   

c. Perform Scaling+PCA+K-Means and report performance.

In [13]:


# Start again from the cleaned frame: features are all columns between the
# first (CUST_ID) and the last, and the last column (TENURE) is kept as y.
feature_slice = slice(1, -1)
x = df.iloc[:, feature_slice]
y = df.iloc[:, -1]
print(x.shape, y.shape)


(8950, 16) (8950,)


In [14]:
## Scale the dataset; This is very important before you apply PCA
# Learn each feature's mean / standard deviation from x, then standardise x with them.
scaler = StandardScaler()
X_scaled_array = scaler.fit(x).transform(x)

# Instantiate PCA with three components
pca = PCA(n_components=3)

# Determine transformed features: project the scaled data onto the components
x_pca = pca.fit_transform(X_scaled_array)
principalDf = pd.DataFrame(
    x_pca,
    columns=[f'principal component {i}' for i in (1, 2, 3)],
)

# Put the last column of df (TENURE) next to the components.
finalDf = pd.concat([principalDf, df.iloc[:, -1]], axis=1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,TENURE
0,-1.718893,-1.07294,0.535659,12
1,-1.169306,2.509321,0.628088,12
2,0.938414,-0.3826,0.161156,12
3,-0.907502,0.045859,1.521731,12
4,-1.63783,-0.684975,0.425621,12


In [15]:
# x: the three principal components computed from the scaled data.
# y: the TENURE column.
x = finalDf.iloc[:,0:-1]
y = finalDf["TENURE"]

# Print the shape of the lowercase x assigned above. Uppercase X is a different
# variable, left over from the unscaled PCA experiment.
print(x.shape,y.shape)

(8950, 3) (8950,)


In [16]:
# Hold out 34% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.34,random_state=0)

nclusters = 3  # this is the k in k-means

# Seed the centroid initialisation so the reported scores are reproducible.
km = KMeans(n_clusters=nclusters, random_state=0)

# k-means is unsupervised: fit() ignores any y argument, so only the features are passed.
km.fit(X_train)

# predict the cluster for each training data point
y_clus_train = km.predict(X_train)

# NOTE: cluster ids (0..nclusters-1) are arbitrary and y_train holds TENURE
# values, so the report, confusion matrix and accuracy below do not measure
# clustering quality; compare experiments with the silhouette score instead.
print(classification_report(y_train, y_clus_train, zero_division=1))
print(confusion_matrix(y_train, y_clus_train))

train_accuracy = accuracy_score(y_train, y_clus_train)
print("Accuracy for our Training dataset with PCA:", train_accuracy)

# Calculating silhouette score: ranges from -1 to +1, a higher value means the
# points sit closer to their own cluster than to neighbouring clusters.
score = metrics.silhouette_score(X_train, y_clus_train)
print("Sihouette Score: ",score)

              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00     139.0
           7       1.00      0.00      0.00     135.0
           8       1.00      0.00      0.00     128.0
           9       1.00      0.00      0.00     118.0
          10       1.00      0.00      0.00     151.0
          11       1.00      0.00      0.00     262.0
          12       1.00      0.00      0.00    4974.0

    accuracy                           0.00    5907.0
   macro avg       0.70      0.30      0.00    5907.0
weighted avg       1.00      0.00      0.00    5907.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 105    4   30    0    0    0    0    0    0    0]
 [ 108    1   26    0    0   

In [17]:
# predict the cluster for each testing data point, using the model fitted on the training split
y_clus_test = km.predict(X_test)

# Summary of the predictions made by the classifier
print(classification_report(y_test, y_clus_test, zero_division=1))
print(confusion_matrix(y_test, y_clus_test))

# Store the test score under its own name so the training accuracy from the
# previous cell is not overwritten.
test_accuracy = accuracy_score(y_test, y_clus_test)
print("\nAccuracy for our Testing dataset with PCA:", test_accuracy)

# Calculating silhouette score: ranges from -1 to +1, a higher value means the
# points sit closer to their own cluster than to neighbouring clusters.
score = metrics.silhouette_score(X_test, y_clus_test)
print("Sihouette Score: ",score)


# Recap of the Scaling + PCA + K-Means pipeline:
# 1. Scale the data with StandardScaler, which computes the mean and standard
#    deviation of each feature and rescales it to zero mean and unit variance.
# 2. Apply PCA to reduce the dimensionality to 3 components.
# 3. Split the data into training and testing sets with train_test_split().
# 4. Fit K-means on the training set and predict the cluster of every training
#    and testing data point.
# 5. Evaluate on the training and testing sets with classification_report(),
#    confusion_matrix(), accuracy_score() and silhouette_score() from sklearn.metrics.


              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00      65.0
           7       1.00      0.00      0.00      55.0
           8       1.00      0.00      0.00      68.0
           9       1.00      0.00      0.00      57.0
          10       1.00      0.00      0.00      85.0
          11       1.00      0.00      0.00     103.0
          12       1.00      0.00      0.00    2610.0

    accuracy                           0.00    3043.0
   macro avg       0.70      0.30      0.00    3043.0
weighted avg       1.00      0.00      0.00    3043.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [  41    3   21    0    0    0    0    0    0    0]
 [  43    0   12    0    0   

Question-2: Use pd_speech_features.csv


In [18]:

# Read the speech-features file into a DataFrame and preview the first rows.
df_pd = pd.read_csv(filepath_or_buffer="/content/pd_speech_features.csv")
df_pd.head(n=5)

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,...,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603,1
4,1,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1


In [19]:
# Report, per column, whether it contains at least one missing value.
df_pd.isna().any()

id                           False
gender                       False
PPE                          False
DFA                          False
RPDE                         False
                             ...  
tqwt_kurtosisValue_dec_33    False
tqwt_kurtosisValue_dec_34    False
tqwt_kurtosisValue_dec_35    False
tqwt_kurtosisValue_dec_36    False
class                        False
Length: 755, dtype: bool

In [20]:
# X: every column except the target column 'class', as a NumPy array.
X = df_pd.drop(columns='class').values

# Y: the 'class' column of the main data frame, as a NumPy array.
Y = df_pd['class'].values

In [21]:
# a. Perform Scaling

# Standardise X so that all features are on the same scale; otherwise features
# with a larger magnitude would dominate the distance calculations.
scaler = StandardScaler()

# Learn the per-feature statistics from X, then apply them to X.
X_Scale = scaler.fit(X).transform(X)


In [22]:
# b. Apply PCA (k=3)

# Project the scaled features onto the first three principal components.
pca3 = PCA(n_components=3)
principalComponents = pca3.fit_transform(X_Scale)

# One column per component.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2', 'Principal Component 3'],
)

# Attach the 'class' column so features and label live in one frame.
finalDf = pd.concat([principalDf, df_pd[['class']]], axis=1)
finalDf.head()


Unnamed: 0,principal component 1,principal component 2,Principal Component 3,class
0,-10.047373,1.471078,-6.8464,1
1,-10.637725,1.583749,-6.830978,1
2,-13.516186,-1.25354,-6.818694,1
3,-9.155085,8.833601,15.290903,1
4,-6.76447,4.611466,15.637119,1


In [23]:
# Features are the three principal components; the label is 'class'.
X = finalDf.drop(columns='class').values
Y = finalDf['class'].values

# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)


In [24]:
# c. Use SVM to report performance

# Train an SVM classifier (default settings) on the training split.
svmClassifier = SVC()
svmClassifier.fit(X_train, Y_train)

# Predict the class of every row in the held-out test split.
y_pred = svmClassifier.predict(X_test)

# Summary of the predictions made by the classifier
print(classification_report(Y_test, y_pred, zero_division=1))
print(confusion_matrix(Y_test, y_pred))

# Accuracy score; arguments follow the scikit-learn convention (y_true, y_pred).
svm_accuracy = accuracy_score(Y_test, y_pred)
print('accuracy is',svm_accuracy)

# Calculate silhouette score.
# NOTE: the silhouette score is a clustering metric. Here it only describes how
# well separated the *predicted* classes are in PCA space; it says nothing about
# whether the predictions are correct. Use the report above for that.
score = metrics.silhouette_score(X_test, y_pred)
print("Sihouette Score: ",score) 


              precision    recall  f1-score   support

           0       0.67      0.42      0.52        57
           1       0.83      0.93      0.88       170

    accuracy                           0.80       227
   macro avg       0.75      0.68      0.70       227
weighted avg       0.79      0.80      0.79       227

[[ 24  33]
 [ 12 158]]
accuracy is 0.801762114537445
Sihouette Score:  0.254320903960664


Question-3: Apply Linear Discriminant Analysis (LDA) on Iris.csv dataset to reduce dimensionality of data to k=2. 



A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.


In [25]:
# Read the Iris file into a DataFrame and preview the first rows.
# (LinearDiscriminantAnalysis is imported once, in the import cell at the top.)
df_iris = pd.read_csv("/content/Iris.csv")
df_iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [26]:
# Report, per column, whether it contains at least one missing value.
df_iris.isna().any()

Id               False
SepalLengthCm    False
SepalWidthCm     False
PetalLengthCm    False
PetalWidthCm     False
Species          False
dtype: bool

In [28]:
# x: every column except the first (Id) and the last; y: the last column (Species).
x, y = df_iris.iloc[:, 1:-1], df_iris.iloc[:, -1]
print(x.shape, y.shape)


(150, 4) (150,)


In [29]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)


In [30]:
# Standardise the features: fit the scaler on the training split only, then
# reuse the same statistics for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Encode the species names as integers. The train/test split has already been
# made, so y_train and y_test must be encoded too; encoding only y would leave
# the labels that the model actually receives as strings.
# The encoder is fitted on the full label column so every class is known to it.
le = LabelEncoder()
le.fit(y)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
y = le.transform(y)


In [31]:
# Reduce the (already standardised) input features to k=2 dimensions with
# Linear Discriminant Analysis.
lda = LinearDiscriminantAnalysis(n_components=2)

# LDA is supervised: fit it on the training features and labels, and project
# the training data in the same call.
X_train = lda.fit_transform(X_train, y_train)

# Only transform the test data, reusing the projection learnt from the training split.
X_test = lda.transform(X_test)
print(X_train.shape,X_test.shape)


(105, 2) (45, 2)


Question-4: Briefly identify the difference between PCA and LDA



PCA (Principal Component Analysis) and LDA (Linear Discriminant Analysis) are both popular techniques in machine learning for dimensionality reduction. However, they have different purposes and methods:

Purpose:
PCA is used for unsupervised learning and finds the directions of maximum variance in a dataset. It reduces the number of features by transforming the original dataset into a new coordinate system, where the features are uncorrelated and sorted by their variance. PCA is commonly used for data compression, visualization, and noise reduction.
LDA, on the other hand, is used for supervised learning and aims to find the linear combinations of features that best separate the classes. It reduces the number of features by projecting the original dataset onto a lower-dimensional space while maximizing the class separability. LDA is commonly used for feature extraction, pattern recognition, and classification.

Method:
PCA operates by finding the eigenvectors and eigenvalues of the covariance matrix of the data. The eigenvectors represent the directions of maximum variance, and the eigenvalues represent the amount of variance explained by each eigenvector. PCA selects the top k eigenvectors, where k is the desired dimensionality of the reduced dataset.
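LDA, on the other hand, maximizes the between-class scatter while minimizing the within-class scatter of the data. It involves finding the eigenvectors and eigenvalues of the matrix $S_W^{-1} S_B$, i.e. the inverse of the within-class scatter matrix $S_W$ multiplied by the between-class scatter matrix $S_B$. LDA selects the top k eigenvectors that correspond to the largest eigenvalues; for a problem with C classes, k can be at most C − 1 (which is why the three-class Iris data can be reduced to at most 2 discriminant components).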
