In [1]:
# importing the required libraries
import csv
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [5]:
# Function to remove the outliers using the LOF Score method
def removeOutliers(X, y):
    # normalizing the data for better accuracy
    scaler = StandardScaler()
    X_new = scaler.fit_transform(X)

    # detecing the outliers using n-neighbors as 10
    clf = LocalOutlierFactor(n_neighbors=10)
    temp = clf.fit_predict(X_new)
    outliers = np.where(temp == -1)[0]
    
    # removing all the detected outliers from X_train and y_train and returning them as Dataframes
    X_clean = np.delete(X, outliers, axis=0)
    X_clean_df = pd.DataFrame(X_clean, columns=X.columns)
    y_clean = np.delete(y, outliers, axis=0)
    y_clean_df = pd.DataFrame(y_clean, columns=y.columns)

    return X_clean_df, y_clean_df

In [6]:
# Function to calculate the K-Means Clustering of the data and add a column into the data
def kMeansClustering(X, data):
    # computing the clusters using 20 classes
    kmeans = KMeans(n_clusters=20, random_state=0).fit(X)
    data['cluster'] = kmeans.labels_

In [8]:
# Function to compute the PCA of the given data
def computePCA(X, num):
    # normalizing the data for better accuracy
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)

    # reducing the dimensions of the data according to the given parameter
    pca = PCA(n_components=num)
    PCAx = pd.DataFrame(pca.fit_transform(X_train))
    return PCAx

In [9]:
# Function to compute the LDA of the given data
def computeLDA(X_train, y_train, toBeComputed):
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    xLDA = lda.transform(toBeComputed)

    return xLDA

In [13]:
# Funtion to apply the Random Forest Classification Algorithm to predict the data
def applyRandomForestClassifier(X_train, y_train, X_test, num):
    y_train = (y_train.to_numpy()).ravel()
    
    # training the model according to the given value of the hyperparameter
    rfc = RandomForestClassifier(n_estimators=num)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)

    # returning the trained model as well, it will be use for K-Fold Cross Validation
    return rfc, y_pred

In [14]:
# Function to perform the K-Fold Cross Validation on the trained model
def kFoldCrossValidation(model, X, y, k):
    kf = KFold(n_splits=k, shuffle=True)
    scores = cross_val_score(model, X, y, cv=kf)
    print("Accuracy: %0.2f" % (scores.mean()))

In [16]:
# Function to save the predicted values as a .csv file
def saveToCSV(y, filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['ID', 'category'])
        for i, value in enumerate(y):
            writer.writerow([i, value])

In [17]:
# Data Pre-processing

dataTrain = pd.read_csv("D:\\IIIT Delhi\\4th Semester\\Courses\\Statistical Machine Learning\\Project\\data\\train.csv")  # reading train.csv
dataTrain = dataTrain.dropna()  # deleting the rows with missing values 
dataTrain = dataTrain.drop(['ID'], axis=1)  # droping the ID column from the data
X_train = dataTrain.drop(['category'], axis=1)  # dropping the output column to form X_train
y_train = pd.DataFrame(dataTrain.iloc[:,-1].values)  # taking the output column to form y_train

dataTest = pd.read_csv("D:\\IIIT Delhi\\4th Semester\\Courses\\Statistical Machine Learning\\Project\\data\\test.csv")  # reading test.csv
dataTest = dataTest.dropna()  # deleting the rows with missing values
X_test = dataTest.drop(['ID'], axis=1)  # droping the ID column to form X_test

In [20]:
# forming the clusters of the training data and adding a new feauture in the dataset
kMeansClustering(X_train, dataTrain)
print(dataTrain)

            n0        n1        n2        n3        n4        n5        n6   
0     0.000000  0.000000  1.272801  0.290501  0.581446  0.000000  0.000000  \
1     0.000000  0.000000  1.542096  0.000000  0.896557  0.049978  0.000000   
2     0.000000  0.000000  1.098595  0.571866  0.500355  0.000000  0.000000   
3     0.000000  0.101666  1.159194  0.599216  0.893206  0.000000  0.200139   
4     0.000000  0.000000  1.178603  0.362568  0.577602  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
1211  0.000000  0.364963  0.770978  0.570945  0.996824  0.066661  0.000000   
1212  0.083656  0.111407  1.753287  0.000000  1.197256  0.117919  0.000000   
1213  0.000000  0.000000  1.545725  0.000000  0.842485  0.000000  0.000000   
1214  0.000000  0.000000  1.194037  0.969926  0.499340  0.055789  0.000000   
1215  0.000000  0.000000  1.020402  1.107121  0.361440  0.000000  0.000000   

      n7        n8   n9  ...     n4088     n4089     n4090     

In [21]:
# cleaning the data by removing the outliers
X_clean, y_clean = removeOutliers(X_train, y_train)  # number of outliers computed = 86

In [22]:
# merging the X_train and X_test data for computation of PCA
t = [X_clean, X_test]
data = pd.concat(t)

# reducing the data into 600 columns
xPCA = computePCA(data, 600)

# dividing the X_train and X_test which were merged earlier 
X_train = xPCA.iloc[:1130, :]  # total rows = 1216, outliers = 86, 1216 - 86 = 1130
X_test = xPCA.iloc[1130:, :]

In [25]:
# computing the LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_clean)

# applying the same LDA model to both X_train and X_test
xLDA = lda.transform(X_train)  # consist n-1 cloumns where n = classes i.e. 20, thus it has 19 columns
x_testLDA = lda.transform(X_test)

In [26]:
# applying the Random Forest Classification Technique to find the predicted value 
model, y_pred = applyRandomForestClassifier(xLDA, y_clean, x_testLDA, 100)  # taking value of the hyperparameter = 100

In [27]:
# using K-Fold Cross Validation to check our model and printing the accuracy
kFoldCrossValidation(model, xLDA, y_clean, 5)

Accuracy: 0.99


In [29]:
# saving the predicted values in .csv file
saveToCSV(y_pred, "D:\\IIIT Delhi\\4th Semester\Courses\\Statistical Machine Learning\\Project\\y.csv")